diff --git a/Makefile.am b/Makefile.am index 0304cfb26..a643fa555 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,497 +1,493 @@ # This file is part of Invenio. # Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. confignicedir = $(sysconfdir)/build confignice_SCRIPTS=config.nice SUBDIRS = po config modules scripts EXTRA_DIST = UNINSTALL THANKS RELEASE-NOTES configure-tests.py config.nice.in \ config.rpath CONTRIBUTING.rst Dockerfile docker-compose.yml \ requirements.txt Vagrantfile .inveniorc # current MathJax version and packages # See also modules/miscutil/lib/htmlutils.py (get_mathjax_header) MJV = 2.3 MATHJAX = http://invenio-software.org/download/mathjax/MathJax-v$(MJV).zip # current CKeditor version CKV = 3.6.6 CKEDITOR = ckeditor_$(CKV).zip -# current MediaElement.js version -MEV = master -MEDIAELEMENT = http://github.com/johndyer/mediaelement/zipball/$(MEV) - #for solrutils INVENIO_JAVA_PATH = org/invenio_software/solr solrdirname = apache-solr-3.1.0 solrdir = $(prefix)/lib/$(solrdirname) solrutils_dir=$(CURDIR)/modules/miscutil/lib/solrutils CLASSPATH=.:${solrdir}/dist/solrj-lib/commons-io-1.4.jar:${solrdir}/dist/apache-solr-core-*jar:${solrdir}/contrib/jzlib-1.0.7.jar:${solrdir}/dist/apache-solr-solrj-3.1.0.jar:${solrdir}/dist/solrj-lib/slf4j-api-1.5.5.jar:${solrdir}/dist/*:${solrdir}/contrib/basic-lucene-libs/*:${solrdir}/contrib/analysis-extras/lucene-libs/*:${solrdir}/dist/solrj-lib/* # git-version-get stuff: BUILT_SOURCES = $(top_srcdir)/.version $(top_srcdir)/.version: echo $(VERSION) > $@-t && mv $@-t $@ dist-hook: echo $(VERSION) > $(distdir)/.tarball-version check-upgrade: $(PYTHON) $(top_srcdir)/modules/miscutil/lib/inveniocfg_upgrader.py $(top_srcdir) --upgrade-check kwalitee-check: @$(PYTHON) $(top_srcdir)/modules/miscutil/lib/kwalitee.py --stats $(top_srcdir) kwalitee-check-errors-only: @$(PYTHON) $(top_srcdir)/modules/miscutil/lib/kwalitee.py --check-errors $(top_srcdir) kwalitee-check-variables: @$(PYTHON) $(top_srcdir)/modules/miscutil/lib/kwalitee.py --check-variables $(top_srcdir) kwalitee-check-indentation: @$(PYTHON) $(top_srcdir)/modules/miscutil/lib/kwalitee.py --check-indentation $(top_srcdir) kwalitee-check-sql-queries: @$(PYTHON) $(top_srcdir)/modules/miscutil/lib/kwalitee.py --check-sql $(top_srcdir) etags: \rm -f $(top_srcdir)/TAGS (cd $(top_srcdir) && find $(top_srcdir) -name "*.py" -print | xargs etags) install-data-local: for d in / /cache /cache/RTdata /log /tmp /tmp-shared /data /run /tmp-shared/bibencode/jobs/done /tmp-shared/bibedit-cache; do \ mkdir -p $(localstatedir)$$d ; \ done @echo "************************************************************" @echo "** Invenio software has been successfully installed! **" @echo "** **" @echo "** You may proceed to customizing your installation now. 
**" @echo "************************************************************" install-mathjax-plugin: @echo "***********************************************************" @echo "** Installing MathJax plugin, please wait... **" @echo "***********************************************************" rm -rf /tmp/invenio-mathjax-plugin mkdir /tmp/invenio-mathjax-plugin rm -fr ${prefix}/var/www/MathJax mkdir -p ${prefix}/var/www/MathJax (cd /tmp/invenio-mathjax-plugin && \ wget '$(MATHJAX)' -O mathjax.zip && \ unzip -q mathjax.zip && cd mathjax-MathJax-* && cp -r * \ ${prefix}/var/www/MathJax) rm -fr /tmp/invenio-mathjax-plugin @echo "************************************************************" @echo "** The MathJax plugin was successfully installed. **" @echo "** Please do not forget to properly set the option **" @echo "** CFG_WEBSEARCH_USE_MATHJAX_FOR_FORMATS and **" @echo "** CFG_WEBSUBMIT_USE_MATHJAX in invenio.conf. **" @echo "************************************************************" uninstall-mathjax-plugin: @rm -rvf ${prefix}/var/www/MathJax @echo "***********************************************************" @echo "** The MathJax plugin was successfully uninstalled. **" @echo "***********************************************************" install-jscalendar-plugin: @echo "***********************************************************" @echo "** Installing jsCalendar plugin, please wait... **" @echo "***********************************************************" rm -rf /tmp/invenio-jscalendar-plugin mkdir /tmp/invenio-jscalendar-plugin (cd /tmp/invenio-jscalendar-plugin && \ wget 'http://www.dynarch.com/static/jscalendar-1.0.zip' && \ unzip -u jscalendar-1.0.zip && \ mkdir -p ${prefix}/var/www/jsCalendar && \ cp jscalendar-1.0/img.gif ${prefix}/var/www/jsCalendar/jsCalendar.gif && \ cp jscalendar-1.0/calendar.js ${prefix}/var/www/jsCalendar/ && \ cp jscalendar-1.0/calendar-setup.js ${prefix}/var/www/jsCalendar/ && \ cp jscalendar-1.0/lang/calendar-en.js ${prefix}/var/www/jsCalendar/ && \ cp jscalendar-1.0/calendar-blue.css ${prefix}/var/www/jsCalendar/) rm -fr /tmp/invenio-jscalendar-plugin @echo "***********************************************************" @echo "** The jsCalendar plugin was successfully installed. **" @echo "***********************************************************" uninstall-jscalendar-plugin: @rm -rvf ${prefix}/var/www/jsCalendar @echo "***********************************************************" @echo "** The jsCalendar plugin was successfully uninstalled. **" @echo "***********************************************************" install-js-test-driver: @echo "*******************************************************" @echo "** Installing js-test-driver, please wait... **" @echo "*******************************************************" mkdir -p $(prefix)/lib/java/js-test-driver && \ cd $(prefix)/lib/java/js-test-driver && \ wget http://invenio-software.org/download/js-test-driver/JsTestDriver-1.3.5.jar -O JsTestDriver.jar uninstall-js-test-driver: @rm -rvf ${prefix}/lib/java/js-test-driver @echo "*********************************************************" @echo "** The js-test-driver was successfully uninstalled. **" @echo "*********************************************************" install-jquery-plugins: @echo "***********************************************************" @echo "** Installing various jQuery plugins, please wait... 
**" @echo "***********************************************************" mkdir -p ${prefix}/var/www/js mkdir -p $(prefix)/var/www/css (cd ${prefix}/var/www/js && \ wget -O jquery.min.js http://invenio-software.org/download/jquery/jquery-1.7.1.min.js && \ wget -N http://ajax.googleapis.com/ajax/libs/jqueryui/1.8.17/jquery-ui.min.js && \ wget -O jquery.jeditable.mini.js http://invenio-software.org/download/jquery/jquery.jeditable.custom.min.js && \ wget -N https://raw.githubusercontent.com/malsup/form/3.51/jquery.form.js --no-check-certificate && \ - wget -N http://jquery-multifile-plugin.googlecode.com/svn-history/r54/trunk/jquery.MultiFile.pack.js && \ + wget -N http://invenio-software.org/download/jquery/jquery.MultiFile.pack.js && \ wget -O jquery.tablesorter.zip http://invenio-software.org/download/jquery/jquery.tablesorter.20111208.zip && \ wget -O uploadify.zip http://invenio-software.org/download/jquery/uploadify-v2.1.4.zip && \ wget -N http://cdn.datatables.net/1.10.4/js/jquery.dataTables.min.js && \ wget -N http://invenio-software.org/download/jquery/jquery.bookmark.package-1.4.0.zip && \ unzip -u jquery.tablesorter.zip -d tablesorter && \ wget -N http://invenio-software.org/download/jquery/jquery.fileTree-1.01.zip && \ unzip -u jquery.fileTree-1.01.zip && \ rm jquery.fileTree-1.01.zip && \ wget -N http://invenio-software.org/download/jquery/ColVis.min.js && \ mv ColVis.min.js jquery.dataTables.ColVis.min.js && \ rm jquery.tablesorter.zip && \ rm -rf uploadify && \ unzip -u uploadify.zip -d uploadify && \ wget -N http://invenio-software.org/download/jquery/flot-0.6.zip && \ wget -N http://www.csspace.net/tmp/jquery-lightbox-0.5.zip && \ rm -rf jquery-lightbox && \ unzip -u jquery-lightbox-0.5.zip -d jquery-lightbox && \ sed -i 's/images\//\/js\/jquery-lightbox\/images\//g' jquery-lightbox/js/jquery.lightbox-0.5.js && \ rm -rf jquery-lightbox-0.5.zip && \ wget -O jquery-ui-timepicker-addon.js http://invenio-software.org/download/jquery/jquery-ui-timepicker-addon-1.0.3.js && \ unzip -u flot-0.6.zip && \ mv flot/jquery.flot.selection.min.js flot/jquery.flot.min.js flot/excanvas.min.js ./ && \ rm flot-0.6.zip && rm -r flot && \ mv uploadify/swfobject.js ./ && \ mv uploadify/cancel.png uploadify/uploadify.css uploadify/uploadify.allglyphs.swf uploadify/uploadify.fla uploadify/uploadify.swf ../img/ && \ mv uploadify/jquery.uploadify.v2.1.4.min.js ./jquery.uploadify.min.js && \ rm uploadify.zip && rm -r uploadify && \ - wget -N https://github.com/douglascrockford/JSON-js/raw/master/json2.js --no-check-certificate && \ + wget -N http://invenio-software.org/download/jquery/json2.js --no-check-certificate && \ wget -O jquery.hotkeys.js http://invenio-software.org/download/jquery/jquery.hotkeys-0.8.js && \ wget -N http://invenio-software.org/download/jquery/jquery.treeview.zip && \ unzip -u jquery.treeview.zip -d jquery-treeview && \ rm jquery.treeview.zip && \ wget -N http://invenio-software.org/download/jquery/v1.5/js/jquery.ajaxPager.js && \ unzip -u jquery.bookmark.package-1.4.0.zip && \ rm -f jquery.bookmark.ext.* bookmarks-big.png bookmarkBasic.html jquery.bookmark.js jquery.bookmark.pack.js && \ mv bookmarks.png ../img/ && \ mv jquery.bookmark.css ../css/ && \ wget -N --no-check-certificate http://invenio-software.org/download/jquery/jquery.omniwindow.js && \ wget -N --no-check-certificate http://invenio-software.org/download/jquery/jquery.blockUI.js && \ wget -N --no-check-certificate http://invenio-software.org/download/jquery/sly.min.js &&\ wget -N --no-check-certificate 
http://invenio-software.org/download/jquery/parsley.js &&\ wget -N --no-check-certificate http://invenio-software.org/download/jquery/spin.min.js &&\ rm -f jquery.bookmark.package-1.4.0.zip && \ wget https://cdnjs.cloudflare.com/ajax/libs/handlebars.js/1.3.0/handlebars.min.js && \ wget https://twitter.github.com/typeahead.js/releases/0.10.5/typeahead.bundle.min.js && \ wget https://raw.githubusercontent.com/es-shims/es5-shim/v4.0.3/es5-shim.min.js && \ wget https://raw.githubusercontent.com/es-shims/es5-shim/v4.0.3/es5-shim.map && \ mkdir -p ${prefix}/var/www/img && \ cd ${prefix}/var/www/img && \ - wget -r -np -nH --cut-dirs=4 -A "png,css" -P jquery-ui/themes http://jquery-ui.googlecode.com/svn/tags/1.8.17/themes/base/ && \ - wget -r -np -nH --cut-dirs=4 -A "png,css" -P jquery-ui/themes http://jquery-ui.googlecode.com/svn/tags/1.8.17/themes/smoothness/ && \ - wget -r -np -nH --cut-dirs=4 -A "png,css" -P jquery-ui/themes http://jquery-ui.googlecode.com/svn/tags/1.8.17/themes/redmond/ && \ - wget --no-check-certificate -O datatables_jquery-ui.css https://raw.githubusercontent.com/DataTables/DataTables/1.10.0/media/css/demo_table_jui.css && \ - wget -N http://jquery-ui.googlecode.com/svn/tags/1.8.17/themes/redmond/jquery-ui.css && \ - wget -N http://jquery-ui.googlecode.com/svn/tags/1.8.17/demos/images/calendar.gif && \ - wget -r -np -nH --cut-dirs=5 -A "png" http://jquery-ui.googlecode.com/svn/tags/1.8.17/themes/redmond/images/) + wget http://invenio-software.org/download/jquery/jquery-ui.tar.gz && \ + tar xvfz jquery-ui.tar.gz && \ + rm jquery-ui.tar.gz && \ + wget -O datatables_jquery-ui.css https://raw.githubusercontent.com/DataTables/DataTables/1.10.0/media/css/demo_table_jui.css --no-check-certificate && \ + cp jquery-ui/themes/redmond/jquery-ui.css . && \ + wget -N http://invenio-software.org/download/jquery/v1.5/img/calendar.gif && \ + cp jquery-ui/themes/redmond/images/*.png .) @echo "***********************************************************" @echo "** The jQuery plugins were successfully installed. 
**" @echo "***********************************************************" uninstall-jquery-plugins: (cd ${prefix}/var/www/js && \ rm -f jquery.min.js && \ rm -f jquery.MultiFile.pack.js && \ rm -f jquery.jeditable.mini.js && \ rm -f jquery.flot.selection.min.js && \ rm -f jquery.flot.min.js && \ rm -f excanvas.min.js && \ rm -f jquery-ui-timepicker-addon.min.js && \ rm -f json2.js && \ rm -f jquery.uploadify.min.js && \ rm -rf tablesorter && \ rm -rf jquery-treeview && \ rm -f jquery.ajaxPager.js && \ rm -f jquery.form.js && \ rm -f jquery.dataTables.min.js && \ rm -f ui.core.js && \ rm -f jquery.bookmark.min.js && \ rm -f handlebars.min.js && \ rm -f typeahead.bundle.min.js && \ rm -f es5-shim.min.js && \ rm -f es5-shim.map && \ rm -f jquery.dataTables.ColVis.min.js && \ rm -f jquery.hotkeys.js && \ rm -f jquery.tablesorter.min.js && \ rm -f jquery-ui-1.7.3.custom.min.js && \ rm -f jquery.metadata.js && \ rm -f jquery-latest.js && \ rm -f jquery-ui.min.js && \ rm -rf jquery-lightbox && \ rm -f jquery-ui-timepicker-addon.js && \ rm -rf jqueryFileTree && \ rm -f swfobject.js && \ rm -f jquery.blockUI.js && \ rm -f sly.min.js && \ rm -f parsley.js && \ rm -f spin.min.js && \ rm -f jquery.omniwindow.js) (cd ${prefix}/var/www/img && \ rm -f cancel.png uploadify.css uploadify.swf uploadify.allglyphs.swf uploadify.fla && \ rm -f datatables_jquery-ui.css \ rm -f bookmarks.png \ rm -f demo_table_jui.css \ rm -f calendar.gif \ rm -rf jquery-ui/themes) && \ (cd ${prefix}/var/www/css && \ rm -f jquery.bookmark.css) @echo "***********************************************************" @echo "** The jquery plugins were successfully uninstalled. **" @echo "***********************************************************" install-ckeditor-plugin: @echo "***********************************************************" @echo "** Installing CKeditor plugin, please wait... **" @echo "***********************************************************" rm -rf ${prefix}/lib/python/invenio/ckeditor/ rm -rf /tmp/invenio-ckeditor-plugin mkdir /tmp/invenio-ckeditor-plugin (cd /tmp/invenio-ckeditor-plugin && \ wget 'http://invenio-software.org/download/ckeditor/$(CKEDITOR)' && \ unzip -u -d ${prefix}/var/www $(CKEDITOR)) && \ find ${prefix}/var/www/ckeditor/ -depth -name '_*' -exec rm -rf {} \; && \ find ${prefix}/var/www/ckeditor/ckeditor* -maxdepth 0 ! -name "ckeditor.js" -exec rm -r {} \; && \ rm -fr /tmp/invenio-ckeditor-plugin @echo "* Installing Invenio-specific CKeditor config..." (cd $(top_srcdir)/modules/webstyle/etc && make install) @echo "***********************************************************" @echo "** The CKeditor plugin was successfully installed. **" @echo "** Please do not forget to properly set the option **" @echo "** CFG_WEBCOMMENT_USE_RICH_TEXT_EDITOR in invenio.conf. **" @echo "***********************************************************" uninstall-ckeditor-plugin: @rm -rvf ${prefix}/var/www/ckeditor @rm -rvf ${prefix}/lib/python/invenio/ckeditor @echo "***********************************************************" @echo "** The CKeditor plugin was successfully uninstalled. **" @echo "***********************************************************" install-pdfa-helper-files: @echo "***********************************************************" @echo "** Installing PDF/A helper files, please wait... 
**" @echo "***********************************************************" wget 'http://invenio-software.org/download/invenio-demo-site-files/ISOCoatedsb.icc' -O ${prefix}/etc/websubmit/file_converter_templates/ISOCoatedsb.icc @echo "***********************************************************" @echo "** The PDF/A helper files were successfully installed. **" @echo "***********************************************************" install-mediaelement: @echo "***********************************************************" @echo "** MediaElement.js, please wait... **" @echo "***********************************************************" rm -rf /tmp/mediaelement mkdir /tmp/mediaelement - wget 'http://github.com/johndyer/mediaelement/zipball/master' -O '/tmp/mediaelement/mediaelement.zip' --no-check-certificate + wget 'http://github.com/johndyer/mediaelement/zipball/2.18.1' -O '/tmp/mediaelement/mediaelement.zip' --no-check-certificate unzip -u -d '/tmp/mediaelement' '/tmp/mediaelement/mediaelement.zip' rm -rf ${prefix}/var/www/mediaelement mkdir ${prefix}/var/www/mediaelement mv /tmp/mediaelement/johndyer-mediaelement-*/build/* ${prefix}/var/www/mediaelement rm -rf /tmp/mediaelement @echo "***********************************************************" @echo "** MediaElement.js was successfully installed. **" @echo "***********************************************************" uninstall-pdfa-helper-files: rm -f ${prefix}/etc/websubmit/file_converter_templates/ISOCoatedsb.icc @echo "***********************************************************" @echo "** The PDF/A helper files were successfully uninstalled. **" @echo "***********************************************************" #Solrutils allows automatic installation, running and searching of an external Solr index. install-solrutils: @echo "***********************************************************" @echo "** Installing Solrutils and solr, please wait... **" @echo "***********************************************************" cd $(prefix)/lib && \ if test -d apache-solr*; then echo A solr directory already exists in `pwd` . \ Please remove it manually, if you are sure it is not needed; exit 2; fi ; \ if test -f apache-solr*; then echo solr tarball already exists in `pwd` . 
\ Please remove it manually.; exit 2; fi ; \ wget http://archive.apache.org/dist/lucene/solr/3.1.0/apache-solr-3.1.0.tgz && \ tar -xzf apache-solr-3.1.0.tgz && \ rm apache-solr-3.1.0.tgz cd $(solrdir)/contrib/ ;\ wget http://mirrors.ibiblio.org/pub/mirrors/maven2/com/jcraft/jzlib/1.0.7/jzlib-1.0.7.jar && \ cd $(solrdir)/contrib/ ;\ jar -xf ../example/webapps/solr.war WEB-INF/lib/lucene-core-3.1.0.jar ; \ if test -d basic-lucene-libs; then rm -rf basic-lucene-libs; fi ; \ mv WEB-INF/lib/ basic-lucene-libs ; \ cp $(solrutils_dir)/schema.xml $(solrdir)/example/solr/conf/ cp $(solrutils_dir)/solrconfig.xml $(solrdir)/example/solr/conf/ cd $(solrutils_dir) && \ javac -classpath $(CLASSPATH) -d $(solrdir)/contrib @$(solrutils_dir)/java_sources.txt && \ cd $(solrdir)/contrib/ && \ jar -cf invenio-solr.jar org/invenio_software/solr/*class update-v0.99.0-tables: cat $(top_srcdir)/modules/miscutil/sql/tabcreate.sql | grep -v 'INSERT INTO upgrade' | ${prefix}/bin/dbexec echo "DROP TABLE IF EXISTS oaiREPOSITORY;" | ${prefix}/bin/dbexec echo "ALTER TABLE bibdoc ADD COLUMN more_info mediumblob NULL default NULL;" | ${prefix}/bin/dbexec echo "ALTER TABLE schTASK ADD COLUMN priority tinyint(4) NOT NULL default 0;" | ${prefix}/bin/dbexec echo "ALTER TABLE schTASK ADD KEY priority (priority);" | ${prefix}/bin/dbexec echo "ALTER TABLE rnkCITATIONDATA DROP PRIMARY KEY;" | ${prefix}/bin/dbexec echo "ALTER TABLE rnkCITATIONDATA ADD PRIMARY KEY (id);" | ${prefix}/bin/dbexec echo "ALTER TABLE rnkCITATIONDATA CHANGE id id mediumint(8) unsigned NOT NULL auto_increment;" | ${prefix}/bin/dbexec echo "ALTER TABLE rnkCITATIONDATA ADD UNIQUE KEY object_name (object_name);" | ${prefix}/bin/dbexec echo "ALTER TABLE sbmPARAMETERS CHANGE value value text NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE sbmAPPROVAL ADD note text NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE hstDOCUMENT CHANGE docsize docsize bigint(15) unsigned NOT NULL;" | ${prefix}/bin/dbexec echo "ALTER TABLE cmtACTIONHISTORY CHANGE client_host client_host int(10) unsigned default NULL;" | ${prefix}/bin/dbexec update-v0.99.1-tables: @echo "Nothing to do; table structure did not change between v0.99.1 and v0.99.2." update-v0.99.2-tables: @echo "Nothing to do; table structure did not change between v0.99.2 and v0.99.3." update-v0.99.3-tables: @echo "Nothing to do; table structure did not change between v0.99.3 and v0.99.4." update-v0.99.4-tables: @echo "Nothing to do; table structure did not change between v0.99.4 and v0.99.5." update-v0.99.5-tables: @echo "Nothing to do; table structure did not change between v0.99.5 and v0.99.6." update-v0.99.6-tables: @echo "Nothing to do; table structure did not change between v0.99.6 and v0.99.7." update-v0.99.7-tables: @echo "Nothing to do; table structure did not change between v0.99.7 and v0.99.8." update-v0.99.8-tables: @echo "Nothing to do; table structure did not change between v0.99.8 and v0.99.9." 
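(Illustration, not part of the Makefile: the update-v0.99.* targets above and below apply schema migrations by piping raw SQL into ${prefix}/bin/dbexec, which executes each statement against the configured Invenio database. A minimal Python sketch of the same idea, assuming the invenio.dbquery.run_sql helper that this patch already imports in bibdocfile.py, would be:

    from invenio.dbquery import run_sql

    def apply_statements(statements):
        """Apply schema-migration statements one by one, in order."""
        for sql in statements:
            run_sql(sql)

    # two of the statements from the update-v0.99.0-tables target above
    apply_statements([
        "ALTER TABLE schTASK ADD COLUMN priority tinyint(4) NOT NULL default 0",
        "ALTER TABLE schTASK ADD KEY priority (priority)",
    ])

The Makefile targets rely on dbexec, which exposes the same statement-by-statement execution from the shell.)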
update-v0.99.9-tables: # from v0.99.9 to v1.0.0-rc0 echo "RENAME TABLE oaiARCHIVE TO oaiREPOSITORY;" | ${prefix}/bin/dbexec cat $(top_srcdir)/modules/miscutil/sql/tabcreate.sql | grep -v 'INSERT INTO upgrade' | ${prefix}/bin/dbexec echo "INSERT INTO knwKB (id,name,description,kbtype) SELECT id,name,description,'' FROM fmtKNOWLEDGEBASES;" | ${prefix}/bin/dbexec echo "INSERT INTO knwKBRVAL (id,m_key,m_value,id_knwKB) SELECT id,m_key,m_value,id_fmtKNOWLEDGEBASES FROM fmtKNOWLEDGEBASEMAPPINGS;" | ${prefix}/bin/dbexec echo "ALTER TABLE sbmPARAMETERS CHANGE name name varchar(40) NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE bibdoc CHANGE docname docname varchar(250) COLLATE utf8_bin NOT NULL default 'file';" | ${prefix}/bin/dbexec echo "ALTER TABLE bibdoc CHANGE status status text NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE bibdoc ADD COLUMN text_extraction_date datetime NOT NULL default '0000-00-00';" | ${prefix}/bin/dbexec echo "ALTER TABLE collection DROP COLUMN restricted;" | ${prefix}/bin/dbexec echo "ALTER TABLE schTASK CHANGE host host varchar(255) NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE hstTASK CHANGE host host varchar(255) NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE bib85x DROP INDEX kv, ADD INDEX kv (value(100));" | ${prefix}/bin/dbexec echo "UPDATE clsMETHOD SET location='http://invenio-software.org/download/invenio-demo-site-files/HEP.rdf' WHERE name='HEP' AND location='';" | ${prefix}/bin/dbexec echo "UPDATE clsMETHOD SET location='http://invenio-software.org/download/invenio-demo-site-files/NASA-subjects.rdf' WHERE name='NASA-subjects' AND location='';" | ${prefix}/bin/dbexec echo "UPDATE accACTION SET name='runoairepository', description='run oairepositoryupdater task' WHERE name='runoaiarchive';" | ${prefix}/bin/dbexec echo "UPDATE accACTION SET name='cfgoaiharvest', description='configure OAI Harvest' WHERE name='cfgbibharvest';" | ${prefix}/bin/dbexec echo "ALTER TABLE accARGUMENT CHANGE value value varchar(255);" | ${prefix}/bin/dbexec echo "UPDATE accACTION SET allowedkeywords='doctype,act,categ' WHERE name='submit';" | ${prefix}/bin/dbexec echo "INSERT INTO accARGUMENT(keyword,value) VALUES ('categ','*');" | ${prefix}/bin/dbexec echo "INSERT INTO accROLE_accACTION_accARGUMENT(id_accROLE,id_accACTION,id_accARGUMENT,argumentlistid) SELECT DISTINCT raa.id_accROLE,raa.id_accACTION,accARGUMENT.id,raa.argumentlistid FROM accROLE_accACTION_accARGUMENT as raa JOIN accACTION on id_accACTION=accACTION.id,accARGUMENT WHERE accACTION.name='submit' and accARGUMENT.keyword='categ' and accARGUMENT.value='*';" | ${prefix}/bin/dbexec echo "UPDATE accACTION SET allowedkeywords='name,with_editor_rights' WHERE name='cfgwebjournal';" | ${prefix}/bin/dbexec echo "INSERT INTO accARGUMENT(keyword,value) VALUES ('with_editor_rights','yes');" | ${prefix}/bin/dbexec echo "INSERT INTO accROLE_accACTION_accARGUMENT(id_accROLE,id_accACTION,id_accARGUMENT,argumentlistid) SELECT DISTINCT raa.id_accROLE,raa.id_accACTION,accARGUMENT.id,raa.argumentlistid FROM accROLE_accACTION_accARGUMENT as raa JOIN accACTION on id_accACTION=accACTION.id,accARGUMENT WHERE accACTION.name='cfgwebjournal' and accARGUMENT.keyword='with_editor_rights' and accARGUMENT.value='yes';" | ${prefix}/bin/dbexec echo "ALTER TABLE bskEXTREC CHANGE id id int(15) unsigned NOT NULL auto_increment;" | ${prefix}/bin/dbexec echo "ALTER TABLE bskEXTREC ADD external_id int(15) NOT NULL default '0';" | ${prefix}/bin/dbexec echo "ALTER TABLE bskEXTREC ADD collection_id 
int(15) unsigned NOT NULL default '0';" | ${prefix}/bin/dbexec echo "ALTER TABLE bskEXTREC ADD original_url text;" | ${prefix}/bin/dbexec echo "ALTER TABLE cmtRECORDCOMMENT ADD status char(2) NOT NULL default 'ok';" | ${prefix}/bin/dbexec echo "ALTER TABLE cmtRECORDCOMMENT ADD KEY status (status);" | ${prefix}/bin/dbexec echo "INSERT INTO sbmALLFUNCDESCR VALUES ('Move_Photos_to_Storage','Attach/edit the pictures uploaded with the \"create_photos_manager_interface()\" function');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFIELDDESC VALUES ('Upload_Photos',NULL,'','R',NULL,NULL,NULL,NULL,NULL,'\"\"\"\r\nThis is an example of element that creates a photos upload interface.\r\nClone it, customize it and integrate it into your submission. Then add function \r\n\'Move_Photos_to_Storage\' to your submission functions list, in order for files \r\nuploaded with this interface to be attached to the record. More information in \r\nthe WebSubmit admin guide.\r\n\"\"\"\r\n\r\nfrom invenio.websubmit_functions.ParamFile import ParamFromFile\r\nfrom invenio.websubmit_functions.Move_Photos_to_Storage import read_param_file, create_photos_manager_interface, get_session_id\r\n\r\n# Retrieve session id\r\ntry:\r\n # User info is defined only in MBI/MPI actions...\r\n session_id = get_session_id(None, uid, user_info) \r\nexcept:\r\n session_id = get_session_id(req, uid, {})\r\n\r\n# Retrieve context\r\nindir = curdir.split(\'/\')[-3]\r\ndoctype = curdir.split(\'/\')[-2]\r\naccess = curdir.split(\'/\')[-1]\r\n\r\n# Get the record ID, if any\r\nsysno = ParamFromFile(\"%s/%s\" % (curdir,\'SN\')).strip()\r\n\r\n\"\"\"\r\nModify below the configuration of the photos manager interface.\r\nNote: \'can_reorder_photos\' parameter is not yet fully taken into consideration\r\n\r\nDocumentation of the function is available by running:\r\necho -e \'from invenio.websubmit_functions.Move_Photos_to_Storage import create_photos_manager_interface as f\\nprint f.__doc__\' | python\r\n\"\"\"\r\ntext += create_photos_manager_interface(sysno, session_id, uid,\r\n doctype, indir, curdir, access,\r\n can_delete_photos=True,\r\n can_reorder_photos=True,\r\n can_upload_photos=True,\r\n editor_width=700,\r\n editor_height=400,\r\n initial_slider_value=100,\r\n max_slider_value=200,\r\n min_slider_value=80)','0000-00-00','0000-00-00',NULL,NULL,0);" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Photos_to_Storage','iconsize');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFIELDDESC VALUES ('Upload_Files',NULL,'','R',NULL,NULL,NULL,NULL,NULL,'\"\"\"\r\nThis is an example of element that creates a file upload interface.\r\nClone it, customize it and integrate it into your submission. Then add function \r\n\'Move_Uploaded_Files_to_Storage\' to your submission functions list, in order for files \r\nuploaded with this interface to be attached to the record. 
More information in \r\nthe WebSubmit admin guide.\r\n\"\"\"\r\nfrom invenio.websubmit_managedocfiles import create_file_upload_interface\r\nfrom invenio.websubmit_functions.Shared_Functions import ParamFromFile\r\n\r\nindir = ParamFromFile(os.path.join(curdir, \'indir\'))\r\ndoctype = ParamFromFile(os.path.join(curdir, \'doctype\'))\r\naccess = ParamFromFile(os.path.join(curdir, \'access\'))\r\ntry:\r\n sysno = int(ParamFromFile(os.path.join(curdir, \'SN\')).strip())\r\nexcept:\r\n sysno = -1\r\nln = ParamFromFile(os.path.join(curdir, \'ln\'))\r\n\r\n\"\"\"\r\nRun the following to get the list of parameters of function \'create_file_upload_interface\':\r\necho -e \'from invenio.websubmit_managedocfiles import create_file_upload_interface as f\\nprint f.__doc__\' | python\r\n\"\"\"\r\ntext = create_file_upload_interface(recid=sysno,\r\n print_outside_form_tag=False,\r\n include_headers=True,\r\n ln=ln,\r\n doctypes_and_desc=[(\'main\',\'Main document\'),\r\n (\'additional\',\'Figure, schema, etc.\')],\r\n can_revise_doctypes=[\'*\'],\r\n can_describe_doctypes=[\'main\'],\r\n can_delete_doctypes=[\'additional\'],\r\n can_rename_doctypes=[\'main\'],\r\n sbm_indir=indir, sbm_doctype=doctype, sbm_access=access)[1]\r\n','0000-00-00','0000-00-00',NULL,NULL,0);" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Uploaded_Files_to_Storage','forceFileRevision');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmALLFUNCDESCR VALUES ('Create_Upload_Files_Interface','Display generic interface to add/revise/delete files. To be used before function \"Move_Uploaded_Files_to_Storage\"');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmALLFUNCDESCR VALUES ('Move_Uploaded_Files_to_Storage','Attach files uploaded with \"Create_Upload_Files_Interface\"')" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Revised_Files_to_Storage','elementNameToDoctype');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Revised_Files_to_Storage','createIconDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Revised_Files_to_Storage','createRelatedFormats');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Revised_Files_to_Storage','iconsize');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Revised_Files_to_Storage','keepPreviousVersionDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmALLFUNCDESCR VALUES ('Move_Revised_Files_to_Storage','Revise files initially uploaded with \"Move_Files_to_Storage\"')" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','maxsize');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','minsize');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','doctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','restrictions');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canDeleteDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canReviseDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canDescribeDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canCommentDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canKeepDoctypes');" | ${prefix}/bin/dbexec echo 
"INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canAddFormatDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canRestrictDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canRenameDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canNameNewFiles');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','createRelatedFormats');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','keepDefault');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','showLinks');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','fileLabel');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','filenameLabel');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','descriptionLabel');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','commentLabel');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','restrictionLabel');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','startDoc');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','endDoc');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','defaultFilenameDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','maxFilesDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Uploaded_Files_to_Storage','iconsize');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Uploaded_Files_to_Storage','createIconDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Report_Number_Generation','nblength');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Second_Report_Number_Generation','2nd_nb_length');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Get_Recid','record_search_pattern');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmALLFUNCDESCR VALUES ('Move_FCKeditor_Files_to_Storage','Transfer files attached to the record with the FCKeditor');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_FCKeditor_Files_to_Storage','input_fields');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Stamp_Uploaded_Files','layer');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Stamp_Replace_Single_File_Approval','layer');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Stamp_Replace_Single_File_Approval','switch_file');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Stamp_Uploaded_Files','switch_file');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Files_to_Storage','paths_and_restrictions');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Files_to_Storage','paths_and_doctypes');" | ${prefix}/bin/dbexec echo "ALTER TABLE cmtRECORDCOMMENT ADD round_name varchar(255) NOT NULL default ''" | ${prefix}/bin/dbexec echo "ALTER TABLE cmtRECORDCOMMENT ADD restriction varchar(50) NOT NULL default ''" | ${prefix}/bin/dbexec echo "ALTER TABLE cmtRECORDCOMMENT ADD in_reply_to_id_cmtRECORDCOMMENT int(15) unsigned NOT NULL 
default '0'" | ${prefix}/bin/dbexec echo "ALTER TABLE cmtRECORDCOMMENT ADD KEY in_reply_to_id_cmtRECORDCOMMENT (in_reply_to_id_cmtRECORDCOMMENT);" | ${prefix}/bin/dbexec echo "ALTER TABLE bskRECORDCOMMENT ADD in_reply_to_id_bskRECORDCOMMENT int(15) unsigned NOT NULL default '0'" | ${prefix}/bin/dbexec echo "ALTER TABLE bskRECORDCOMMENT ADD KEY in_reply_to_id_bskRECORDCOMMENT (in_reply_to_id_bskRECORDCOMMENT);" | ${prefix}/bin/dbexec echo "ALTER TABLE cmtRECORDCOMMENT ADD reply_order_cached_data blob NULL default NULL;" | ${prefix}/bin/dbexec echo "ALTER TABLE bskRECORDCOMMENT ADD reply_order_cached_data blob NULL default NULL;" | ${prefix}/bin/dbexec echo "ALTER TABLE cmtRECORDCOMMENT ADD INDEX (reply_order_cached_data(40));" | ${prefix}/bin/dbexec echo "ALTER TABLE bskRECORDCOMMENT ADD INDEX (reply_order_cached_data(40));" | ${prefix}/bin/dbexec echo -e 'from invenio.webcommentadminlib import migrate_comments_populate_threads_index;\ migrate_comments_populate_threads_index()' | $(PYTHON) echo -e 'from invenio.access_control_firerole import repair_role_definitions;\ repair_role_definitions()' | $(PYTHON) CLEANFILES = *~ *.pyc *.tmp diff --git a/Vagrantfile b/Vagrantfile index 5f90d314a..3d5359b0a 100644 --- a/Vagrantfile +++ b/Vagrantfile @@ -1,68 +1,68 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. # Copyright (C) 2016 CERN. # # Invenio is free software; you can redistribute it # and/or modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be # useful, but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the # Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, # MA 02111-1307, USA. # # In applying this license, CERN does not # waive the privileges and immunities granted to it by virtue of its status # as an Intergovernmental Organization or submit itself to any jurisdiction. # This Vagrant configuration is suitable for Invenio demo site installation as # governed by `.inveniorc`. It uses separate dedicated VMs for various services # in order to better emulate production environment conditions. 
You can install # an Invenio demo site by running: # # $ vagrant up --no-parallel # $ vagrant ssh web -c 'source .inveniorc && /vagrant/scripts/create-instance.sh' # $ vagrant ssh web -c 'source .inveniorc && /vagrant/scripts/populate-instance.sh' # $ firefox http://192.168.50.10/record/1 # $ vagrant ssh web -c 'source .inveniorc && sudo -u www-data /opt/invenio/bin/inveniocfg --run-unit-tests' # $ vagrant ssh web -c 'source .inveniorc && sudo -u www-data /opt/invenio/bin/inveniocfg --run-regression-tests --yes-i-know' #OS = 'hfm4/centos6' OS = 'ubuntu/precise64' Vagrant.configure("2") do |config| if Vagrant.has_plugin?("vagrant-cachier") config.cache.scope = :box end config.vm.define "web" do |web| web.vm.box = OS web.vm.hostname = 'web' web.vm.provision "file", source: ".inveniorc", destination: ".inveniorc" web.vm.provision "shell", inline: "source .inveniorc && /vagrant/scripts/provision-web.sh", privileged: false web.vm.network "forwarded_port", guest: 80, host: 80 web.vm.network "forwarded_port", guest: 443, host: 443 web.vm.network "private_network", ip: ENV.fetch('INVENIO_WEB_HOST','192.168.50.10') web.vm.provider :virtualbox do |vb| - vb.customize ["modifyvm", :id, "--memory", "3072"] + vb.customize ["modifyvm", :id, "--memory", "4096"] vb.customize ["modifyvm", :id, "--cpus", 2] end end config.vm.define "mysql" do |mysql| mysql.vm.box = OS mysql.vm.hostname = 'mysql' mysql.vm.provision "file", source: ".inveniorc", destination: ".inveniorc" mysql.vm.provision "shell", inline: "source .inveniorc && /vagrant/scripts/provision-mysql.sh", privileged: false mysql.vm.network "private_network", ip: ENV.fetch('INVENIO_MYSQL_HOST','192.168.50.11') end end diff --git a/modules/bibdocfile/lib/bibdocfile.py b/modules/bibdocfile/lib/bibdocfile.py index 0cada52da..4f645a990 100644 --- a/modules/bibdocfile/lib/bibdocfile.py +++ b/modules/bibdocfile/lib/bibdocfile.py @@ -1,4976 +1,4976 @@ # This file is part of Invenio. -# Copyright (C) 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014 CERN. +# Copyright (C) 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2016 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ This module implements the low-level API for dealing with fulltext files. - All the files associated to a I{record} (identified by a I{recid}) can be managed via an instance of the C{BibRecDocs} class. - A C{BibRecDocs} is a wrapper of the list of I{documents} attached to the record. - Each document is represented by an instance of the C{BibDoc} class. - A document is identified by a C{docid} and name (C{docname}). The docname must be unique within the record. A document is the set of all the formats and revisions of a piece of information. - A document has a type called C{doctype} and can have a restriction. - Each physical file, i.e. 
the concretization of a document into a particular I{version} and I{format} is represented by an instance of the C{BibDocFile} class. - The format is infact the extension of the physical file. - A comment and a description and other information can be associated to a BibDocFile. - A C{bibdoc} is a synonim for a document, while a C{bibdocfile} is a synonim for a physical file. @group Main classes: BibRecDocs,BibDoc,BibDocFile @group Other classes: BibDocMoreInfo,Md5Folder,InvenioBibDocFileError @group Main functions: decompose_file,stream_file,bibdocfile_*,download_url @group Configuration Variables: CFG_* """ __revision__ = "$Id$" import os import re import shutil import filecmp import time import random import socket import urllib2 import urllib import tempfile import cPickle import base64 import binascii import cgi import sys import copy import tarfile if sys.hexversion < 0x2060000: from md5 import md5 else: from hashlib import md5 # pylint: disable=E0611 try: import magic if hasattr(magic, "open"): CFG_HAS_MAGIC = 1 if not hasattr(magic, "MAGIC_MIME_TYPE"): ## Patching RHEL6/CentOS6 version magic.MAGIC_MIME_TYPE = 16 elif hasattr(magic, "Magic"): CFG_HAS_MAGIC = 2 except ImportError: CFG_HAS_MAGIC = 0 from datetime import datetime from mimetypes import MimeTypes from thread import get_ident from weakref import ref from urlparse import urlsplit, parse_qs from invenio import webinterface_handler_config as apache # Let's set a reasonable timeout for URL request (e.g. FFT) socket.setdefaulttimeout(40) if sys.hexversion < 0x2040000: # pylint: disable=W0622 from sets import Set as set # pylint: enable=W0622 from invenio.shellutils import escape_shell_arg, run_shell_command from invenio.dbquery import run_sql, DatabaseError from invenio.errorlib import register_exception from invenio.bibrecord import record_get_field_instances, \ field_get_subfield_values, field_get_subfield_instances, \ encode_for_xml from invenio.urlutils import create_url, make_user_agent_string from invenio.textutils import nice_size from invenio.webuser import collect_user_info from invenio.access_control_engine import acc_authorize_action from invenio.access_control_admin import acc_is_user_in_role, acc_get_role_id from invenio.access_control_firerole import compile_role_definition, acc_firerole_check_user from invenio.access_control_config import SUPERADMINROLE, CFG_WEBACCESS_WARNING_MSGS from invenio.config import CFG_SITE_URL, \ CFG_WEBDIR, CFG_BIBDOCFILE_FILEDIR,\ CFG_BIBDOCFILE_ADDITIONAL_KNOWN_FILE_EXTENSIONS, \ CFG_BIBDOCFILE_FILESYSTEM_BIBDOC_GROUP_LIMIT, CFG_SITE_SECURE_URL, \ CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS, \ CFG_TMPDIR, CFG_TMPSHAREDDIR, CFG_PATH_MD5SUM, \ CFG_WEBSUBMIT_STORAGEDIR, \ CFG_BIBDOCFILE_USE_XSENDFILE, \ CFG_BIBDOCFILE_MD5_CHECK_PROBABILITY, \ CFG_SITE_RECORD, CFG_PYLIBDIR, \ CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS, \ CFG_BIBDOCFILE_ENABLE_BIBDOCFSINFO_CACHE, \ CFG_BIBDOCFILE_ADDITIONAL_KNOWN_MIMETYPES, \ CFG_BIBDOCFILE_PREFERRED_MIMETYPES_MAPPING, \ CFG_BIBCATALOG_SYSTEM from invenio.bibcatalog import BIBCATALOG_SYSTEM from invenio.bibdocfile_config import CFG_BIBDOCFILE_ICON_SUBFORMAT_RE, \ CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT, CFG_BIBDOCFILE_STREAM_ARCHIVE_FORMATS from invenio.pluginutils import PluginContainer import invenio.template def _plugin_bldr(dummy, plugin_code): """Preparing the plugin dictionary structure""" ret = {} ret['create_instance'] = getattr(plugin_code, "create_instance", None) ret['supports'] = getattr(plugin_code, "supports", None) return ret _CFG_BIBDOC_PLUGINS = 
None def get_plugins(): """ Lazy loading of plugins """ global _CFG_BIBDOC_PLUGINS if _CFG_BIBDOC_PLUGINS is None: _CFG_BIBDOC_PLUGINS = PluginContainer( os.path.join(CFG_PYLIBDIR, 'invenio', 'bibdocfile_plugins', 'bom_*.py'), plugin_builder=_plugin_bldr) return _CFG_BIBDOC_PLUGINS bibdocfile_templates = invenio.template.load('bibdocfile') # The above flag controls whether HTTP range requests are supported or not # when serving static files via Python. This is disabled by default as # it currently breaks support for opening PDF files on Windows platforms # using Acrobat reader brower plugin. CFG_ENABLE_HTTP_RANGE_REQUESTS = False #: block size when performing I/O. CFG_BIBDOCFILE_BLOCK_SIZE = 1024 * 8 #: threshold used do decide when to use Python MD5 of CLI MD5 algorithm. CFG_BIBDOCFILE_MD5_THRESHOLD = 256 * 1024 #: chunks loaded by the Python MD5 algorithm. CFG_BIBDOCFILE_MD5_BUFFER = 1024 * 1024 #: whether to normalize e.g. ".JPEG" and ".jpg" into .jpeg. CFG_BIBDOCFILE_STRONG_FORMAT_NORMALIZATION = False #: flags that can be associated to files. CFG_BIBDOCFILE_AVAILABLE_FLAGS = ( 'PDF/A', 'STAMPED', 'PDFOPT', 'HIDDEN', 'CONVERTED', 'PERFORM_HIDE_PREVIOUS', 'OCRED' ) DBG_LOG_QUERIES = False #: constant used if FFT correct with the obvious meaning. KEEP_OLD_VALUE = 'KEEP-OLD-VALUE' _CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS = [(re.compile(_regex), _headers) for _regex, _headers in CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS] _mimes = MimeTypes(strict=False) _mimes.suffix_map.update({'.tbz2' : '.tar.bz2'}) _mimes.encodings_map.update({'.bz2' : 'bzip2'}) if CFG_BIBDOCFILE_ADDITIONAL_KNOWN_MIMETYPES: for key, value in CFG_BIBDOCFILE_ADDITIONAL_KNOWN_MIMETYPES.iteritems(): _mimes.add_type(key, value) del key, value _magic_cookies = {} if CFG_HAS_MAGIC == 1: def _get_magic_cookies(): """ @return: a tuple of magic object. @rtype: (MAGIC_NONE, MAGIC_COMPRESS, MAGIC_MIME, MAGIC_COMPRESS + MAGIC_MIME) @note: ... not real magic. Just see: man file(1) """ thread_id = get_ident() if thread_id not in _magic_cookies: _magic_cookies[thread_id] = { magic.MAGIC_NONE: magic.open(magic.MAGIC_NONE), magic.MAGIC_COMPRESS: magic.open(magic.MAGIC_COMPRESS), magic.MAGIC_MIME: magic.open(magic.MAGIC_MIME), magic.MAGIC_COMPRESS + magic.MAGIC_MIME: magic.open(magic.MAGIC_COMPRESS + magic.MAGIC_MIME), magic.MAGIC_MIME_TYPE: magic.open(magic.MAGIC_MIME_TYPE), } for key in _magic_cookies[thread_id].keys(): _magic_cookies[thread_id][key].load() return _magic_cookies[thread_id] elif CFG_HAS_MAGIC == 2: def _magic_wrapper(local_path, mime=True, mime_encoding=False): thread_id = get_ident() if (thread_id, mime, mime_encoding) not in _magic_cookies: magic_object = _magic_cookies[thread_id, mime, mime_encoding] = magic.Magic(mime=mime, mime_encoding=mime_encoding) else: magic_object = _magic_cookies[thread_id, mime, mime_encoding] return magic_object.from_file(local_path) # pylint: disable=E1103 def _generate_extensions(): """ Generate the regular expression to match all the known extensions. @return: the regular expression. @rtype: regular expression object """ _tmp_extensions = _mimes.encodings_map.keys() + \ _mimes.suffix_map.keys() + \ _mimes.types_map[1].keys() + \ CFG_BIBDOCFILE_ADDITIONAL_KNOWN_FILE_EXTENSIONS extensions = [] for ext in _tmp_extensions: if ext.startswith('.'): extensions.append(ext) else: extensions.append('.' 
+ ext) extensions.sort() extensions.reverse() extensions = set([ext.lower() for ext in extensions]) extensions = '\\' + '$|\\'.join(extensions) + '$' extensions = extensions.replace('+', '\\+') return re.compile(extensions, re.I) #: Regular expression to recognized extensions. _extensions = _generate_extensions() class InvenioBibDocFileError(Exception): """ Exception raised in case of errors related to fulltext files. """ pass class InvenioBibdocfileUnauthorizedURL(InvenioBibDocFileError): """ Exception raised in case of errors related to fulltext files. """ ## NOTE: this is a legacy Exception pass def _val_or_null(val, eq_name = None, q_str = None, q_args = None): """ Auxiliary function helpful while building WHERE clauses of SQL queries that should contain field=val or field is val If optional parameters q_str and q_args are provided, lists are updated if val == None, a statement of the form "eq_name is Null" is returned otherwise, otherwise the function returns a parametrised comparison "eq_name=%s" with val as an argument added to the query args list. Using parametrised queries diminishes the likelihood of having SQL injection. @param val Value to compare with @type val @param eq_name The name of the database column @type eq_name string @param q_str Query string builder - list of clauses that should be connected by AND operator @type q_str list @param q_args Query arguments list. This list will be applied as a second argument of run_sql command @type q_args list @result string of a single part of WHERE clause @rtype string """ res = "" if eq_name != None: res += eq_name if val == None: if eq_name != None: res += " is " res += "NULL" if q_str != None: q_str.append(res) return res else: if eq_name != None: res += "=" res += "%s" if q_str != None: q_str.append(res) if q_args != None: q_args.append(str(val)) return res def _sql_generate_conjunctive_where(to_process): """Generating WHERE clause of a SQL statement, consisting of conjunction of declared terms. Terms are defined by the to_process argument. the method creates appropriate entries different in the case, value should be NULL (None in the list) and in the case of not-none arguments. In the second case, parametrised query is generated decreasing the chance of an SQL-injection. @param to_process List of tuples (value, database_column) @type to_process list""" q_str = [] q_args = [] for entry in to_process: q_str.append(_val_or_null(entry[0], eq_name = entry[1], q_args = q_args)) return (" AND ".join(q_str), q_args) def file_strip_ext(afile, skip_version=False, only_known_extensions=False, allow_subformat=True): """ Strip in the best way the extension from a filename. >>> file_strip_ext("foo.tar.gz") 'foo' >>> file_strip_ext("foo.buz.gz") 'foo.buz' >>> file_strip_ext("foo.buz") 'foo' >>> file_strip_ext("foo.buz", only_known_extensions=True) 'foo.buz' >>> file_strip_ext("foo.buz;1", skip_version=False, ... only_known_extensions=True) 'foo.buz;1' >>> file_strip_ext("foo.gif;icon") 'foo' >>> file_strip_ext("foo.gif:icon", allow_subformat=False) 'foo.gif:icon' @param afile: the path/name of a file. @type afile: string @param skip_version: whether to skip a trailing ";version". @type skip_version: bool @param only_known_extensions: whether to strip out only known extensions or to consider as extension anything that follows a dot. @type only_known_extensions: bool @param allow_subformat: whether to consider also subformats as part of the extension. @type allow_subformat: bool @return: the name/path without the extension (and version). 
@rtype: string """ if skip_version or allow_subformat: afile = afile.split(';')[0] nextfile = _extensions.sub('', afile) if nextfile == afile and not only_known_extensions: nextfile = os.path.splitext(afile)[0] while nextfile != afile: afile = nextfile nextfile = _extensions.sub('', afile) return nextfile def normalize_format(docformat, allow_subformat=True): """ Normalize the format, e.g. by adding a dot in front. @param format: the format/extension to be normalized. @type format: string @param allow_subformat: whether to consider also subformats as part of the extension. @type allow_subformat: bool @return: the normalized format. @rtype; string """ if not docformat: return '' if allow_subformat: subformat = docformat[docformat.rfind(';'):] docformat = docformat[:docformat.rfind(';')] else: subformat = '' if docformat and docformat[0] != '.': docformat = '.' + docformat if CFG_BIBDOCFILE_STRONG_FORMAT_NORMALIZATION: if docformat not in ('.Z', '.H', '.C', '.CC'): docformat = docformat.lower() docformat = { '.jpg' : '.jpeg', '.htm' : '.html', '.tif' : '.tiff' }.get(docformat, docformat) return docformat + subformat def guess_format_from_url(url): """ Given a URL tries to guess it's extension. Different method will be used, including HTTP HEAD query, downloading the resource and using mime @param url: the URL for which the extension shuld be guessed. @type url: string @return: the recognized extension or '.bin' if it's impossible to recognize it. @rtype: string """ def guess_via_magic(local_path): try: if CFG_HAS_MAGIC == 1: magic_cookie = _get_magic_cookies()[magic.MAGIC_MIME_TYPE] mimetype = magic_cookie.file(local_path) elif CFG_HAS_MAGIC == 2: mimetype = _magic_wrapper(local_path, mime=True, mime_encoding=False) if CFG_HAS_MAGIC: if mimetype in CFG_BIBDOCFILE_PREFERRED_MIMETYPES_MAPPING: return normalize_format(CFG_BIBDOCFILE_PREFERRED_MIMETYPES_MAPPING[mimetype]) else: return normalize_format(_mimes.guess_extension(mimetype)) except Exception: pass ## Let's try to guess the extension by considering the URL as a filename ext = decompose_file(url, skip_version=True, only_known_extensions=True)[2] if ext.startswith('.'): return ext if is_url_a_local_file(url): ## The URL corresponds to a local file, so we can safely consider ## traditional extensions after the dot. ext = decompose_file(url, skip_version=True, only_known_extensions=False)[2] if ext.startswith('.'): return ext ## No extensions? Let's use Magic. ext = guess_via_magic(url) if ext: return ext else: ## Since the URL is remote, let's try to perform a HEAD request ## and see the corresponding headers try: response = open_url(url, head_request=True) except (InvenioBibdocfileUnauthorizedURL, urllib2.URLError): return ".bin" ext = get_format_from_http_response(response) if ext: return ext if CFG_HAS_MAGIC: ## Last solution: let's download the remote resource ## and use the Python magic library to guess the extension filename = "" try: try: filename = download_url(url, docformat='') ext = guess_via_magic(filename) if ext: return ext except Exception: pass finally: if os.path.exists(filename): ## Let's free space os.remove(filename) return ".bin" _docname_re = re.compile(r'[^-\w.]*') def normalize_docname(docname): """ Normalize the docname. At the moment the normalization is just returning the same string. @param docname: the docname to be normalized. @type docname: string @return: the normalized docname. @rtype: string """ #return _docname_re.sub('', docname) return docname def normalize_version(version): """ Normalize the version. 
The version can be either an integer or the keyword 'all'. Any other value will be transformed into the empty string. @param version: the version (either a number or 'all'). @type version: integer or string @return: the normalized version. @rtype: string """ try: int(version) except ValueError: if version.lower().strip() == 'all': return 'all' else: return '' return str(version) def compose_file(dirname, extension, subformat=None, version=None, storagename=None): """ Construct back a fullpath given the separate components. @param @param storagename Name under which the file should be stored in the filesystem @type storagename string @return a fullpath to the file @rtype string """ if version: version = ";%i" % int(version) else: version = "" if subformat: if not subformat.startswith(";"): subformat = ";%s" % subformat else: subformat = "" if extension and not extension.startswith("."): extension = ".%s" % extension if not storagename: storagename = "content" return os.path.join(dirname, storagename + extension + subformat + version) def compose_format(extension, subformat=None): """ Construct the format string """ if not extension.startswith("."): extension = ".%s" % extension if subformat: if not subformat.startswith(";"): subformat = ";%s" % subformat else: subformat = "" return extension + subformat def decompose_file(afile, skip_version=False, only_known_extensions=False, allow_subformat=True): """ Decompose a file/path into its components dirname, basename and extension. >>> decompose_file('/tmp/foo.tar.gz') ('/tmp', 'foo', '.tar.gz') >>> decompose_file('/tmp/foo.tar.gz;1', skip_version=True) ('/tmp', 'foo', '.tar.gz') >>> decompose_file('http://www.google.com/index.html') ('http://www.google.com', 'index', '.html') @param afile: the path/name of a file. @type afile: string @param skip_version: whether to skip a trailing ";version". @type skip_version: bool @param only_known_extensions: whether to strip out only known extensions or to consider as extension anything that follows a dot. @type only_known_extensions: bool @param allow_subformat: whether to consider also subformats as part of the extension. @type allow_subformat: bool @return: a tuple with the directory name, the basename and extension. @rtype: (dirname, basename, extension) @note: if a URL is provided, the scheme will be part of the dirname. @see: L{file_strip_ext} for the algorithm used to retrieve the extension. """ if skip_version: version = afile.split(';')[-1] try: int(version) afile = afile[:-len(version)-1] except ValueError: pass basename = os.path.basename(afile) dirname = afile[:-len(basename)-1] base = file_strip_ext( basename, only_known_extensions=only_known_extensions, allow_subformat=allow_subformat) extension = basename[len(base) + 1:] if extension: extension = '.' + extension return (dirname, base, extension) def decompose_file_with_version(afile): """ Decompose a file into dirname, basename, extension and version. >>> decompose_file_with_version('/tmp/foo.tar.gz;1') ('/tmp', 'foo', '.tar.gz', 1) @param afile: the path/name of a file. @type afile: string @return: a tuple with the directory name, the basename, extension and version. @rtype: (dirname, basename, extension, version) @raise ValueError: in case version does not exist it will. @note: if a URL is provided, the scheme will be part of the dirname. 
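# Illustration only (not in the original module): compose_file() above has no
# doctest of its own, so here is how it reconstructs a path from its separate
# components, using the default storage name 'content' (compare the
# decompose_file() examples above):
#
#   >>> compose_file('/tmp', '.tar.gz', version=2)
#   '/tmp/content.tar.gz;2'
#   >>> compose_file('/tmp', 'gif', subformat='icon')
#   '/tmp/content.gif;icon'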
""" version_str = afile.split(';')[-1] version = int(version_str) afile = afile[:-len(version_str)-1] basename = os.path.basename(afile) dirname = afile[:-len(basename)-1] base = file_strip_ext(basename) extension = basename[len(base) + 1:] if extension: extension = '.' + extension return (dirname, base, extension, version) def get_subformat_from_format(docformat): """ @return the subformat if any. @rtype: string >>> get_subformat_from_format('foo;bar') 'bar' >>> get_subformat_from_format('foo') '' """ try: return docformat[docformat.rindex(';') + 1:] except ValueError: return '' def get_superformat_from_format(docformat): """ @return the superformat if any. @rtype: string >>> get_superformat_from_format('foo;bar') 'foo' >>> get_superformat_from_format('foo') 'foo' """ try: return docformat[:docformat.rindex(';')] except ValueError: return docformat def propose_next_docname(docname): """ Given a I{docname}, suggest a new I{docname} (useful when trying to generate a unique I{docname}). >>> propose_next_docname('foo') 'foo_1' >>> propose_next_docname('foo_1') 'foo_2' >>> propose_next_docname('foo_10') 'foo_11' @param docname: the base docname. @type docname: string @return: the next possible docname based on the given one. @rtype: string """ if '_' in docname: split_docname = docname.split('_') try: split_docname[-1] = str(int(split_docname[-1]) + 1) docname = '_'.join(split_docname) except ValueError: docname += '_1' else: docname += '_1' return docname class BibRecDocs(object): """ This class represents all the files attached to one record. @param recid: the record identifier. @type recid: integer @param deleted_too: whether to consider deleted documents as normal documents (useful when trying to recover deleted information). @type deleted_too: bool @param human_readable: whether numbers should be printed in human readable format (e.g. 2048 bytes -> 2Kb) @ivar id: the record identifier as passed to the constructor. @type id: integer @ivar human_readable: the human_readable flag as passed to the constructor. @type human_readable: bool @ivar deleted_too: the deleted_too flag as passed to the constructor. @type deleted_too: bool @ivar bibdocs: the list of documents attached to the record. @type bibdocs: list of BibDoc """ def __init__(self, recid, deleted_too=False, human_readable=False): try: self.id = int(recid) except ValueError: raise ValueError("BibRecDocs: recid is %s but must be an integer." % repr(recid)) self.human_readable = human_readable self.deleted_too = deleted_too self.attachment_types = {} # dictionary docname->attachment type self._bibdocs = [] self.dirty = True @property def bibdocs(self): if self.dirty: self.build_bibdoc_list() return self._bibdocs def __repr__(self): """ @return: the canonical string representation of the C{BibRecDocs}. @rtype: string """ return 'BibRecDocs(%s%s%s)' % (self.id, self.deleted_too and ', True' or '', self.human_readable and ', True' or '' ) def __str__(self): """ @return: an easy to be I{grepped} string representation of the whole C{BibRecDocs} content. @rtype: string """ out = '%i::::total bibdocs attached=%i\n' % (self.id, len(self.bibdocs)) out += '%i::::total size latest version=%s\n' % (self.id, nice_size(self.get_total_size_latest_version())) out += '%i::::total size all files=%s\n' % (self.id, nice_size(self.get_total_size())) for (docname, (bibdoc, dummy)) in self.bibdocs.items(): out += str(docname) + ":" + str(bibdoc) return out def empty_p(self): """ @return: True when the record has no attached documents. 
@rtype: bool """ return len(self.bibdocs) == 0 def deleted_p(self): """ @return: True if the correxsponding record has been deleted. @rtype: bool """ from invenio.search_engine import record_exists return record_exists(self.id) == -1 def get_xml_8564(self): """ Return a snippet of I{MARCXML} representing the I{8564} fields corresponding to the current state. @return: the MARCXML representation. @rtype: string """ from invenio.search_engine import get_record out = '' record = get_record(self.id) fields = record_get_field_instances(record, '856', '4', ' ') for field in fields: urls = field_get_subfield_values(field, 'u') if urls and not bibdocfile_url_p(urls[0]): out += '\t\n' for subfield, value in field_get_subfield_instances(field): out += '\t\t%s\n' % (subfield, encode_for_xml(value)) out += '\t\n' for afile in self.list_latest_files(list_hidden=False): out += '\t\n' url = afile.get_url() description = afile.get_description() comment = afile.get_comment() if url: out += '\t\t%s\n' % encode_for_xml(url) if description: out += '\t\t%s\n' % encode_for_xml(description) if comment: out += '\t\t%s\n' % encode_for_xml(comment) out += '\t\n' return out def get_total_size_latest_version(self, user_info=None, subformat=None): """ Returns the total size used on disk by all the files belonging to this record and corresponding to the latest version. @param user_info: the user_info dictionary, used to check restrictions @type: dict @param subformat: if subformat is specified, it limits files only to those from that specific subformat @type subformat: string @return: the total size. @rtype: integer """ size = 0 for (bibdoc, _) in self.bibdocs.values(): size += bibdoc.get_total_size_latest_version(user_info, subformat) return size def get_total_size(self): """ Return the total size used on disk of all the files belonging to this record of any version (not only the last as in L{get_total_size_latest_version}). @return: the total size. @rtype: integer """ size = 0 for (bibdoc, _) in self.bibdocs.values(): size += bibdoc.get_total_size() return size def build_bibdoc_list(self): """ This method must be called everytime a I{bibdoc} is added, removed or modified. """ self._bibdocs = {} if self.deleted_too: res = run_sql("""SELECT brbd.id_bibdoc, brbd.docname, brbd.type FROM bibrec_bibdoc as brbd JOIN bibdoc as bd ON bd.id=brbd.id_bibdoc WHERE brbd.id_bibrec=%s ORDER BY brbd.docname ASC""", (self.id,)) else: res = run_sql("""SELECT brbd.id_bibdoc, brbd.docname, brbd.type FROM bibrec_bibdoc as brbd JOIN bibdoc as bd ON bd.id=brbd.id_bibdoc WHERE brbd.id_bibrec=%s AND bd.status<>'DELETED' ORDER BY brbd.docname ASC""", (self.id,)) for row in res: cur_doc = BibDoc.create_instance(docid=row[0], recid=self.id, human_readable=self.human_readable) self._bibdocs[row[1]] = (cur_doc, row[2]) self.dirty = False def list_bibdocs_by_names(self, doctype=None): """ Returns the dictionary of all bibdocs object belonging to a recid. Keys in the dictionary are names of documetns and values are BibDoc objects. If C{doctype} is set, it returns just the bibdocs of that doctype. @param doctype: the optional doctype. @type doctype: string @return: the dictionary of bibdocs. 
@rtype: dictionary of Dcname -> BibDoc """ if not doctype: return dict((k, v) for (k, (v, _)) in self.bibdocs.iteritems()) res = {} for (docname, (doc, attachmenttype)) in self.bibdocs.iteritems(): if attachmenttype == doctype: res[docname] = doc return res def list_bibdocs(self, doctype=None, rel_type=None): """ Returns the list all bibdocs object belonging to a recid. If C{doctype} is set, it returns just the bibdocs of that doctype. @param doctype: the optional doctype. @type doctype: string @return: the list of bibdocs. @rtype: list of BibDoc """ return [bibdoc for (bibdoc, rtype) in self.bibdocs.values() if (not doctype or doctype == bibdoc.doctype) and (rel_type is None or rel_type == rtype)] def get_bibdoc_names(self, doctype=None): """ Returns all the names of the documents associated with the bibrec. If C{doctype} is set, restrict the result to all the matching doctype. @param doctype: the optional doctype. @type doctype: string @return: the list of document names. @rtype: list of string """ return [docname for (docname, dummy) in self.list_bibdocs_by_names(doctype).items()] def check_file_exists(self, path, f_format): """ Check if a file with the same content of the file pointed in C{path} is already attached to this record. @param path: the file to be checked against. @type path: string @return: True if a file with the requested content is already attached to the record. @rtype: bool """ size = os.path.getsize(path) # Let's consider all the latest files files = self.list_latest_files() # Let's consider all the latest files with same size potential = [afile for afile in files if afile.get_size() == size and afile.format == f_format] if potential: checksum = calculate_md5(path) # Let's consider all the latest files with the same size and the # same checksum potential = [afile for afile in potential if afile.get_checksum() == checksum] if potential: potential = [afile for afile in potential if filecmp.cmp(afile.get_full_path(), path)] if potential: return True else: # Gosh! How unlucky, same size, same checksum but not same # content! pass return False def propose_unique_docname(self, docname): """ Given C{docname}, return a new docname that is not already attached to the record. @param docname: the reference docname. @type docname: string @return: a docname not already attached. @rtype: string """ docname = normalize_docname(docname) goodname = docname i = 1 while goodname in self.get_bibdoc_names(): i += 1 goodname = "%s_%s" % (docname, i) return goodname def merge_bibdocs(self, docname1, docname2): """ This method merge C{docname2} into C{docname1}. 1. Given all the formats of the latest version of the files attached to C{docname2}, these files are added as new formats into C{docname1}. 2. C{docname2} is marked as deleted. @raise InvenioBibDocFileError: if at least one format in C{docname2} already exists in C{docname1}. (In this case the two bibdocs are preserved) @note: comments and descriptions are also copied. @note: if C{docname2} has a I{restriction}(i.e. if the I{status} is set) and C{docname1} doesn't, the restriction is imported. """ bibdoc1 = self.get_bibdoc(docname1) bibdoc2 = self.get_bibdoc(docname2) ## Check for possibility for bibdocfile in bibdoc2.list_latest_files(): docformat = bibdocfile.get_format() if bibdoc1.format_already_exists_p(docformat): raise InvenioBibDocFileError('Format %s already exists in bibdoc %s of record %s. It\'s impossible to merge bibdoc %s into it.' % (docformat, docname1, self.id, docname2)) ## Importing restriction if needed. 
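## Note: the restriction is copied from docname2 only when docname1 carries
## no status of its own; if both bibdocs are restricted, docname1 keeps its
## original status.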
restriction1 = bibdoc1.get_status() restriction2 = bibdoc2.get_status() if restriction2 and not restriction1: bibdoc1.set_status(restriction2) ## Importing formats for bibdocfile in bibdoc2.list_latest_files(): docformat = bibdocfile.get_format() comment = bibdocfile.get_comment() description = bibdocfile.get_description() bibdoc1.add_file_new_format(bibdocfile.get_full_path(), description=description, comment=comment, docformat=docformat) ## Finally deleting old bibdoc2 bibdoc2.delete() self.dirty = True def get_docid(self, docname): """ @param docname: the document name. @type docname: string @return: the identifier corresponding to the given C{docname}. @rtype: integer @raise InvenioBibDocFileError: if the C{docname} does not corresponds to a document attached to this record. """ if docname in self.bibdocs: return self.bibdocs[docname][0].id raise InvenioBibDocFileError, "Recid '%s' is not connected with a " \ "docname '%s'" % (self.id, docname) def get_docname(self, docid): """ @param docid: the document identifier. @type docid: integer @return: the name of the document corresponding to the given document identifier. @rtype: string @raise InvenioBibDocFileError: if the C{docid} does not corresponds to a document attached to this record. """ for (docname, (bibdoc, _)) in self.bibdocs.items(): if bibdoc.id == docid: return docname raise InvenioBibDocFileError, "Recid '%s' is not connected with a " \ "docid '%s'" % (self.id, docid) def change_name(self, newname, oldname=None, docid=None): """ Renames document of a given name. @param newname: the new name. @type newname: string @raise InvenioBibDocFileError: if the new name corresponds to a document already attached to the record owning this document. """ if not oldname and not docid: raise StandardError("Trying to rename unspecified document") if not oldname: oldname = self.get_docname(docid) if not docid: docid = self.get_docid(oldname) doc, atttype = self.bibdocs[oldname] newname = normalize_docname(newname) res = run_sql("SELECT id_bibdoc FROM bibrec_bibdoc WHERE id_bibrec=%s AND docname=%s", (self.id, newname)) if res: raise InvenioBibDocFileError, "A bibdoc called %s already exists for recid %s" % (newname, self.id) doc.change_name(self.id, newname) # updating the record structure del self._bibdocs[oldname] self._bibdocs[newname] = (doc, atttype) def has_docname_p(self, docname): """ @param docname: the document name, @type docname: string @return: True if a document with the given name is attached to this record. @rtype: bool """ return docname in self.bibdocs.keys() def get_bibdoc(self, docname): """ @return: the bibdoc with a particular docname associated with this recid""" if docname in self.bibdocs: return self.bibdocs[docname][0] raise InvenioBibDocFileError, "Recid '%s' is not connected with " \ " docname '%s'" % (self.id, docname) def delete_bibdoc(self, docname): """ Deletes the document with the specified I{docname}. @param docname: the document name. @type docname: string """ if docname in self.bibdocs: self.bibdocs[docname][0].delete() self.dirty = True def add_bibdoc(self, doctype="Main", docname='file', never_fail=False): """ Add a new empty document object (a I{bibdoc}) to the list of documents of this record. @param doctype: the document type. @type doctype: string @param docname: the document name. @type docname: string @param never_fail: if True, this procedure will not fail, even if a document with the given name is already attached to this record. 
In this case a new name will be generated (see L{propose_unique_docname}). @type never_fail: bool @return: the newly created document object. @rtype: BibDoc @raise InvenioBibDocFileError: in case of any error. """ try: docname = normalize_docname(docname) if never_fail: docname = self.propose_unique_docname(docname) if docname in self.get_bibdoc_names(): raise InvenioBibDocFileError, \ "%s has already a bibdoc with docname %s" % (self.id, docname) else: bibdoc = BibDoc.create_instance(recid=self.id, doctype=doctype, docname=docname, human_readable=self.human_readable) self.dirty = True return bibdoc except Exception, e: register_exception() raise InvenioBibDocFileError(str(e)) def add_new_file(self, fullpath, doctype="Main", docname=None, never_fail=False, description=None, comment=None, docformat=None, flags=None, modification_date=None): """ Directly add a new file to this record. Adds a new file with the following policy: - if the C{docname} is not set it is retrieved from the name of the file. - If a bibdoc with the given docname doesn't already exist, it is created and the file is added to it. - It it exist but it doesn't contain the format that is being added, the new format is added. - If the format already exists then if C{never_fail} is True a new bibdoc is created with a similar name but with a progressive number as a suffix and the file is added to it (see L{propose_unique_docname}). @param fullpath: the filesystme path of the document to be added. @type fullpath: string @param doctype: the type of the document. @type doctype: string @param docname: the document name. @type docname: string @param never_fail: if True, this procedure will not fail, even if a document with the given name is already attached to this record. In this case a new name will be generated (see L{propose_unique_docname}). @type never_fail: bool @param description: an optional description of the file. @type description: string @param comment: an optional comment to the file. @type comment: string @param format: the extension of the file. If not specified it will be guessed (see L{guess_format_from_url}). @type format: string @param flags: a set of flags to be associated with the file (see L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}) @type flags: list of string @return: the elaborated document object. @rtype: BibDoc @raise InvenioBibDocFileError: in case of error. """ if docname is None: docname = decompose_file(fullpath)[1] if docformat is None: docformat = decompose_file(fullpath)[2] docname = normalize_docname(docname) try: bibdoc = self.get_bibdoc(docname) except InvenioBibDocFileError: # bibdoc doesn't already exists! bibdoc = self.add_bibdoc(doctype, docname, False) bibdoc.add_file_new_version(fullpath, description=description, comment=comment, docformat=docformat, flags=flags, modification_date=modification_date) else: try: bibdoc.add_file_new_format(fullpath, description=description, comment=comment, docformat=docformat, flags=flags, modification_date=modification_date) except InvenioBibDocFileError, dummy: # Format already exist! if never_fail: bibdoc = self.add_bibdoc(doctype, docname, True) bibdoc.add_file_new_version(fullpath, description=description, comment=comment, docformat=docformat, flags=flags, modification_date=modification_date) else: raise return bibdoc def add_new_version(self, fullpath, docname=None, description=None, comment=None, docformat=None, flags=None): """ Adds a new file to an already existent document object as a new version. 
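A typical call looks like the following (illustrative sketch only; it assumes that record 123 exists and that '/tmp/thesis.pdf' is a readable local file):
    BibRecDocs(123).add_new_version('/tmp/thesis.pdf', docname='thesis')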
@param fullpath: the filesystem path of the file to be added. @type fullpath: string @param docname: the document name. If not specified it will be extracted from C{fullpath} (see L{decompose_file}). @type docname: string @param description: an optional description for the file. @type description: string @param comment: an optional comment to the file. @type comment: string @param format: the extension of the file. If not specified it will be guessed (see L{guess_format_from_url}). @type format: string @param flags: a set of flags to be associated with the file (see L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}) @type flags: list of string @return: the elaborated document object. @rtype: BibDoc @raise InvenioBibDocFileError: in case of error. @note: previous files associated with the same document will be considered obsolete. """ if docname is None: docname = decompose_file(fullpath)[1] if docformat is None: docformat = decompose_file(fullpath)[2] if flags is None: flags = [] if 'pdfa' in get_subformat_from_format(docformat).split(';') and not 'PDF/A' in flags: flags.append('PDF/A') bibdoc = self.get_bibdoc(docname=docname) bibdoc.add_file_new_version(fullpath, description=description, comment=comment, docformat=docformat, flags=flags) return bibdoc def add_new_format(self, fullpath, docname=None, description=None, comment=None, docformat=None, flags=None, modification_date=None): """ Adds a new file to an already existent document object as a new format. @param fullpath: the filesystem path of the file to be added. @type fullpath: string @param docname: the document name. If not specified it will be extracted from C{fullpath} (see L{decompose_file}). @type docname: string @param description: an optional description for the file. @type description: string @param comment: an optional comment to the file. @type comment: string @param format: the extension of the file. If not specified it will be guessed (see L{guess_format_from_url}). @type format: string @param flags: a set of flags to be associated with the file (see L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}) @type flags: list of string @return: the elaborated document object. @rtype: BibDoc @raise InvenioBibDocFileError: in case the same format already exists. """ if docname is None: docname = decompose_file(fullpath)[1] if docformat is None: docformat = decompose_file(fullpath)[2] if flags is None: flags = [] if 'pdfa' in get_subformat_from_format(docformat).split(';') and not 'PDF/A' in flags: flags.append('PDF/A') bibdoc = self.get_bibdoc(docname=docname) bibdoc.add_file_new_format(fullpath, description=description, comment=comment, docformat=docformat, flags=flags, modification_date=modification_date) return bibdoc def list_latest_files(self, doctype=None, list_hidden=True): """ Returns a list of the latest files. @param doctype: if set, only document of the given type will be listed. @type doctype: string @param list_hidden: if True, will list also files with the C{HIDDEN} flag being set. @type list_hidden: bool @return: the list of latest files. @rtype: list of BibDocFile """ docfiles = [] for bibdoc in self.list_bibdocs(doctype): docfiles += bibdoc.list_latest_files(list_hidden=list_hidden) return docfiles def fix(self, docname): """ Algorithm that transform a broken/old bibdoc into a coherent one. Think of it as being the fsck of BibDocs. - All the files in the bibdoc directory will be renamed according to the document name. Proper .recid, .type, .md5 files will be created/updated. 
- In case of more than one file with the same format version a new bibdoc will be created in order to put does files. @param docname: the document name that need to be fixed. @type docname: string @return: the list of newly created bibdocs if any. @rtype: list of BibDoc @raise InvenioBibDocFileError: in case of issues that can not be fixed automatically. """ bibdoc = self.get_bibdoc(docname) versions = {} res = [] new_bibdocs = [] # List of files with the same version/format of # existing file which need new bibdoc. counter = 0 zero_version_bug = False if os.path.exists(bibdoc.basedir): from invenio.config import CFG_CERN_SITE, CFG_INSPIRE_SITE, CFG_BIBDOCFILE_AFS_VOLUME_PATTERN, CFG_BIBDOCFILE_AFS_VOLUME_QUOTA if os.path.realpath(bibdoc.basedir).startswith('/afs') and (CFG_CERN_SITE or CFG_INSPIRE_SITE): ## We are on AFS at CERN! Let's allocate directories the CERN/AFS way. E.g. ## $ afs_admin create -q 1000000 /afs/cern.ch/project/cds/files/g40 p.cds.g40 ## NOTE: This might be extended to use low-level OpenAFS CLI tools ## so that this technique could be extended to other AFS users outside CERN. mount_point = os.path.dirname(os.path.realpath(bibdoc.basedir)) if not os.path.exists(mount_point): volume = CFG_BIBDOCFILE_AFS_VOLUME_PATTERN % os.path.basename(mount_point) quota = str(CFG_BIBDOCFILE_AFS_VOLUME_QUOTA) exit_code, stdout, stderr = run_shell_command("afs_admin create -q %s %s %s", (quota, mount_point, volume)) if exit_code or stderr: raise IOError("Error in creating AFS mount point %s with quota %s and volume %s: exit_code=%s. Captured stdout:\n: %s\nCaptured stderr:\n: %s" % (mount_point, quota, volume, exit_code, stdout, stderr)) for filename in os.listdir(bibdoc.basedir): if filename[0] != '.' and ';' in filename: name, version = filename.rsplit(';', 1) try: version = int(version) except ValueError: # Strange name register_exception() raise InvenioBibDocFileError, "A file called %s exists under %s. This is not a valid name. After the ';' there must be an integer representing the file version. Please, manually fix this file either by renaming or by deleting it." % (filename, bibdoc.basedir) if version == 0: zero_version_bug = True docformat = name[len(file_strip_ext(name)):] docformat = normalize_format(docformat) if not versions.has_key(version): versions[version] = {} new_name = 'FIXING-%s-%s' % (str(counter), name) try: shutil.move('%s/%s' % (bibdoc.basedir, filename), '%s/%s' % (bibdoc.basedir, new_name)) except Exception, e: register_exception() raise InvenioBibDocFileError, "Error in renaming '%s' to '%s': '%s'" % ('%s/%s' % (bibdoc.basedir, filename), '%s/%s' % (bibdoc.basedir, new_name), e) if versions[version].has_key(docformat): new_bibdocs.append((new_name, version)) else: versions[version][docformat] = new_name counter += 1 elif filename[0] != '.': # Strange name register_exception() raise InvenioBibDocFileError, "A file called %s exists under %s. This is not a valid name. There should be a ';' followed by an integer representing the file version. Please, manually fix this file either by renaming or by deleting it." 
% (filename, bibdoc.basedir) else: # we create the corresponding storage directory old_umask = os.umask(022) os.makedirs(bibdoc.basedir) # and save the father record id if it exists try: if self.id != "": recid_fd = open("%s/.recid" % bibdoc.basedir, "w") recid_fd.write(str(self.id)) recid_fd.close() if bibdoc.doctype != "": type_fd = open("%s/.type" % bibdoc.basedir, "w") type_fd.write(str(bibdoc.doctype)) type_fd.close() except Exception, e: register_exception() raise InvenioBibDocFileError, e os.umask(old_umask) if not versions: bibdoc.delete() self.dirty = True else: for version, formats in versions.iteritems(): if zero_version_bug: version += 1 for docformat, filename in formats.iteritems(): destination = '%s%s;%i' % (docname, docformat, version) try: shutil.move('%s/%s' % (bibdoc.basedir, filename), '%s/%s' % (bibdoc.basedir, destination)) except Exception, e: register_exception() raise InvenioBibDocFileError, "Error in renaming '%s' to '%s': '%s'" % ('%s/%s' % (bibdoc.basedir, filename), '%s/%s' % (bibdoc.basedir, destination), e) try: recid_fd = open("%s/.recid" % bibdoc.basedir, "w") recid_fd.write(str(self.id)) recid_fd.close() type_fd = open("%s/.type" % bibdoc.basedir, "w") type_fd.write(str(bibdoc.doctype)) type_fd.close() except Exception, e: register_exception() raise InvenioBibDocFileError, "Error in creating .recid and .type file for '%s' folder: '%s'" % (bibdoc.basedir, e) res = [] for (filename, version) in new_bibdocs: if zero_version_bug: version += 1 new_bibdoc = self.add_bibdoc(doctype=bibdoc.doctype, docname=docname, never_fail=True) new_bibdoc.add_file_new_format('%s/%s' % (bibdoc.basedir, filename), version) res.append(new_bibdoc) try: os.remove('%s/%s' % (bibdoc.basedir, filename)) except Exception, e: register_exception() raise InvenioBibDocFileError, "Error in removing '%s': '%s'" % ('%s/%s' % (bibdoc.basedir, filename), e) Md5Folder(bibdoc.basedir).update(only_new=False) bibdoc._build_file_list() for (bibdoc, dummyatttype) in self.bibdocs.values(): if not run_sql('SELECT data_value FROM bibdocmoreinfo WHERE id_bibdoc=%s', (bibdoc.id,)): ## Import from MARC only if the bibdoc has never had ## its more_info initialized. try: bibdoc.import_descriptions_and_comments_from_marc() except Exception, e: register_exception() raise InvenioBibDocFileError, "Error in importing description and comment from %s for record %s: %s" % (repr(bibdoc), self.id, e) return res def check_format(self, docname): """ Check for any format related issue. In case L{CFG_BIBDOCFILE_ADDITIONAL_KNOWN_FILE_EXTENSIONS} is altered or Python version changes, it might happen that a docname contains files which are no more docname + .format ; version, simply because the .format is now recognized (and it was not before, so it was contained into the docname). This algorithm verify if it is necessary to fix (seel L{fix_format}). @param docname: the document name whose formats should be verified. @type docname: string @return: True if format is correct. False if a fix is needed. @rtype: bool @raise InvenioBibDocFileError: in case of any error. 
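@note: for instance (illustrative), a legacy docname such as 'foo.tar.gz', created before '.tar.gz' was recognized as an extension, makes this method return False, since L{decompose_file} now splits it into 'foo' + '.tar.gz'.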
""" bibdoc = self.get_bibdoc(docname) correct_docname = decompose_file(docname + '.pdf')[1] if docname != correct_docname: return False for filename in os.listdir(bibdoc.basedir): if not filename.startswith('.'): try: dummy, dummy, docformat, version = decompose_file_with_version(filename) except Exception: raise InvenioBibDocFileError('Incorrect filename "%s" for docname %s for recid %i' % (filename, docname, self.id)) if '%s%s;%i' % (correct_docname, docformat, version) != filename: return False return True def check_duplicate_docnames(self): """ Check wethever the record is connected with at least tho documents with the same name. @return: True if everything is fine. @rtype: bool """ docnames = set() for docname in self.get_bibdoc_names(): if docname in docnames: return False else: docnames.add(docname) return True def uniformize_bibdoc(self, docname): """ This algorithm correct wrong file name belonging to a bibdoc. @param docname: the document name whose formats should be verified. @type docname: string """ bibdoc = self.get_bibdoc(docname) for filename in os.listdir(bibdoc.basedir): if not filename.startswith('.'): try: dummy, dummy, docformat, version = decompose_file_with_version(filename) except ValueError: register_exception(alert_admin=True, prefix= "Strange file '%s' is stored in %s" % (filename, bibdoc.basedir)) else: os.rename(os.path.join(bibdoc.basedir, filename), os.path.join(bibdoc.basedir, '%s%s;%i' % (docname, docformat, version))) Md5Folder(bibdoc.basedir).update() bibdoc.touch('rename') def fix_format(self, docname, skip_check=False): """ Fixes format related inconsistencies. @param docname: the document name whose formats should be verified. @type docname: string @param skip_check: if True assume L{check_format} has already been called and the need for fix has already been found. If False, will implicitly call L{check_format} and skip fixing if no error is found. @type skip_check: bool @return: in case merging two bibdocs is needed but it's not possible. @rtype: bool """ if not skip_check: if self.check_format(docname): return True bibdoc = self.get_bibdoc(docname) correct_docname = decompose_file(docname + '.pdf')[1] need_merge = False if correct_docname != docname: need_merge = self.has_docname_p(correct_docname) if need_merge: proposed_docname = self.propose_unique_docname(correct_docname) run_sql('UPDATE bibdoc SET docname=%s WHERE id=%s', (proposed_docname, bibdoc.id)) self.dirty = True self.uniformize_bibdoc(proposed_docname) try: self.merge_bibdocs(docname, proposed_docname) except InvenioBibDocFileError: return False else: run_sql('UPDATE bibdoc SET docname=%s WHERE id=%s', (correct_docname, bibdoc.id)) self.dirty = True self.uniformize_bibdoc(correct_docname) else: self.uniformize_bibdoc(docname) return True def fix_duplicate_docnames(self, skip_check=False): """ Algotirthm to fix duplicate docnames. If a record is connected with at least two bibdoc having the same docname, the algorithm will try to merge them. @param skip_check: if True assume L{check_duplicate_docnames} has already been called and the need for fix has already been found. If False, will implicitly call L{check_duplicate_docnames} and skip fixing if no error is found. 
@type skip_check: bool """ if not skip_check: if self.check_duplicate_docnames(): return docnames = set() for bibdoc in self.list_bibdocs(): docname = self.get_docname(bibdoc.id) if docname in docnames: new_docname = self.propose_unique_docname(self.get_docname(bibdoc.id)) self.change_name(docid=bibdoc.id, newname=new_docname) self.merge_bibdocs(docname, new_docname) docnames.add(docname) def get_text(self, extract_text_if_necessary=True): """ @return: concatenated texts of all bibdocs separated by " ": string """ texts = [] for bibdoc in self.list_bibdocs(): if hasattr(bibdoc, 'has_text'): if extract_text_if_necessary and not bibdoc.has_text(require_up_to_date=True): perform_ocr = hasattr(bibdoc, 'is_ocr_required') and bibdoc.is_ocr_required() from invenio.bibtask import write_message write_message("... will extract words from %s %s" % (bibdoc, perform_ocr and 'with OCR' or ''), verbose=2) bibdoc.extract_text(perform_ocr=perform_ocr) texts.append(bibdoc.get_text()) return " ".join(texts) def stream_archive_of_latest_files(self, req, files_size=''): """ Streams the tar archive with all files of a certain file size (that are not restricted or hidden) to the user. File size should be a string that can be compared with the output of BibDocFile.get_subformat() function. @param req: Apache Request Object @type req: Apache Request Object @param files_size: size of the files (they can be defined in bibdocfile_config). Empty string means the original size. @type files_size: string """ # Get the internal size from the user-friendly file size name internal_format = [f[1] for f in CFG_BIBDOCFILE_STREAM_ARCHIVE_FORMATS if f[0] == files_size] if len(internal_format) < 1: # Incorrect file size return internal_format = internal_format[0] tarname = str(self.id) + "_" + files_size + '.tar' # Select files that user can download (not hidden nor restricted) user_info = collect_user_info(req) req.content_type = "application/x-tar" req.headers_out["Content-Disposition"] = 'attachment; filename="%s"' % tarname tar = tarfile.open(fileobj=req, mode='w|') for f in self.list_latest_files(): if f.get_subformat() == internal_format and f.is_restricted(user_info)[0] == 0 and not f.hidden: tar.add(f.get_path(), arcname=f.get_full_name(), recursive=False) tar.close() class BibDoc(object): """ This class represents one document (i.e. a set of files with different formats and with versioning information that consitutes a piece of information. To instanciate a new document, the recid and the docname are mandatory. To instanciate an already existing document, either the recid and docname or the docid alone are sufficient to retrieve it. @param docid: the document identifier. @type docid: integer @param recid: the record identifier of the record to which this document belongs to. If the C{docid} is specified the C{recid} is automatically retrieven from the database. @type recid: integer @param docname: the document name. @type docname: string @param doctype: the document type (used when instanciating a new document). @type doctype: string @param human_readable: whether sizes should be represented in a human readable format. @type human_readable: bool @raise InvenioBibDocFileError: in case of error. 
""" @staticmethod def create_new_document(doc_type="Main", rec_links=None): if rec_links is None: rec_links = [] status = '' doc_id = run_sql("INSERT INTO bibdoc (status, creation_date, modification_date, doctype) " "values(%s,NOW(),NOW(), %s)", (status, doc_type)) if not doc_id: raise InvenioBibDocFileError, "New docid cannot be created" # creating the representation on disk ... preparing the directory try: BibDoc.prepare_basedir(doc_id) except Exception, e: run_sql('DELETE FROM bibdoc WHERE id=%s', (doc_id, )) register_exception(alert_admin=True) raise InvenioBibDocFileError, e # the object has been created: linking to bibliographical records doc = BibDoc(doc_id) for link in rec_links: if "rec_id" in link and link["rec_id"]: rec_id = link["rec_id"] doc_name = normalize_docname(link["doc_name"]) a_type = link["a_type"] doc.attach_to_record(rec_id, str(a_type), str(doc_name)) return doc_id def __init__(self, docid, human_readable=False, initial_data=None): """Constructor of a bibdoc. At least the docid or the recid/docname pair is needed. specifying recid, docname and doctype without specifying docid results in attaching newly created document to a record """ # docid is known, the document already exists res2 = run_sql("SELECT id_bibrec, type, docname FROM bibrec_bibdoc WHERE id_bibdoc=%s", (docid,)) self.bibrec_types = [(r[0], r[1], r[2]) for r in res2 ] # just in case the result was behaving like tuples but was something else if not res2: # fake attachment self.bibrec_types = [(0, None, "fake_name_for_unattached_document")] if initial_data is None: initial_data = BibDoc._retrieve_data(docid) self._docfiles = [] self.__md5s = None self._related_files = {} self.human_readable = human_readable self.cd = initial_data["cd"] # creation date self.md = initial_data["md"] # modification date self.td = initial_data["td"] # text extraction date # should be moved from here !!!! self.bibrec_links = initial_data["bibrec_links"] self.id = initial_data["id"] self.status = initial_data["status"] self.basedir = initial_data["basedir"] self.doctype = initial_data["doctype"] self.storagename = initial_data["storagename"] # the old docname -> now used as a storage name for old records self.more_info = BibDocMoreInfo(self.id) self.dirty = True self.dirty_related_files = True self.last_action = 'init' def __del__(self): if self.dirty and self.last_action != 'init': ## The object is dirty and we did something more than initializing it self._build_file_list() @property def docfiles(self): if self.dirty: self._build_file_list(self.last_action) self.dirty = False return self._docfiles @property def related_files(self): if self.dirty_related_files: self._build_related_file_list() self.dirty_related_files = False return self._related_files @staticmethod def prepare_basedir(doc_id): """Prepares the directory serving as root of a BibDoc""" basedir = _make_base_dir(doc_id) # we create the corresponding storage directory if not os.path.exists(basedir): from invenio.config import CFG_CERN_SITE, CFG_INSPIRE_SITE, CFG_BIBDOCFILE_AFS_VOLUME_PATTERN, CFG_BIBDOCFILE_AFS_VOLUME_QUOTA if os.path.realpath(basedir).startswith('/afs') and (CFG_CERN_SITE or CFG_INSPIRE_SITE): ## We are on AFS at CERN! Let's allocate directories the CERN/AFS way. E.g. ## $ afs_admin create -q 1000000 /afs/cern.ch/project/cds/files/g40 p.cds.g40 ## NOTE: This might be extended to use low-level OpenAFS CLI tools ## so that this technique could be extended to other AFS users outside CERN. 
mount_point = os.path.dirname(os.path.realpath(basedir)) if not os.path.exists(mount_point): volume = CFG_BIBDOCFILE_AFS_VOLUME_PATTERN % os.path.basename(mount_point) quota = str(CFG_BIBDOCFILE_AFS_VOLUME_QUOTA) exit_code, stdout, stderr = run_shell_command("afs_admin create -q %s %s %s", (quota, mount_point, volume)) if exit_code or stderr: raise IOError("Error in creating AFS mount point %s with quota %s and volume %s: exit_code=%s. Captured stdout:\n: %s\nCaptured stderr:\n: %s" % (mount_point, quota, volume, exit_code, stdout, stderr)) old_umask = os.umask(022) os.makedirs(basedir) os.umask(old_umask) def _update_additional_info_files(self): """Update the hidden file in the document directory ... the file contains all links to records""" try: reclinks_fd = open("%s/.reclinks" % (self.basedir, ), "w") reclinks_fd.write("RECID DOCNAME TYPE\n") for link in self.bibrec_links: reclinks_fd.write("%(recid)s %(docname)s %(doctype)s\n" % link) reclinks_fd.close() except Exception, e: register_exception(alert_admin=True) raise InvenioBibDocFileError, e @staticmethod def _retrieve_data(docid = None): """ Filling information about a document from the database entry """ container = {} container["bibrec_links"] = [] container["id"] = docid container["basedir"] = _make_base_dir(container["id"]) # retrieving links betwen records and documents res = run_sql("SELECT id_bibrec, type, docname FROM bibrec_bibdoc WHERE id_bibdoc=%s", (str(docid),), 1) if res: for r in res: container["bibrec_links"].append({"recid": r[0], "doctype": r[1], "docname": r[2]}) # gather the other information res = run_sql("SELECT status, creation_date, modification_date, text_extraction_date, doctype, docname FROM bibdoc WHERE id=%s LIMIT 1", (docid,), 1) if res: container["status"] = res[0][0] container["cd"] = res[0][1] container["md"] = res[0][2] container["td"] = res[0][3] container["doctype"] = res[0][4] container["storagename"] = res[0][5] else: # this bibdoc doesn't exist raise InvenioBibDocFileError, "The docid %s does not exist." % docid # retreiving all available formats fprefix = container["storagename"] or "content" try: if CFG_BIBDOCFILE_ENABLE_BIBDOCFSINFO_CACHE: ## We take all extensions from the existing formats in the DB. container["extensions"] = set([ext[0] for ext in run_sql("SELECT format FROM bibdocfsinfo WHERE id_bibdoc=%s", (docid, ))]) else: ## We take all the extensions by listing the directory content, stripping name ## and version. 
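## For example, a file named 'content.pdf;2' stored under the default
## storage name 'content' contributes the extension '.pdf' to this set.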
container["extensions"] = set([fname[len(fprefix):].rsplit(";", 1)[0] for fname in filter(lambda x: x.startswith(fprefix), os.listdir(container["basedir"]))]) except OSError: container["extensions"] = [] register_exception() return container @staticmethod def create_instance(docid=None, recid=None, docname=None, doctype='Fulltext', a_type = '', human_readable=False): """ Parameters of an attachement to the record: a_type, recid, docname @param a_type Type of the attachment to the record (by default Main) @type a_type String @param doctype Type of the document itself (by default Fulltext) @type doctype String """ # first try to retrieve existing record based on obtained data data = None extensions = [] if docid is not None: data = BibDoc._retrieve_data(docid) doctype = data["doctype"] extensions = data["extensions"] # Loading an appropriate plugin (by default a generic BibDoc) used_plugin = None for dummy, plugin in get_plugins().iteritems(): if plugin['supports'](doctype, extensions): used_plugin = plugin if not a_type: a_type = doctype or 'Main' if not docid: rec_links = [] if recid: rec_links.append({"rec_id": recid, "doc_name" : docname, "a_type": a_type}) if used_plugin and 'create_new' in used_plugin: docid = used_plugin['create_new'](doctype, rec_links) else: docid = BibDoc.create_new_document(doctype, rec_links) if used_plugin: return used_plugin['create_instance'](docid=docid, human_readable=human_readable, initial_data=data) return BibDoc(docid=docid, human_readable=human_readable, initial_data=data) def attach_to_record(self, recid, a_type, docname): """ Attaches given document to a record given by its identifier. @param recid The identifier of the record @type recid Integer @param a_type Function of a document in the record @type a_type String @param docname Name of a document inside of a record @type docname String """ run_sql("INSERT INTO bibrec_bibdoc (id_bibrec, id_bibdoc, type, docname) VALUES (%s,%s,%s,%s)", (str(recid), str(self.id), a_type, docname)) self._update_additional_info_files() def __repr__(self): """ @return: the canonical string representation of the C{BibDoc}. @rtype: string """ return 'BibDoc(%s, %s, %s)' % (repr(self.id), repr(self.doctype), repr(self.human_readable)) def format_recids(self): """Returns a string representation of related record ids""" if len(self.bibrec_links) == 1: return self.bibrec_links[0]["recid"] return "[" + ",".join([str(el["recid"]) for el in self.bibrec_links]) + "]" def __str__(self): """ @return: an easy to be I{grepped} string representation of the whole C{BibDoc} content. 
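Each line is prefixed with the related recid(s) and the docid, e.g. (illustrative): '123:456:::doctype=Main'.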
@rtype: string """ recids = self.format_recids() out = '%s:%i:::doctype=%s\n' % (recids, self.id, self.doctype) out += '%s:%i:::status=%s\n' % (recids, self.id, self.status) out += '%s:%i:::basedir=%s\n' % (recids, self.id, self.basedir) out += '%s:%i:::creation date=%s\n' % (recids, self.id, self.cd) out += '%s:%i:::modification date=%s\n' % (recids, self.id, self.md) out += '%s:%i:::text extraction date=%s\n' % (recids, self.id, self.td) out += '%s:%i:::total file attached=%s\n' % (recids, self.id, len(self.docfiles)) if self.human_readable: out += '%s:%i:::total size latest version=%s\n' % (recids, self.id, nice_size(self.get_total_size_latest_version())) out += '%s:%i:::total size all files=%s\n' % (recids, self.id, nice_size(self.get_total_size())) else: out += '%s:%i:::total size latest version=%s\n' % (recids, self.id, self.get_total_size_latest_version()) out += '%s:%i:::total size all files=%s\n' % (recids, self.id, self.get_total_size()) for docfile in self.docfiles: out += str(docfile) return out def get_md5s(self): """ @return: an instance of the Md5Folder class to access MD5 information of the current BibDoc @rtype: Md5Folder """ if self.__md5s is None: self.__md5s = Md5Folder(self.basedir) return self.__md5s md5s = property(get_md5s) def format_already_exists_p(self, docformat): """ @param format: a format to be checked. @type format: string @return: True if a file of the given format already exists among the latest files. @rtype: bool """ docformat = normalize_format(docformat) for afile in self.list_latest_files(): if docformat == afile.get_format(): return True return False def get_status(self): """ @return: the status information. @rtype: string """ return self.status @staticmethod def get_fileprefix(basedir, storagename=None): fname = "%s" % (storagename or "content", ) return os.path.join(basedir, fname ) def get_filepath(self, docformat, version): """ Generaters the path inside of the filesystem where the document should be stored. @param format The format of the document @type format string @param version version to be stored in the file @type version string TODO: this should be completely replaced. File storage (and so, also path building) should be abstracted from BibDoc and be using loadable extensions @param format Format of the document to be stored @type format string @param version Version of the document to be stored @type version String @return Full path to the file encoding a particular version and format of the document @trype string """ return "%s%s;%i" % (BibDoc.get_fileprefix(self.basedir, self.storagename), docformat, version) def get_docname(self): """Obsolete !! (will return empty String for new format documents""" return self.storagename def get_doctype(self, recid): """Retrieves the type of this document in the scope of a given recid""" link_types = [attachement["doctype"] for attachement in self.bibrec_links if str(attachement["recid"]) == str(recid)] if link_types: return link_types[0] return "" def touch(self, action=''): """ Update the modification time of the bibdoc (as in the UNIX command C{touch}). """ run_sql('UPDATE bibdoc SET modification_date=NOW() WHERE id=%s', (self.id, )) self.dirty = True self.last_action = action def change_doctype(self, new_doctype): """ Modify the doctype of a BibDoc """ run_sql('UPDATE bibdoc SET doctype=%s WHERE id=%s', (new_doctype, self.id)) run_sql('UPDATE bibrec_bibdoc SET type=%s WHERE id_bibdoc=%s', (new_doctype, self.id)) self.dirty = True def set_status(self, new_status): """ Set a new status. 
A document with a status information is a restricted document that can be accessed only to user which as an authorization to the I{viewrestrdoc} WebAccess action with keyword status with value C{new_status}. @param new_status: the new status. If empty the document will be unrestricted. @type new_status: string @raise InvenioBibDocFileError: in case the reserved word 'DELETED' is used. """ if new_status != KEEP_OLD_VALUE: if new_status == 'DELETED': raise InvenioBibDocFileError('DELETED is a reserved word and can not be used for setting the status') run_sql('UPDATE bibdoc SET status=%s WHERE id=%s', (new_status, self.id)) self.status = new_status self.touch('status') def add_file_new_version(self, filename, description=None, comment=None, docformat=None, flags=None, modification_date=None): """ Add a new version of a file. If no physical file is already attached to the document a the given file will have version 1. Otherwise the new file will have the current version number plus one. @param filename: the local path of the file. @type filename: string @param description: an optional description for the file. @type description: string @param comment: an optional comment to the file. @type comment: string @param format: the extension of the file. If not specified it will be retrieved from the filename (see L{decompose_file}). @type format: string @param flags: a set of flags to be associated with the file (see L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}) @type flags: list of string @raise InvenioBibDocFileError: in case of error. """ latestVersion = self.get_latest_version() if latestVersion == 0: myversion = 1 else: myversion = latestVersion + 1 if os.path.exists(filename): if not os.path.getsize(filename) > 0: raise InvenioBibDocFileError, "%s seems to be empty" % filename if docformat is None: docformat = decompose_file(filename)[2] else: docformat = normalize_format(docformat) destination = self.get_filepath(docformat, myversion) if run_sql("SELECT id_bibdoc FROM bibdocfsinfo WHERE id_bibdoc=%s AND version=%s AND format=%s", (self.id, myversion, docformat)): raise InvenioBibDocFileError("According to the database a file of format %s is already attached to the docid %s" % (docformat, self.id)) try: shutil.copyfile(filename, destination) os.chmod(destination, 0644) if modification_date: # if the modification time of the file needs to be changed update_modification_date_of_file(destination, modification_date) except Exception, e: register_exception() raise InvenioBibDocFileError("Encountered an exception while copying '%s' to '%s': '%s'" % (filename, destination, e)) self.more_info.set_description(description, docformat, myversion) self.more_info.set_comment(comment, docformat, myversion) if flags is None: flags = [] if 'pdfa' in get_subformat_from_format(docformat).split(';') and not 'PDF/A' in flags: flags.append('PDF/A') for flag in flags: if flag == 'PERFORM_HIDE_PREVIOUS': for afile in self.list_all_files(): docformat = afile.get_format() version = afile.get_version() if version < myversion: self.more_info.set_flag('HIDDEN', docformat, myversion) else: self.more_info.set_flag(flag, docformat, myversion) else: raise InvenioBibDocFileError("'%s' does not exists!" 
% filename) self.touch('newversion') Md5Folder(self.basedir).update() just_added_file = self.get_file(docformat, myversion) run_sql("INSERT INTO bibdocfsinfo(id_bibdoc, version, format, last_version, cd, md, checksum, filesize, mime) VALUES(%s, %s, %s, true, %s, %s, %s, %s, %s)", (self.id, myversion, docformat, just_added_file.cd, just_added_file.md, just_added_file.get_checksum(), just_added_file.get_size(), just_added_file.mime)) run_sql("UPDATE bibdocfsinfo SET last_version=false WHERE id_bibdoc=%s AND version<%s", (self.id, myversion)) def add_file_new_format(self, filename, version=None, description=None, comment=None, docformat=None, flags=None, modification_date=None): """ Add a file as a new format. @param filename: the local path of the file. @type filename: string @param version: an optional specific version to which the new format should be added. If None, the last version will be used. @type version: integer @param description: an optional description for the file. @type description: string @param comment: an optional comment to the file. @type comment: string @param format: the extension of the file. If not specified it will be retrieved from the filename (see L{decompose_file}). @type format: string @param flags: a set of flags to be associated with the file (see L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}) @type flags: list of string @raise InvenioBibDocFileError: if the given format already exists. """ if version is None: version = self.get_latest_version() if version == 0: version = 1 if os.path.exists(filename): if not os.path.getsize(filename) > 0: raise InvenioBibDocFileError, "%s seems to be empty" % filename if docformat is None: docformat = decompose_file(filename)[2] else: docformat = normalize_format(docformat) if run_sql("SELECT id_bibdoc FROM bibdocfsinfo WHERE id_bibdoc=%s AND version=%s AND format=%s", (self.id, version, docformat)): raise InvenioBibDocFileError("According to the database a file of format %s is already attached to the docid %s" % (docformat, self.id)) destination = self.get_filepath(docformat, version) if os.path.exists(destination): raise InvenioBibDocFileError, "A file for docid '%s' already exists for the format '%s'" % (str(self.id), docformat) try: shutil.copyfile(filename, destination) os.chmod(destination, 0644) if modification_date: # if the modification time of the file needs to be changed update_modification_date_of_file(destination, modification_date) except Exception, e: register_exception() raise InvenioBibDocFileError, "Encountered an exception while copying '%s' to '%s': '%s'" % (filename, destination, e) self.more_info.set_comment(comment, docformat, version) self.more_info.set_description(description, docformat, version) if flags is None: flags = [] if 'pdfa' in get_subformat_from_format(docformat).split(';') and not 'PDF/A' in flags: flags.append('PDF/A') for flag in flags: if flag != 'PERFORM_HIDE_PREVIOUS': self.more_info.set_flag(flag, docformat, version) else: raise InvenioBibDocFileError, "'%s' does not exists!" 
% filename Md5Folder(self.basedir).update() self.touch('newformat') just_added_file = self.get_file(docformat, version) run_sql("INSERT INTO bibdocfsinfo(id_bibdoc, version, format, last_version, cd, md, checksum, filesize, mime) VALUES(%s, %s, %s, true, %s, %s, %s, %s, %s)", (self.id, version, docformat, just_added_file.cd, just_added_file.md, just_added_file.get_checksum(), just_added_file.get_size(), just_added_file.mime)) def change_docformat(self, oldformat, newformat): """ Renames a format name on disk and in all BibDoc structures. The change will touch only the last version files. The change will take place only if the newformat doesn't already exist. @param oldformat: the format that needs to be renamed @type oldformat: string @param newformat: the format new name @type newformat: string """ oldformat = normalize_format(oldformat) newformat = normalize_format(newformat) if self.format_already_exists_p(newformat): # same format already exists in the latest files, abort return for bibdocfile in self.list_latest_files(): if bibdocfile.get_format() == oldformat: # change format -> rename x.oldformat -> x.newformat dirname, base, docformat, version = decompose_file_with_version(bibdocfile.get_full_path()) os.rename(bibdocfile.get_full_path(), os.path.join(dirname, '%s%s;%i' %(base, newformat, version))) Md5Folder(self.basedir).update() self.touch('rename') self._sync_to_db() return def purge(self): """ Physically removes all the previous version of the given bibdoc. Everything but the last formats will be erased. """ version = self.get_latest_version() if version > 1: for afile in self.docfiles: if afile.get_version() < version: self.more_info.unset_comment(afile.get_format(), afile.get_version()) self.more_info.unset_description(afile.get_format(), afile.get_version()) for flag in CFG_BIBDOCFILE_AVAILABLE_FLAGS: self.more_info.unset_flag(flag, afile.get_format(), afile.get_version()) try: os.remove(afile.get_full_path()) except Exception, dummy: register_exception() Md5Folder(self.basedir).update() self.touch('purge') run_sql("DELETE FROM bibdocfsinfo WHERE id_bibdoc=%s AND version<%s", (self.id, version)) def expunge(self): """ Physically remove all the traces of a given document. @note: an expunged BibDoc object shouldn't be used anymore or the result might be unpredicted. """ del self.__md5s self.more_info.delete() del self.more_info os.system('rm -rf %s' % escape_shell_arg(self.basedir)) run_sql('DELETE FROM bibrec_bibdoc WHERE id_bibdoc=%s', (self.id, )) run_sql('DELETE FROM bibdoc_bibdoc WHERE id_bibdoc1=%s OR id_bibdoc2=%s', (self.id, self.id)) run_sql('DELETE FROM bibdoc WHERE id=%s', (self.id, )) run_sql('INSERT INTO hstDOCUMENT(action, id_bibdoc, doctimestamp) VALUES("EXPUNGE", %s, NOW())', (self.id, )) run_sql('DELETE FROM bibdocfsinfo WHERE id_bibdoc=%s', (self.id, )) del self._docfiles del self.id del self.cd del self.md del self.td del self.basedir del self.doctype del self.bibrec_links def revert(self, version): """ Revert the document to a given version. All the formats corresponding to that version are copied forward to a new version. @param version: the version to revert to. 
@type version: integer @raise InvenioBibDocFileError: in case of errors """ version = int(version) docfiles = self.list_version_files(version) if docfiles: self.add_file_new_version(docfiles[0].get_full_path(), description=docfiles[0].get_description(), comment=docfiles[0].get_comment(), docformat=docfiles[0].get_format(), flags=docfiles[0].flags) for docfile in docfiles[1:]: self.add_file_new_format(docfile.filename, description=docfile.get_description(), comment=docfile.get_comment(), docformat=docfile.get_format(), flags=docfile.flags) def import_descriptions_and_comments_from_marc(self, record=None): """ Import descriptions and comments from the corresponding MARC metadata. @param record: the record (if None it will be calculated). @type record: bibrecord recstruct @note: If record is passed it is directly used, otherwise it is retrieved from the MARCXML stored in the database. """ ## Let's get the record from invenio.search_engine import get_record if record is None: record = get_record(self.id) fields = record_get_field_instances(record, '856', '4', ' ') global_comment = None global_description = None local_comment = {} local_description = {} for field in fields: url = field_get_subfield_values(field, 'u') if url: ## Given a url url = url[0] if re.match('%s/%s/[0-9]+/files/' % (CFG_SITE_URL, CFG_SITE_RECORD), url): ## If it is a traditional /CFG_SITE_RECORD/1/files/ one ## We have global description/comment for all the formats description = field_get_subfield_values(field, 'y') if description: global_description = description[0] comment = field_get_subfield_values(field, 'z') if comment: global_comment = comment[0] elif bibdocfile_url_p(url): ## Otherwise we have description/comment per format dummy, docname, docformat = decompose_bibdocfile_url(url) brd = BibRecDocs(self.id) if docname == brd.get_docname(self.id): description = field_get_subfield_values(field, 'y') if description: local_description[docformat] = description[0] comment = field_get_subfield_values(field, 'z') if comment: local_comment[docformat] = comment[0] ## Let's update the tables version = self.get_latest_version() for docfile in self.list_latest_files(): docformat = docfile.get_format() if docformat in local_comment: self.set_comment(local_comment[docformat], docformat, version) else: self.set_comment(global_comment, docformat, version) if docformat in local_description: self.set_description(local_description[docformat], docformat, version) else: self.set_description(global_description, docformat, version) self.dirty = True def get_icon(self, subformat_re=CFG_BIBDOCFILE_ICON_SUBFORMAT_RE, display_hidden=True): """ @param subformat_re: by default the convention is that L{CFG_BIBDOCFILE_ICON_SUBFORMAT_RE} is used as a subformat indicator to mean that a particular format is to be used as an icon. Specifiy a different subformat if you need to use a different convention. @type subformat_re: compiled regular expression @return: the bibdocfile corresponding to CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT or, if this does not exist, the smallest size icon of this document, or None if no icon exists for this document. @rtype: BibDocFile @warning: before I{subformat} were introduced this method was returning a BibDoc, while now is returning a BibDocFile. Check if your client code is compatible with this. 
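Typical use (illustrative sketch; assumes an icon is actually attached to the document):
    icon = bibdoc.get_icon()
    icon_url = icon.get_url() if icon else None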
""" icons = [] for docfile in self.list_latest_files(list_hidden=display_hidden): subformat = docfile.get_subformat() if subformat.lower() == CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT.lower(): # If it's the default icon subformat, return it return docfile if subformat_re.match(subformat): icons.append((docfile.get_size(), docfile)) if icons: # Sort by size, retrieve the smallest one icons.sort() return icons[0][1] return None def add_icon(self, filename, docformat=None, subformat=CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT, modification_date=None): """ Attaches icon to this document. @param filename: the local filesystem path to the icon. @type filename: string @param format: an optional format for the icon. If not specified it will be calculated after the filesystem path. @type format: string @param subformat: by default the convention is that CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT is used as a subformat indicator to mean that a particular format is to be used as an icon. Specifiy a different subformat if you need to use a different convention. @type subformat: string @raise InvenioBibDocFileError: in case of errors. """ #first check if an icon already exists if not docformat: docformat = decompose_file(filename)[2] if subformat: docformat += ";%s" % subformat self.add_file_new_format(filename, docformat=docformat, modification_date=modification_date) def delete_icon(self, subformat_re=CFG_BIBDOCFILE_ICON_SUBFORMAT_RE): """ @param subformat_re: by default the convention is that L{CFG_BIBDOCFILE_ICON_SUBFORMAT_RE} is used as a subformat indicator to mean that a particular format is to be used as an icon. Specifiy a different subformat if you need to use a different convention. @type subformat: compiled regular expression Removes the icon attached to the document if it exists. """ for docfile in self.list_latest_files(): if subformat_re.match(docfile.get_subformat()): self.delete_file(docfile.get_format(), docfile.get_version()) def change_name(self, recid, newname): """ Renames this document in connection with a given record. @param newname: the new name. @type newname: string @raise InvenioBibDocFileError: if the new name corresponds to a document already attached to the record owning this document or if the name was not changed. """ newname = normalize_docname(newname) res = run_sql("SELECT id_bibdoc FROM bibrec_bibdoc WHERE id_bibrec=%s AND docname=%s", (recid, newname)) if res: raise InvenioBibDocFileError("A bibdoc called %s already exists for recid %s" % (newname, recid)) updated = run_sql("update bibrec_bibdoc set docname=%s where id_bibdoc=%s and id_bibrec=%s", (newname, self.id, recid)) if not updated: raise InvenioBibDocFileError("Docname for bibdoc %s in record %s was not changed" % (self.id, recid)) # docid is known, the document already exists res2 = run_sql("SELECT id_bibrec, type, docname FROM bibrec_bibdoc WHERE id_bibdoc=%s", (self.id,)) ## Refreshing names and types. self.bibrec_types = [(r[0], r[1], r[2]) for r in res2 ] # just in case the result was behaving like tuples but was something else if not res2: # fake attachment self.bibrec_types = [(0, None, "fake_name_for_unattached_document")] self.touch('rename') def set_comment(self, comment, docformat, version=None): """ Updates the comment of a specific format/version of the document. @param comment: the new comment. @type comment: string @param format: the specific format for which the comment should be updated. @type format: string @param version: the specific version for which the comment should be updated. 
If not specified the last version will be used. @type version: integer """ if version is None: version = self.get_latest_version() docformat = normalize_format(docformat) self.more_info.set_comment(comment, docformat, version) self.dirty = True def set_description(self, description, docformat, version=None): """ Updates the description of a specific format/version of the document. @param description: the new description. @type description: string @param format: the specific format for which the description should be updated. @type format: string @param version: the specific version for which the description should be updated. If not specified the last version will be used. @type version: integer """ if version is None: version = self.get_latest_version() docformat = normalize_format(docformat) self.more_info.set_description(description, docformat, version) self.dirty = True def set_flag(self, flagname, docformat, version=None): """ Sets a flag for a specific format/version of the document. @param flagname: a flag from L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}. @type flagname: string @param format: the specific format for which the flag should be set. @type format: string @param version: the specific version for which the flag should be set. If not specified the last version will be used. @type version: integer """ if version is None: version = self.get_latest_version() docformat = normalize_format(docformat) self.more_info.set_flag(flagname, docformat, version) self.dirty = True def has_flag(self, flagname, docformat, version=None): """ Checks if a particular flag for a format/version is set. @param flagname: a flag from L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}. @type flagname: string @param format: the specific format for which the flag should be set. @type format: string @param version: the specific version for which the flag should be set. If not specified the last version will be used. @type version: integer @return: True if the flag is set. @rtype: bool """ if version is None: version = self.get_latest_version() docformat = normalize_format(docformat) return self.more_info.has_flag(flagname, docformat, version) def unset_flag(self, flagname, docformat, version=None): """ Unsets a flag for a specific format/version of the document. @param flagname: a flag from L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}. @type flagname: string @param format: the specific format for which the flag should be unset. @type format: string @param version: the specific version for which the flag should be unset. If not specified the last version will be used. @type version: integer """ if version is None: version = self.get_latest_version() docformat = normalize_format(docformat) self.more_info.unset_flag(flagname, docformat, version) self.dirty = True def get_comment(self, docformat, version=None): """ Retrieve the comment of a specific format/version of the document. @param format: the specific format for which the comment should be retrieved. @type format: string @param version: the specific version for which the comment should be retrieved. If not specified the last version will be used. @type version: integer @return: the comment. @rtype: string """ if version is None: version = self.get_latest_version() docformat = normalize_format(docformat) return self.more_info.get_comment(docformat, version) def get_description(self, docformat, version=None): """ Retrieve the description of a specific format/version of the document. @param format: the specific format for which the description should be retrieved. 
@type format: string @param version: the specific version for which the description should be retrieved. If not specified the last version will be used. @type version: integer @return: the description. @rtype: string """ if version is None: version = self.get_latest_version() docformat = normalize_format(docformat) return self.more_info.get_description(docformat, version) def hidden_p(self, docformat, version=None): """ Returns True if the file specified by the given format/version is hidden. @param format: the specific format for which the description should be retrieved. @type format: string @param version: the specific version for which the description should be retrieved. If not specified the last version will be used. @type version: integer @return: True if hidden. @rtype: bool """ if version is None: version = self.get_latest_version() return self.more_info.has_flag('HIDDEN', docformat, version) def get_base_dir(self): """ @return: the base directory on the local filesystem for this document (e.g. C{/soft/cdsweb/var/data/files/g0/123}) @rtype: string """ return self.basedir def get_type(self): """ @return: the type of this document. @rtype: string""" return self.doctype def get_id(self): """ @return: the id of this document. @rtype: integer """ return self.id def get_file(self, docformat, version="", exact_docformat=False): """ Returns a L{BibDocFile} instance of this document corresponding to the specific format and version. @param format: the specific format. @type format: string @param version: the specific version for which the description should be retrieved. If not specified the last version will be used. @type version: integer @param exact_docformat: if True, consider always the complete docformat (including subformat if any) @type exact_docformat: bool @return: the L{BibDocFile} instance. @rtype: BibDocFile """ if version == "": docfiles = self.list_latest_files() else: version = int(version) docfiles = self.list_version_files(version) docformat = normalize_format(docformat) for docfile in docfiles: if (docfile.get_format() == docformat or not docformat): return docfile ## Let's skip the subformat specification and consider just the ## superformat if not exact_docformat: superformat = get_superformat_from_format(docformat) for docfile in docfiles: if get_superformat_from_format(docfile.get_format()) == superformat: return docfile raise InvenioBibDocFileError("No file for doc %i of format '%s', version '%s'" % (self.id, docformat, version)) def list_versions(self): """ @return: the list of existing version numbers for this document. @rtype: list of integer """ versions = [] for docfile in self.docfiles: if not docfile.get_version() in versions: versions.append(docfile.get_version()) versions.sort() return versions def delete(self, recid=None): """ Delete this document. @see: L{undelete} for how to undelete the document. @raise InvenioBibDocFileError: in case of errors. 
""" try: today = datetime.today() recids = [] if recid: recids = [recid] else: recids = [link["recid"] for link in self.bibrec_links] for rid in recids: brd = BibRecDocs(rid) docname = brd.get_docname(self.id) # if the document is attached to some records brd.change_name(docid=self.id, newname = 'DELETED-%s%s-%s' % (today.strftime('%Y%m%d%H%M%S'), today.microsecond, docname)) run_sql("UPDATE bibdoc SET status='DELETED' WHERE id=%s", (self.id,)) self.status = 'DELETED' except Exception, e: register_exception(alert_admin=True) raise InvenioBibDocFileError, "It's impossible to delete bibdoc %s: %s" % (self.id, e) def deleted_p(self): """ @return: True if this document has been deleted. @rtype: bool """ return self.status == 'DELETED' def empty_p(self): """ @return: True if this document is empty, i.e. it has no bibdocfile connected. @rtype: bool """ return len(self.docfiles) == 0 def undelete(self, previous_status='', recid=None): """ Undelete a deleted file (only if it was actually deleted via L{delete}). The previous C{status}, i.e. the restriction key can be provided. Otherwise the undeleted document will be public. @param previous_status: the previous status the should be restored. @type previous_status: string @raise InvenioBibDocFileError: in case of any error. """ try: run_sql("UPDATE bibdoc SET status=%s WHERE id=%s AND status='DELETED'", (previous_status, self.id)) except Exception, e: raise InvenioBibDocFileError, "It's impossible to undelete bibdoc %s: %s" % (self.id, e) if recid: bibrecdocs = BibRecDocs(recid) docname = bibrecdocs.get_docname(self.id) if docname.startswith('DELETED-'): try: # Let's remove DELETED-20080214144322- in front of the docname original_name = '-'.join(docname.split('-')[2:]) original_name = bibrecdocs.propose_unique_docname(original_name) bibrecdocs.change_name(docid=self.id, newname=original_name) except Exception, e: raise InvenioBibDocFileError, "It's impossible to restore the previous docname %s. %s kept as docname because: %s" % (original_name, docname, e) else: raise InvenioBibDocFileError, "Strange just undeleted docname isn't called DELETED-somedate-docname but %s" % docname def delete_file(self, docformat, version): """ Delete a specific format/version of this document on the filesystem. @param format: the particular format to be deleted. @type format: string @param version: the particular version to be deleted. @type version: integer @note: this operation is not reversible!""" try: afile = self.get_file(docformat, version) except InvenioBibDocFileError: return try: os.remove(afile.get_full_path()) run_sql("DELETE FROM bibdocfsinfo WHERE id_bibdoc=%s AND version=%s AND format=%s", (self.id, afile.get_version(), afile.get_format())) last_version = run_sql("SELECT max(version) FROM bibdocfsinfo WHERE id_bibdoc=%s", (self.id, ))[0][0] if last_version: ## Updating information about last version run_sql("UPDATE bibdocfsinfo SET last_version=true WHERE id_bibdoc=%s AND version=%s", (self.id, last_version)) run_sql("UPDATE bibdocfsinfo SET last_version=false WHERE id_bibdoc=%s AND version<>%s", (self.id, last_version)) except OSError: pass self.touch('delete') def get_history(self): """ @return: a human readable and parsable string that represent the history of this document. 
@rtype: string """ ret = [] hst = run_sql("""SELECT action, docname, docformat, docversion, docsize, docchecksum, doctimestamp FROM hstDOCUMENT WHERE id_bibdoc=%s ORDER BY doctimestamp ASC""", (self.id, )) for row in hst: ret.append("%s %s '%s', format: '%s', version: %i, size: %s, checksum: '%s'" % (row[6].strftime('%Y-%m-%d %H:%M:%S'), row[0], row[1], row[2], row[3], nice_size(row[4]), row[5])) return ret def _build_file_list(self, context=''): """ Lists all files attached to the bibdoc. This function should be called everytime the bibdoc is modified. As a side effect it log everything that has happened to the bibdocfiles in the log facility, according to the context: "init": means that the function has been called; for the first time by a constructor, hence no logging is performed "": by default means to log every deleted file as deleted and every added file as added; "rename": means that every appearently deleted file is logged as renamef and every new file as renamet. """ def log_action(action, docid, docname, docformat, version, size, checksum, timestamp=''): """Log an action into the bibdoclog table.""" try: if timestamp: run_sql('INSERT INTO hstDOCUMENT(action, id_bibdoc, docname, docformat, docversion, docsize, docchecksum, doctimestamp) VALUES(%s, %s, %s, %s, %s, %s, %s, %s)', (action, docid, docname, docformat, version, size, checksum, timestamp)) else: run_sql('INSERT INTO hstDOCUMENT(action, id_bibdoc, docname, docformat, docversion, docsize, docchecksum, doctimestamp) VALUES(%s, %s, %s, %s, %s, %s, %s, NOW())', (action, docid, docname, docformat, version, size, checksum)) except DatabaseError: register_exception() def make_removed_added_bibdocfiles(previous_file_list): """Internal function for build the log of changed files.""" # Let's rebuild the previous situation old_files = {} for bibdocfile in previous_file_list: old_files[(bibdocfile.name, bibdocfile.format, bibdocfile.version)] = (bibdocfile.size, bibdocfile.checksum, bibdocfile.md) # Let's rebuild the new situation new_files = {} for bibdocfile in self._docfiles: new_files[(bibdocfile.name, bibdocfile.format, bibdocfile.version)] = (bibdocfile.size, bibdocfile.checksum, bibdocfile.md) # Let's subtract from added file all the files that are present in # the old list, and let's add to deleted files that are not present # added file. 
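            # Illustrative sketch, not part of the original code, using
            # hypothetical snapshots keyed by (docname, format, version) and
            # valued by (size, checksum, modification-date) tuples:
            #   old_files = {('thesis', '.pdf', 1): (1024, 'aaa...', None)}
            #   new_files = {('thesis', '.pdf', 1): (1024, 'aaa...', None),
            #                ('thesis', '.pdf', 2): (2048, 'bbb...', None)}
            # The lines below would then report ('thesis', '.pdf', 2) as added
            # and nothing as deleted: every key still present in old_files is
            # dropped from the copy of new_files, and old keys no longer on
            # disk end up in deleted_files.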
added_files = dict(new_files) deleted_files = {} for key, value in old_files.iteritems(): if added_files.has_key(key): del added_files[key] else: deleted_files[key] = value return (added_files, deleted_files) if context != ('init', 'init_from_disk'): previous_file_list = list(self._docfiles) res = run_sql("SELECT status, creation_date," "modification_date FROM bibdoc WHERE id=%s", (self.id,)) self.cd = res[0][1] self.md = res[0][2] self.status = res[0][0] self.more_info = BibDocMoreInfo(self.id) self._docfiles = [] if CFG_BIBDOCFILE_ENABLE_BIBDOCFSINFO_CACHE and context == 'init': ## In normal init context we read from DB res = run_sql("SELECT version, format, cd, md, checksum, filesize FROM bibdocfsinfo WHERE id_bibdoc=%s", (self.id, )) for version, docformat, cd, md, checksum, size in res: filepath = self.get_filepath(docformat, version) self._docfiles.append(BibDocFile( filepath, self.bibrec_types, version, docformat, self.id, self.status, checksum, self.more_info, human_readable=self.human_readable, cd=cd, md=md, size=size, bibdoc=self)) else: if os.path.exists(self.basedir): files = os.listdir(self.basedir) files.sort() for afile in files: if not afile.startswith('.'): try: filepath = os.path.join(self.basedir, afile) dummy, dummy, docformat, fileversion = decompose_file_with_version(filepath) checksum = self.md5s.get_checksum(afile) self._docfiles.append(BibDocFile(filepath, self.bibrec_types, fileversion, docformat, self.id, self.status, checksum, self.more_info, human_readable=self.human_readable, bibdoc=self)) except Exception, e: register_exception() raise InvenioBibDocFileError, e if context in ('init', 'init_from_disk'): return else: added_files, deleted_files = make_removed_added_bibdocfiles(previous_file_list) deletedstr = "DELETED" addedstr = "ADDED" if context == 'rename': deletedstr = "RENAMEDFROM" addedstr = "RENAMEDTO" for (docname, docformat, version), (size, checksum, md) in added_files.iteritems(): if context == 'rename': md = '' # No modification time log_action(addedstr, self.id, docname, docformat, version, size, checksum, md) for (docname, docformat, version), (size, checksum, md) in deleted_files.iteritems(): if context == 'rename': md = '' # No modification time log_action(deletedstr, self.id, docname, docformat, version, size, checksum, md) def _sync_to_db(self): """ Update the content of the bibdocfile table by taking what is available on the filesystem. """ self._build_file_list('init_from_disk') run_sql("DELETE FROM bibdocfsinfo WHERE id_bibdoc=%s", (self.id,)) for afile in self.docfiles: run_sql("INSERT INTO bibdocfsinfo(id_bibdoc, version, format, last_version, cd, md, checksum, filesize, mime) VALUES(%s, %s, %s, false, %s, %s, %s, %s, %s)", (self.id, afile.get_version(), afile.get_format(), afile.cd, afile.md, afile.get_checksum(), afile.get_size(), afile.mime)) run_sql("UPDATE bibdocfsinfo SET last_version=true WHERE id_bibdoc=%s AND version=%s", (self.id, self.get_latest_version())) def _build_related_file_list(self): """Lists all files attached to the bibdoc. This function should be called everytime the bibdoc is modified within e.g. its icon. @deprecated: use subformats instead. 
""" self.related_files = {} res = run_sql("SELECT ln.id_bibdoc2,ln.rel_type,bibdoc.status FROM " "bibdoc_bibdoc AS ln,bibdoc WHERE bibdoc.id=ln.id_bibdoc2 AND " "ln.id_bibdoc1=%s", (str(self.id),)) for row in res: docid = row[0] doctype = row[1] if row[2] != 'DELETED': if not self.related_files.has_key(doctype): self.related_files[doctype] = [] cur_doc = BibDoc.create_instance(docid=docid, human_readable=self.human_readable) self.related_files[doctype].append(cur_doc) def get_total_size_latest_version(self, user_info=None, subformat=None): """Return the total size used on disk of all the files belonging to this bibdoc and corresponding to the latest version. Restricted and hidden files are not counted, unless there is no user_info. @param user_info: the user_info dictionary, used to check restrictions @type: dict @param subformat: if subformat is specified, it limits files only to those from that specific subformat @type subformat: string """ ret = 0 all_files = False # If we are calling this function without user_info, then we want to # see all the files if not user_info: all_files = True for bibdocfile in self.list_latest_files(): # First check for restrictions if all_files or (bibdocfile.is_restricted(user_info)[0] == 0 and not bibdocfile.hidden): # Then check if the format is correct if subformat is None or bibdocfile.get_subformat() == subformat: ret += bibdocfile.get_size() return ret def get_total_size(self): """Return the total size used on disk of all the files belonging to this bibdoc.""" ret = 0 for bibdocfile in self.list_all_files(): ret += bibdocfile.get_size() return ret def list_all_files(self, list_hidden=True): """Returns all the docfiles linked with the given bibdoc.""" if list_hidden: return self.docfiles else: return [afile for afile in self.docfiles if not afile.hidden_p()] def list_latest_files(self, list_hidden=True): """Returns all the docfiles within the last version.""" return self.list_version_files(self.get_latest_version(), list_hidden=list_hidden) def list_version_files(self, version, list_hidden=True): """Return all the docfiles of a particular version.""" version = int(version) return [docfile for docfile in self.docfiles if docfile.get_version() == version and (list_hidden or not docfile.hidden_p())] def get_latest_version(self): """ Returns the latest existing version number for the given bibdoc. If no file is associated to this bibdoc, returns '0'. """ version = 0 for bibdocfile in self.docfiles: if bibdocfile.get_version() > version: version = bibdocfile.get_version() return version def get_file_number(self): """Return the total number of files.""" return len(self.docfiles) def register_download(self, ip_address, version, docformat, userid=0, recid=0): """Register the information about a download of a particular file.""" docformat = normalize_format(docformat) if docformat[:1] == '.': docformat = docformat[1:] docformat = docformat.upper() if not version: version = self.get_latest_version() return run_sql("INSERT INTO rnkDOWNLOADS " "(id_bibrec,id_bibdoc,file_version,file_format," "id_user,client_host,download_time) VALUES " "(%s,%s,%s,%s,%s,INET_ATON(%s),NOW())", (recid, self.id, version, docformat, userid, ip_address,)) def get_incoming_relations(self, rel_type=None): """Return all relations in which this BibDoc appears on target position @param rel_type: Type of the relation, to which we want to limit our search. 
        None = any type
        @type rel_type: string
        @return: List of BibRelation instances
        @rtype: list
        """
        return BibRelation.get_relations(rel_type=rel_type,
                                         bibdoc2_id=self.id)

    def get_outgoing_relations(self, rel_type=None):
        """Return all relations in which this BibDoc appears on the source position
        @param rel_type: Type of the relation, to which we want to limit our search.
                         None = any type
        @type rel_type: string
        @return: List of BibRelation instances
        @rtype: list
        """
        return BibRelation.get_relations(rel_type=rel_type,
                                         bibdoc1_id=self.id)

    def create_outgoing_relation(self, bibdoc2, rel_type):
        """
        Create an outgoing relation between the current BibDoc and a different one
        """
        return BibRelation.create(bibdoc1_id=self.id,
                                  bibdoc2_id=bibdoc2.id,
                                  rel_type=rel_type)

    def create_incoming_relation(self, bibdoc1, rel_type):
        """
        Create an incoming relation between a particular version of the current
        BibDoc and a particular version of a different BibDoc
        """
        return BibRelation.create(bibdoc1_id=bibdoc1.id,
                                  bibdoc2_id=self.id,
                                  rel_type=rel_type)


def generic_path2bidocfile(fullpath):
    """
    Returns a BibDocFile object that wraps the given fullpath.
    @note: the object will contain the minimum information that can be
        guessed from the fullpath (e.g. docname, format, subformat, version,
        md5, creation_date, modification_date). It won't contain for example
        a comment, a description, a doctype, a restriction.
    """
    fullpath = os.path.abspath(fullpath)
    try:
        path, name, docformat, version = decompose_file_with_version(fullpath)
    except ValueError:
        ## There is no version
        version = 0
        path, name, docformat = decompose_file(fullpath)
    md5folder = Md5Folder(path)
    checksum = md5folder.get_checksum(os.path.basename(fullpath))
    return BibDocFile(fullpath=fullpath,
                      recid_doctypes=[(0, None, name)],
                      version=version,
                      docformat=docformat,
                      docid=0,
                      status=None,
                      checksum=checksum,
                      more_info=None)


class BibDocFile(object):
    """This class represents a physical file in the Invenio filesystem.
It should never be instantiated directly""" def __init__(self, fullpath, recid_doctypes, version, docformat, docid, status, checksum, more_info=None, human_readable=False, cd=None, md=None, size=None, bibdoc=None): self.fullpath = os.path.abspath(fullpath) self.docid = docid self.recids_doctypes = recid_doctypes self.version = version self.status = status self.checksum = checksum self.human_readable = human_readable self.name = recid_doctypes[0][2] if bibdoc is not None: self.__bibdoc = ref(bibdoc) else: self.__bibdoc = None if more_info: self.description = more_info.get_description(docformat, version) self.comment = more_info.get_comment(docformat, version) self.flags = more_info.get_flags(docformat, version) else: self.description = None self.comment = None self.flags = [] self.format = normalize_format(docformat) self.superformat = get_superformat_from_format(self.format) self.subformat = get_subformat_from_format(self.format) if docformat: self.recids_doctypes = [(a,b,c+self.superformat) for (a,b,c) in self.recids_doctypes] self.mime, self.encoding = _mimes.guess_type(self.recids_doctypes[0][2]) if self.mime is None: self.mime = "application/octet-stream" self.more_info = more_info self.hidden = 'HIDDEN' in self.flags self.size = size or os.path.getsize(fullpath) self.md = md or datetime.fromtimestamp(os.path.getmtime(fullpath)) try: self.cd = cd or datetime.fromtimestamp(os.path.getctime(fullpath)) except OSError: self.cd = self.md self.dir = os.path.dirname(fullpath) # make filename url safe url_safe_filename = urllib.quote(self.name) if self.subformat: self.url = create_url('%s/%s/%s/files/%s%s' % (CFG_SITE_URL, CFG_SITE_RECORD, self.recids_doctypes[0][0], url_safe_filename, self.superformat), {'subformat' : self.subformat}) self.fullurl = create_url('%s/%s/%s/files/%s%s' % (CFG_SITE_URL, CFG_SITE_RECORD, self.recids_doctypes[0][0], url_safe_filename, self.superformat), {'subformat' : self.subformat, 'version' : self.version}) else: self.url = create_url('%s/%s/%s/files/%s%s' % (CFG_SITE_URL, CFG_SITE_RECORD, self.recids_doctypes[0][0], url_safe_filename, self.superformat), {}) self.fullurl = create_url('%s/%s/%s/files/%s%s' % (CFG_SITE_URL, CFG_SITE_RECORD, self.recids_doctypes[0][0], url_safe_filename, self.superformat), {'version' : self.version}) self.etag = '"%i%s%i"' % (self.docid, self.format, self.version) self.magic = None @property def bibdoc(self): """ Wrapper around the referenced bibdoc necesseary to avoid memory leaks. """ if self.__bibdoc is None or self.__bibdoc() is None: bibdoc = BibDoc(self.docid) self.__bibdoc = ref(bibdoc) return bibdoc return self.__bibdoc() def __getstate__(self): """Remove weakref so the object can be pickled.""" dict_ = copy.copy(self.__dict__) dict_['_BibDocFile__bibdoc'] = self.bibdoc return dict_ def __setstate__(self, data_dict): """Undo what `__getstate__` did setting back the weakref. 
:param data_dict: `dict` from `__getstate__` """ for (name, value) in data_dict.iteritems(): setattr(self, name, value) if self.__bibdoc is not None: self.__bibdoc = ref(self.__bibdoc) def __repr__(self): return ('BibDocFile(%s, %i, %s, %s, %i, %i, %s, %s, %s, %s)' % (repr(self.fullpath), self.version, repr(self.name), repr(self.format), self.recids_doctypes[0][0], self.docid, repr(self.status), repr(self.checksum), repr(self.more_info), repr(self.human_readable))) def format_recids(self): if self.bibdoc: return self.bibdoc.format_recids() return "0" def __str__(self): recids = self.format_recids() out = '%s:%s:%s:%s:fullpath=%s\n' % (recids, self.docid, self.version, self.format, self.fullpath) out += '%s:%s:%s:%s:name=%s\n' % (recids, self.docid, self.version, self.format, self.name) out += '%s:%s:%s:%s:subformat=%s\n' % (recids, self.docid, self.version, self.format, get_subformat_from_format(self.format)) out += '%s:%s:%s:%s:status=%s\n' % (recids, self.docid, self.version, self.format, self.status) out += '%s:%s:%s:%s:checksum=%s\n' % (recids, self.docid, self.version, self.format, self.checksum) if self.human_readable: out += '%s:%s:%s:%s:size=%s\n' % (recids, self.docid, self.version, self.format, nice_size(self.size)) else: out += '%s:%s:%s:%s:size=%s\n' % (recids, self.docid, self.version, self.format, self.size) out += '%s:%s:%s:%s:creation time=%s\n' % (recids, self.docid, self.version, self.format, self.cd) out += '%s:%s:%s:%s:modification time=%s\n' % (recids, self.docid, self.version, self.format, self.md) out += '%s:%s:%s:%s:magic=%s\n' % (recids, self.docid, self.version, self.format, self.get_magic()) out += '%s:%s:%s:%s:mime=%s\n' % (recids, self.docid, self.version, self.format, self.mime) out += '%s:%s:%s:%s:encoding=%s\n' % (recids, self.docid, self.version, self.format, self.encoding) out += '%s:%s:%s:%s:url=%s\n' % (recids, self.docid, self.version, self.format, self.url) out += '%s:%s:%s:%s:fullurl=%s\n' % (recids, self.docid, self.version, self.format, self.fullurl) out += '%s:%s:%s:%s:description=%s\n' % (recids, self.docid, self.version, self.format, self.description) out += '%s:%s:%s:%s:comment=%s\n' % (recids, self.docid, self.version, self.format, self.comment) out += '%s:%s:%s:%s:hidden=%s\n' % (recids, self.docid, self.version, self.format, self.hidden) out += '%s:%s:%s:%s:flags=%s\n' % (recids, self.docid, self.version, self.format, self.flags) out += '%s:%s:%s:%s:etag=%s\n' % (recids, self.docid, self.version, self.format, self.etag) return out def is_restricted(self, user_info): """Returns restriction state. (see acc_authorize_action return values)""" if self.status not in ('', 'DELETED'): return check_bibdoc_authorization(user_info, status=self.status) elif self.status == 'DELETED': return (1, 'File has ben deleted') else: return (0, '') def is_icon(self, subformat_re=CFG_BIBDOCFILE_ICON_SUBFORMAT_RE): """ @param subformat_re: by default the convention is that L{CFG_BIBDOCFILE_ICON_SUBFORMAT_RE} is used as a subformat indicator to mean that a particular format is to be used as an icon. Specifiy a different subformat if you need to use a different convention. @type subformat: compiled regular expression @return: True if this file is an icon. 
        @rtype: bool
        """
        return bool(subformat_re.match(self.subformat))

    def hidden_p(self):
        return self.hidden

    def get_url(self):
        return self.url

    def get_type(self):
        """Returns the first type connected with the bibdoc of this file."""
        return self.recids_doctypes[0][1]

    def get_path(self):
        return self.fullpath

    def get_bibdocid(self):
        return self.docid

    def get_name(self):
        return self.name

    def get_full_name(self):
        """Returns the first name connected with the bibdoc of this file."""
        return self.recids_doctypes[0][2]

    def get_full_path(self):
        return self.fullpath

    def get_format(self):
        return self.format

    def get_subformat(self):
        return self.subformat

    def get_superformat(self):
        return self.superformat

    def get_size(self):
        return self.size

    def get_version(self):
        return self.version

    def get_checksum(self):
        return self.checksum

    def get_description(self):
        return self.description

    def get_comment(self):
        return self.comment

    def get_content(self):
        """Returns the binary content of the file."""
        content_fd = open(self.fullpath, 'rb')
        content = content_fd.read()
        content_fd.close()
        return content

    def get_recid(self):
        """Returns the first recid connected with the bibdoc of this file."""
        return self.recids_doctypes[0][0]

    def get_status(self):
        """Returns the status of the file, i.e. either '', 'DELETED' or a
        restriction keyword."""
        return self.status

    def get_magic(self):
        """Return all the possible guesses from the magic library about
        the content of the file."""
        if self.magic is None:
            if CFG_HAS_MAGIC == 1:
                magic_cookies = _get_magic_cookies()
                magic_result = []
                for key in magic_cookies.keys():
                    magic_result.append(magic_cookies[key].file(self.fullpath))
                self.magic = tuple(magic_result)
            elif CFG_HAS_MAGIC == 2:
                magic_result = []
                for key in ({'mime': False, 'mime_encoding': False},
                            {'mime': True, 'mime_encoding': False},
                            {'mime': False, 'mime_encoding': True}):
                    magic_result.append(_magic_wrapper(self.fullpath, **key))
                self.magic = tuple(magic_result)
        return self.magic

    def check(self):
        """Return True if the checksum corresponds to the file."""
        return calculate_md5(self.fullpath) == self.checksum

    def stream(self, req, download=False):
        """Stream the file. Note that no restriction check is being done
        here, since restrictions have been checked previously inside
        websubmit_webinterface.py."""
        if os.path.exists(self.fullpath):
            if random.random() < CFG_BIBDOCFILE_MD5_CHECK_PROBABILITY and calculate_md5(self.fullpath) != self.checksum:
                raise InvenioBibDocFileError, "File %s, version %i, is corrupted!" % (self.recids_doctypes[0][2], self.version)
            stream_file(req, self.fullpath, "%s%s" % (self.name, self.superformat), self.mime, self.encoding, self.etag, self.checksum, self.fullurl, download=download)
            raise apache.SERVER_RETURN, apache.DONE
        else:
            req.status = apache.HTTP_NOT_FOUND
            raise InvenioBibDocFileError, "%s does not exist!" % self.fullpath

_RE_STATUS_PARSER = re.compile(r'^(?P<type>email|group|egroup|role|firerole|status):\s*(?P<value>.*)$', re.S + re.I)

def check_bibdoc_authorization(user_info, status):
    """
    Check if the user is authorized to access a document protected with the given status.
L{status} is a string of the form:: auth_type: auth_value where C{auth_type} can have values in:: email, group, role, firerole, status and C{auth_value} has a value interpreted againsta C{auth_type}: - C{email}: the user can access the document if his/her email matches C{auth_value} - C{group}: the user can access the document if one of the groups (local or external) of which he/she is member matches C{auth_value} - C{role}: the user can access the document if he/she belongs to the WebAccess role specified in C{auth_value} - C{firerole}: the user can access the document if he/she is implicitly matched by the role described by the firewall like role definition in C{auth_value} - C{status}: the user can access the document if he/she is authorized to for the action C{viewrestrdoc} with C{status} paramter having value C{auth_value} @note: If no C{auth_type} is specified or if C{auth_type} is not one of the above, C{auth_value} will be set to the value contained in the parameter C{status}, and C{auth_type} will be considered to be C{status}. @param user_info: the user_info dictionary @type: dict @param status: the status of the document. @type status: string @return: a tuple, of the form C{(auth_code, auth_message)} where auth_code is 0 if the authorization is granted and greater than 0 otherwise. @rtype: (int, string) @raise ValueError: in case of unexpected parsing error. """ if not status: return (0, CFG_WEBACCESS_WARNING_MSGS[0]) def parse_status(status): g = _RE_STATUS_PARSER.match(status) if g: return (g.group('type').lower(), g.group('value')) else: return ('status', status) if acc_is_user_in_role(user_info, acc_get_role_id(SUPERADMINROLE)): return (0, CFG_WEBACCESS_WARNING_MSGS[0]) auth_type, auth_value = parse_status(status) if auth_type == 'status': return acc_authorize_action(user_info, 'viewrestrdoc', status=auth_value) elif auth_type == 'email': if not auth_value.lower().strip() == user_info['email'].lower().strip(): return (1, 'You must be member of the group %s in order to access this document' % repr(auth_value)) elif auth_type == 'group': if not auth_value in user_info['group']: return (1, 'You must be member of the group %s in order to access this document' % repr(auth_value)) elif auth_type == 'role': if not acc_is_user_in_role(user_info, acc_get_role_id(auth_value)): return (1, 'You must be member in the role %s in order to access this document' % repr(auth_value)) elif auth_type == 'firerole': if not acc_firerole_check_user(user_info, compile_role_definition(auth_value)): return (1, 'You must be authorized in order to access this document') else: raise ValueError, 'Unexpected authorization type %s for %s' % (repr(auth_type), repr(auth_value)) return (0, CFG_WEBACCESS_WARNING_MSGS[0]) _RE_BAD_MSIE = re.compile("MSIE\s+(\d+\.\d+)") def stream_file(req, fullpath, fullname=None, mime=None, encoding=None, etag=None, md5str=None, location=None, download=False): """This is a generic function to stream a file to the user. If fullname, mime, encoding, and location are not provided they will be guessed based on req and fullpath. md5str should be passed as an hexadecimal string. 
""" def normal_streaming(size): req.set_content_length(size) req.send_http_header() if not req.header_only: req.sendfile(fullpath) return "" def single_range(size, the_range): req.set_content_length(the_range[1]) req.headers_out['Content-Range'] = 'bytes %d-%d/%d' % (the_range[0], the_range[0] + the_range[1] - 1, size) req.status = apache.HTTP_PARTIAL_CONTENT req.send_http_header() if not req.header_only: req.sendfile(fullpath, the_range[0], the_range[1]) return "" def multiple_ranges(size, ranges, mime): req.status = apache.HTTP_PARTIAL_CONTENT boundary = '%s%04d' % (time.strftime('THIS_STRING_SEPARATES_%Y%m%d%H%M%S'), random.randint(0, 9999)) req.content_type = 'multipart/byteranges; boundary=%s' % boundary content_length = 0 for arange in ranges: content_length += len('--%s\r\n' % boundary) content_length += len('Content-Type: %s\r\n' % mime) content_length += len('Content-Range: bytes %d-%d/%d\r\n' % (arange[0], arange[0] + arange[1] - 1, size)) content_length += len('\r\n') content_length += arange[1] content_length += len('\r\n') content_length += len('--%s--\r\n' % boundary) req.set_content_length(content_length) req.send_http_header() if not req.header_only: for arange in ranges: req.write('--%s\r\n' % boundary, 0) req.write('Content-Type: %s\r\n' % mime, 0) req.write('Content-Range: bytes %d-%d/%d\r\n' % (arange[0], arange[0] + arange[1] - 1, size), 0) req.write('\r\n', 0) req.sendfile(fullpath, arange[0], arange[1]) req.write('\r\n', 0) req.write('--%s--\r\n' % boundary) req.flush() return "" def parse_date(date): """According to a date can come in three formats (in order of preference): Sun, 06 Nov 1994 08:49:37 GMT ; RFC 822, updated by RFC 1123 Sunday, 06-Nov-94 08:49:37 GMT ; RFC 850, obsoleted by RFC 1036 Sun Nov 6 08:49:37 1994 ; ANSI C's asctime() format Moreover IE is adding some trailing information after a ';'. Wrong dates should be simpled ignored. This function return the time in seconds since the epoch GMT or None in case of errors.""" if not date: return None try: date = date.split(';')[0].strip() # Because of IE ## Sun, 06 Nov 1994 08:49:37 GMT return time.mktime(time.strptime(date, '%a, %d %b %Y %X %Z')) except: try: ## Sun, 06 Nov 1994 08:49:37 GMT return time.mktime(time.strptime(date, '%A, %d-%b-%y %H:%M:%S %Z')) except: try: ## Sun, 06 Nov 1994 08:49:37 GMT return time.mktime(date) except: return None def parse_ranges(ranges): """According to a (multiple) range request comes in the form: bytes=20-30,40-60,70-,-80 with the meaning: from byte to 20 to 30 inclusive (11 bytes) from byte to 40 to 60 inclusive (21 bytes) from byte 70 to (size - 1) inclusive (size - 70 bytes) from byte size - 80 to (size - 1) inclusive (80 bytes) This function will return the list of ranges in the form: [[first_byte, last_byte], ...] 
If first_byte or last_byte aren't specified they'll be set to None If the list is not well formatted it will return None """ try: if ranges.startswith('bytes') and '=' in ranges: ranges = ranges.split('=')[1].strip() else: return None ret = [] for arange in ranges.split(','): arange = arange.strip() if arange.startswith('-'): ret.append([None, int(arange[1:])]) elif arange.endswith('-'): ret.append([int(arange[:-1]), None]) else: ret.append(map(int, arange.split('-'))) return ret except: return None def parse_tags(tags): """Return a list of tags starting from a comma separated list.""" return [tag.strip() for tag in tags.split(',')] def fix_ranges(ranges, size): """Complementary to parse_ranges it will transform all the ranges into (first_byte, length), adjusting all the value based on the actual size provided. """ ret = [] for arange in ranges: if (arange[0] is None and arange[1] > 0) or arange[0] < size: if arange[0] is None: arange[0] = size - arange[1] elif arange[1] is None: arange[1] = size - arange[0] else: arange[1] = arange[1] - arange[0] + 1 arange[0] = max(0, arange[0]) arange[1] = min(size - arange[0], arange[1]) if arange[1] > 0: ret.append(arange) return ret def get_normalized_headers(): """Strip and lowerize all the keys of the headers dictionary plus strip, lowerize and transform known headers value into their value.""" ret = { 'if-match' : None, 'unless-modified-since' : None, 'if-modified-since' : None, 'range' : None, 'if-range' : None, 'if-none-match' : None, } for key, value in req.headers_in.iteritems(): key = key.strip().lower() value = value.strip() if key in ('unless-modified-since', 'if-modified-since'): value = parse_date(value) elif key == 'range': value = parse_ranges(value) elif key == 'if-range': value = parse_date(value) or parse_tags(value) elif key in ('if-match', 'if-none-match'): value = parse_tags(value) if value: ret[key] = value return ret headers = get_normalized_headers() g = _RE_BAD_MSIE.search(headers.get('user-agent', "MSIE 6.0")) bad_msie = g and float(g.group(1)) < 9.0 if CFG_BIBDOCFILE_USE_XSENDFILE: ## If XSendFile is supported by the server, let's use it. 
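        # Illustrative note (assumption: mod_xsendfile or an equivalent web
        # server module is enabled): in this branch the application never
        # reads the file itself.  It only emits headers along the lines of
        #   Content-Disposition: inline; filename="report.pdf"
        #   X-Sendfile: /opt/invenio/var/data/files/g0/123/report.pdf;1
        # (hypothetical values) and lets the web server perform the actual
        # byte transfer, which is cheaper than streaming through Python.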
if os.path.exists(fullpath): if fullname is None: fullname = os.path.basename(fullpath) if bad_msie: ## IE is confused by quotes req.headers_out["Content-Disposition"] = 'attachment; filename=%s' % fullname.replace('"', '\\"') elif download: req.headers_out["Content-Disposition"] = 'attachment; filename="%s"' % fullname.replace('"', '\\"') else: ## IE is confused by inline req.headers_out["Content-Disposition"] = 'inline; filename="%s"' % fullname.replace('"', '\\"') req.headers_out["X-Sendfile"] = fullpath if mime is None: (mime, encoding) = _mimes.guess_type(fullpath) if mime is None: mime = "application/octet-stream" if not bad_msie: ## IE is confused by not supported mimetypes req.content_type = mime return "" else: raise apache.SERVER_RETURN, apache.HTTP_NOT_FOUND if headers['if-match']: if etag is not None and etag not in headers['if-match']: raise apache.SERVER_RETURN, apache.HTTP_PRECONDITION_FAILED if os.path.exists(fullpath): mtime = os.path.getmtime(fullpath) if fullname is None: fullname = os.path.basename(fullpath) if mime is None: (mime, encoding) = _mimes.guess_type(fullpath) if mime is None: mime = "application/octet-stream" if location is None: location = req.uri if not bad_msie: ## IE is confused by not supported mimetypes req.content_type = mime req.encoding = encoding req.filename = fullname req.headers_out["Last-Modified"] = time.strftime('%a, %d %b %Y %X GMT', time.gmtime(mtime)) if CFG_ENABLE_HTTP_RANGE_REQUESTS: req.headers_out["Accept-Ranges"] = "bytes" else: req.headers_out["Accept-Ranges"] = "none" req.headers_out["Content-Location"] = location if etag is not None: req.headers_out["ETag"] = etag if md5str is not None: req.headers_out["Content-MD5"] = base64.encodestring(binascii.unhexlify(md5str.upper()))[:-1] if bad_msie: ## IE is confused by quotes req.headers_out["Content-Disposition"] = 'attachment; filename=%s' % fullname.replace('"', '\\"') elif download: req.headers_out["Content-Disposition"] = 'attachment; filename="%s"' % fullname.replace('"', '\\"') else: ## IE is confused by inline req.headers_out["Content-Disposition"] = 'inline; filename="%s"' % fullname.replace('"', '\\"') size = os.path.getsize(fullpath) if not size: try: raise Exception, '%s exists but is empty' % fullpath except Exception: register_exception(req=req, alert_admin=True) raise apache.SERVER_RETURN, apache.HTTP_NOT_FOUND if headers['if-modified-since'] and headers['if-modified-since'] >= mtime: raise apache.SERVER_RETURN, apache.HTTP_NOT_MODIFIED if headers['if-none-match']: if etag is not None and etag in headers['if-none-match']: raise apache.SERVER_RETURN, apache.HTTP_NOT_MODIFIED if headers['unless-modified-since'] and headers['unless-modified-since'] < mtime: return normal_streaming(size) if CFG_ENABLE_HTTP_RANGE_REQUESTS and headers['range']: try: if headers['if-range']: if etag is None or etag not in headers['if-range']: return normal_streaming(size) ranges = fix_ranges(headers['range'], size) except: return normal_streaming(size) if len(ranges) > 1: return multiple_ranges(size, ranges, mime) elif ranges: return single_range(size, ranges[0]) else: raise apache.SERVER_RETURN, apache.HTTP_RANGE_NOT_SATISFIABLE else: return normal_streaming(size) else: raise apache.SERVER_RETURN, apache.HTTP_NOT_FOUND def stream_restricted_icon(req): """Return the content of the "Restricted Icon" file.""" stream_file(req, '%s/img/restricted.gif' % CFG_WEBDIR) raise apache.SERVER_RETURN, apache.DONE #def list_versions_from_array(docfiles): # """Retrieve the list of existing versions from the given 
docfiles list.""" # versions = [] # for docfile in docfiles: # if not docfile.get_version() in versions: # versions.append(docfile.get_version()) # versions.sort() # versions.reverse() # return versions def _make_base_dir(docid): """Given a docid it returns the complete path that should host its files.""" group = "g" + str(int(int(docid) / CFG_BIBDOCFILE_FILESYSTEM_BIBDOC_GROUP_LIMIT)) return os.path.join(CFG_BIBDOCFILE_FILEDIR, group, str(docid)) class Md5Folder(object): """Manage all the Md5 checksum about a folder""" def __init__(self, folder): """Initialize the class from the md5 checksum of a given path""" self.folder = folder self.load() def update(self, only_new=True): """Update the .md5 file with the current files. If only_new is specified then only not already calculated file are calculated.""" if not only_new: self.md5s = {} if os.path.exists(self.folder): for filename in os.listdir(self.folder): if filename not in self.md5s and not filename.startswith('.'): self.md5s[filename] = calculate_md5(os.path.join(self.folder, filename)) self.store() def store(self): """Store the current md5 dictionary into .md5""" try: old_umask = os.umask(022) md5file = open(os.path.join(self.folder, ".md5"), "w") for key, value in self.md5s.items(): md5file.write('%s *%s\n' % (value, key)) md5file.close() os.umask(old_umask) except Exception, e: register_exception(alert_admin=True) raise InvenioBibDocFileError("Encountered an exception while storing .md5 for folder '%s': '%s'" % (self.folder, e)) def load(self): """Load .md5 into the md5 dictionary""" self.md5s = {} md5_path = os.path.join(self.folder, ".md5") if os.path.exists(md5_path): for row in open(md5_path, "r"): md5hash = row[:32] filename = row[34:].strip() self.md5s[filename] = md5hash else: self.update() def check(self, filename=''): """Check the specified file or all the files for which it exists a hash for being coherent with the stored hash.""" if filename and filename in self.md5s.keys(): try: return self.md5s[filename] == calculate_md5(os.path.join(self.folder, filename)) except Exception, e: register_exception(alert_admin=True) raise InvenioBibDocFileError("Encountered an exception while loading '%s': '%s'" % (os.path.join(self.folder, filename), e)) else: for filename, md5hash in self.md5s.items(): try: if calculate_md5(os.path.join(self.folder, filename)) != md5hash: return False except Exception, e: register_exception(alert_admin=True) raise InvenioBibDocFileError("Encountered an exception while loading '%s': '%s'" % (os.path.join(self.folder, filename), e)) return True def get_checksum(self, filename): """Return the checksum of a physical file.""" md5hash = self.md5s.get(filename, None) if md5hash is None: self.update() # Now it should not fail! md5hash = self.md5s[filename] return md5hash def calculate_md5_external(filename): """Calculate the md5 of a physical file through md5sum Command Line Tool. This is suitable for file larger than 256Kb.""" try: md5_result = os.popen(CFG_PATH_MD5SUM + ' -b %s' % escape_shell_arg(filename)) ret = md5_result.read()[:32] md5_result.close() if len(ret) != 32: # Error in running md5sum. Let's fallback to internal # algorithm. return calculate_md5(filename, force_internal=True) else: return ret except Exception, e: raise InvenioBibDocFileError("Encountered an exception while calculating md5 for file '%s': '%s'" % (filename, e)) def calculate_md5(filename, force_internal=False): """Calculate the md5 of a physical file. 
This is suitable for files smaller than 256Kb.""" if not CFG_PATH_MD5SUM or force_internal or os.path.getsize(filename) < CFG_BIBDOCFILE_MD5_THRESHOLD: try: to_be_read = open(filename, "rb") computed_md5 = md5() while True: buf = to_be_read.read(CFG_BIBDOCFILE_MD5_BUFFER) if buf: computed_md5.update(buf) else: break to_be_read.close() return computed_md5.hexdigest() except Exception, e: register_exception(alert_admin=True) raise InvenioBibDocFileError("Encountered an exception while calculating md5 for file '%s': '%s'" % (filename, e)) else: return calculate_md5_external(filename) def bibdocfile_url_to_bibrecdocs(url): """Given an URL in the form CFG_SITE_[SECURE_]URL/CFG_SITE_RECORD/xxx/files/... it returns a BibRecDocs object for the corresponding recid.""" recid = decompose_bibdocfile_url(url)[0] return BibRecDocs(recid) def bibdocfile_url_to_bibdoc(url): """Given an URL in the form CFG_SITE_[SECURE_]URL/CFG_SITE_RECORD/xxx/files/... it returns a BibDoc object for the corresponding recid/docname.""" docname = decompose_bibdocfile_url(url)[1] return bibdocfile_url_to_bibrecdocs(url).get_bibdoc(docname) def bibdocfile_url_to_bibdocfile(url): """Given an URL in the form CFG_SITE_[SECURE_]URL/CFG_SITE_RECORD/xxx/files/... it returns a BibDocFile object for the corresponding recid/docname/format.""" docformat = decompose_bibdocfile_url(url)[2] return bibdocfile_url_to_bibdoc(url).get_file(docformat) def bibdocfile_url_to_fullpath(url): """Given an URL in the form CFG_SITE_[SECURE_]URL/CFG_SITE_RECORD/xxx/files/... it returns the fullpath for the corresponding recid/docname/format.""" return bibdocfile_url_to_bibdocfile(url).get_full_path() def bibdocfile_url_p(url): """Return True when the url is a potential valid url pointing to a fulltext owned by a system.""" if url.startswith('%s/getfile.py' % CFG_SITE_URL) or url.startswith('%s/getfile.py' % CFG_SITE_SECURE_URL): return True if not (url.startswith('%s/%s/' % (CFG_SITE_URL, CFG_SITE_RECORD)) or url.startswith('%s/%s/' % (CFG_SITE_SECURE_URL, CFG_SITE_RECORD))): return False splitted_url = url.split('/files/') return len(splitted_url) == 2 and splitted_url[0] != '' and splitted_url[1] != '' def get_docid_from_bibdocfile_fullpath(fullpath): """Given a bibdocfile fullpath (e.g. "CFG_BIBDOCFILE_FILEDIR/g0/123/bar.pdf;1") returns the docid (e.g. 123).""" if not fullpath.startswith(os.path.join(CFG_BIBDOCFILE_FILEDIR, 'g')): raise InvenioBibDocFileError, "Fullpath %s doesn't correspond to a valid bibdocfile fullpath" % fullpath dirname = decompose_file_with_version(fullpath)[0] try: return int(dirname.split('/')[-1]) except: raise InvenioBibDocFileError, "Fullpath %s doesn't correspond to a valid bibdocfile fullpath" % fullpath def decompose_bibdocfile_fullpath(fullpath): """Given a bibdocfile fullpath (e.g. 
"CFG_BIBDOCFILE_FILEDIR/g0/123/bar.pdf;1") returns a quadruple (recid, docname, format, version).""" if not fullpath.startswith(os.path.join(CFG_BIBDOCFILE_FILEDIR, 'g')): raise InvenioBibDocFileError, "Fullpath %s doesn't correspond to a valid bibdocfile fullpath" % fullpath dirname, dummy, extension, version = decompose_file_with_version(fullpath) try: docid = int(dirname.split('/')[-1]) return {"doc_id" : docid, "extension": extension, "version": version} except: raise InvenioBibDocFileError, "Fullpath %s doesn't correspond to a valid bibdocfile fullpath" % fullpath _RE_BIBDOCFILE_URL = re.compile("/%s/(?P\d+)/files/(?P.*)" % (re.escape(CFG_SITE_RECORD), )) def decompose_bibdocfile_url(url): """Given a bibdocfile_url return a triple (recid, docname, format).""" if url.startswith('%s/getfile.py' % CFG_SITE_URL) or url.startswith('%s/getfile.py' % CFG_SITE_SECURE_URL): return decompose_bibdocfile_very_old_url(url) scheme, netloc, path, query, dummy_fragment = urlsplit(url) if "%s://%s" % (scheme, netloc) not in (CFG_SITE_URL, CFG_SITE_SECURE_URL): raise InvenioBibDocFileError("URL %s doesn't correspond to a valid BibDocFile URL." % url) g = _RE_BIBDOCFILE_URL.match(urllib.unquote(path)) if g: recid = int(g.group('recid')) rest = g.group('rest') dummy, docname, docformat = decompose_file(rest) query = parse_qs(query) if 'subformat' in query: docformat += ";%s" % query['subformat'][0] return recid, docname, docformat else: raise InvenioBibDocFileError, "Url %s doesn't correspond to a valid record inside the system." % url re_bibdocfile_old_url = re.compile(r'/%s/(\d*)/files/' % CFG_SITE_RECORD) def decompose_bibdocfile_old_url(url): """Given a bibdocfile old url (e.g. CFG_SITE_URL/CFG_SITE_RECORD/123/files) it returns the recid.""" g = re_bibdocfile_old_url.search(url) if g: return int(g.group(1)) raise InvenioBibDocFileError('%s is not a valid old bibdocfile url' % url) def decompose_bibdocfile_very_old_url(url): """Decompose an old /getfile.py? URL""" if url.startswith('%s/getfile.py' % CFG_SITE_URL) or url.startswith('%s/getfile.py' % CFG_SITE_SECURE_URL): params = urllib.splitquery(url)[1] if params: try: params = parse_qs(params) if 'docid' in params: docid = int(params['docid'][0]) bibdoc = BibDoc.create_instance(docid) if bibdoc.bibrec_links: recid = bibdoc.bibrec_links[0]["rec_id"] docname = bibdoc.bibrec_links[0]["doc_name"] else: raise InvenioBibDocFileError("Old style URL pointing to an unattached document") elif 'recid' in params: recid = int(params['recid'][0]) if 'name' in params: docname = params['name'][0] else: docname = '' else: raise InvenioBibDocFileError('%s has not enough params to correspond to a bibdocfile.' % url) docformat = normalize_format(params.get('format', [''])[0]) return (recid, docname, docformat) except Exception, e: raise InvenioBibDocFileError('Problem with %s: %s' % (url, e)) else: raise InvenioBibDocFileError('%s has no params to correspond to a bibdocfile.' % url) else: raise InvenioBibDocFileError('%s is not a valid very old bibdocfile url' % url) def get_docname_from_url(url): """Return a potential docname given a url""" path = urlsplit(urllib.unquote(url))[2] filename = os.path.split(path)[-1] return file_strip_ext(filename) def get_format_from_url(url): """Return a potential format given a url""" path = urlsplit(urllib.unquote(url))[2] filename = os.path.split(path)[-1] return filename[len(file_strip_ext(filename)):] def clean_url(url): """Given a local url e.g. 
a local path it render it a realpath.""" if is_url_a_local_file(url): path = urlsplit(urllib.unquote(url))[2] return os.path.abspath(path) else: return url def is_url_a_local_file(url): """Return True if the given URL is pointing to a local file.""" protocol = urlsplit(url)[0] return protocol in ('', 'file') def check_valid_url(url): """ Check for validity of a url or a file. @param url: the URL to check @type url: string @raise StandardError: if the URL is not a valid URL. """ try: if is_url_a_local_file(url): path = urlsplit(urllib.unquote(url))[2] if os.path.abspath(path) != path: raise StandardError, "%s is not a normalized path (would be %s)." % (path, os.path.normpath(path)) for allowed_path in CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS + [CFG_TMPDIR, CFG_TMPSHAREDDIR, CFG_WEBSUBMIT_STORAGEDIR]: if path.startswith(allowed_path): dummy_fd = open(path) dummy_fd.close() return raise StandardError, "%s is not in one of the allowed paths." % path else: try: open_url(url) except InvenioBibdocfileUnauthorizedURL, e: raise StandardError, str(e) except Exception, e: raise StandardError, "%s is not a correct url: %s" % (url, e) def safe_mkstemp(suffix, prefix='bibdocfile_'): """Create a temporary filename that don't have any '.' inside a part from the suffix.""" tmpfd, tmppath = tempfile.mkstemp(suffix=suffix, prefix=prefix, dir=CFG_TMPDIR) # Close the file and leave the responsability to the client code to # correctly open/close it. os.close(tmpfd) if '.' not in suffix: # Just in case format is empty return tmppath while '.' in os.path.basename(tmppath)[:-len(suffix)]: os.remove(tmppath) tmpfd, tmppath = tempfile.mkstemp(suffix=suffix, prefix=prefix, dir=CFG_TMPDIR) os.close(tmpfd) return tmppath def download_local_file(filename, docformat=None): """ Copies a local file to Invenio's temporary directory. @param filename: the name of the file to copy @type filename: string @param format: the format of the file to copy (will be found if not specified) @type format: string @return: the path of the temporary file created @rtype: string @raise StandardError: if something went wrong """ # Make sure the format is OK. if docformat is None: docformat = guess_format_from_url(filename) else: docformat = normalize_format(docformat) tmppath = '' # Now try to copy. try: path = urlsplit(urllib.unquote(filename))[2] if os.path.abspath(path) != path: raise StandardError, "%s is not a normalized path (would be %s)." \ % (path, os.path.normpath(path)) for allowed_path in CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS + [CFG_TMPDIR, CFG_WEBSUBMIT_STORAGEDIR]: if path.startswith(allowed_path): tmppath = safe_mkstemp(docformat) shutil.copy(path, tmppath) if os.path.getsize(tmppath) == 0: os.remove(tmppath) raise StandardError, "%s seems to be empty" % filename break else: raise StandardError, "%s is not in one of the allowed paths." % path except Exception, e: raise StandardError, "Impossible to copy the local file '%s': %s" % \ (filename, str(e)) return tmppath def download_external_url(url, docformat=None): """ Download a url (if it corresponds to a remote file) and return a local url to it. @param url: the URL to download @type url: string @param format: the format of the file (will be found if not specified) @type format: string @return: the path to the download local file @rtype: string @raise StandardError: if the download failed """ tmppath = None # Make sure the format is OK. 
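    # Illustrative note with hypothetical URLs: for a link such as
    #   http://example.org/preprint.pdf
    # the decompose_file() call below finds the '.pdf' extension directly,
    # whereas for an extension-less link such as
    #   http://example.org/download?id=42
    # no known format is detected here and it is resolved later from the
    # Content-Type / Content-Disposition headers of the HTTP response.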
if docformat is None: # First try to find a known extension to the URL docformat = decompose_file(url, skip_version=True, only_known_extensions=True)[2] if not docformat: # No correct format could be found. Will try to get it from the # HTTP message headers. docformat = '' else: docformat = normalize_format(docformat) from_file, to_file, tmppath = None, None, '' try: from_file = open_url(url) except InvenioBibdocfileUnauthorizedURL, e: raise StandardError, str(e) except urllib2.URLError, e: raise StandardError, 'URL could not be opened: %s' % str(e) if not docformat: # We could not determine the format from the URL, so let's try # to read it from the HTTP headers. docformat = get_format_from_http_response(from_file) try: tmppath = safe_mkstemp(docformat) to_file = open(tmppath, 'w') while True: block = from_file.read(CFG_BIBDOCFILE_BLOCK_SIZE) if not block: break to_file.write(block) to_file.close() from_file.close() if os.path.getsize(tmppath) == 0: raise StandardError, "%s seems to be empty" % url except Exception, e: # Try to close and remove the temporary file. try: to_file.close() except Exception: pass try: os.remove(tmppath) except Exception: pass raise StandardError, "Error when downloading %s into %s: %s" % \ (url, tmppath, e) return tmppath def get_format_from_http_response(response): """ Tries to retrieve the format of the file from the message headers of the HTTP response. @param response: the HTTP response @type response: file-like object (as returned by urllib.urlopen) @return: the format of the remote resource @rtype: string """ def parse_content_type(text): return text.split(';')[0].strip() def parse_content_disposition(text): for item in text.split(';'): item = item.strip() if item.strip().startswith('filename='): return item[len('filename="'):-len('"')] info = response.info() docformat = '' content_disposition = info.getheader('Content-Disposition') if content_disposition: filename = parse_content_disposition(content_disposition) if filename: docformat = decompose_file(filename, only_known_extensions=False)[2] if docformat: return docformat content_type = info.getheader('Content-Type') if content_type: content_type = parse_content_type(content_type) if content_type not in ('text/plain', 'application/octet-stream'): ## We actually ignore these mimetypes since they are the ## defaults often returned by Apache in case the mimetype ## was not known if content_type in CFG_BIBDOCFILE_PREFERRED_MIMETYPES_MAPPING: docformat = normalize_format(CFG_BIBDOCFILE_PREFERRED_MIMETYPES_MAPPING[content_type]) else: ext = _mimes.guess_extension(content_type) if ext: docformat = normalize_format(ext) return docformat def download_url(url, docformat=None): """ Download a url (if it corresponds to a remote file) and return a local url to it. """ tmppath = None try: if is_url_a_local_file(url): tmppath = download_local_file(url, docformat = docformat) else: tmppath = download_external_url(url, docformat = docformat) except StandardError: raise return tmppath class MoreInfo(object): """This class represents a genering MoreInfo dictionary. MoreInfo object can be attached to bibdoc, bibversion, format or BibRelation. The entity where a particular MoreInfo object is attached has to be specified using the constructor parametes. This class is a thin wrapper around the database table. 
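# Illustrative sketch, not part of the patch: basic use of the MoreInfo class defined
# below.  With cache_only=True nothing is written to the database, so the snippet runs
# without a configured Invenio instance; docid 123 is made up.
more_info = MoreInfo(docid=123, cache_only=True)
more_info.set_data("ns", "language", "en")       # namespaced write
more_info["checksum"] = "abc123"                 # default namespace, dict interface
assert more_info.get_data("ns", "language") == "en"
assert "checksum" in more_info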
""" def __init__(self, docid = None, version = None, docformat = None, relation = None, cache_only = False, cache_reads = True, initial_data = None): """ @param cache_only Determines if MoreInfo object should be created in memory only or reflected in the database @type cache_only boolean @param cache_reads Determines if reads should be executed on the in-memory cache or should be redirected to the database. If this is true, cache can be entirely regenerated from the database only upon an explicit request. If the value is not present in the cache, the database is queried @type cache_reads boolean @param initial_data Allows to specify initial content of the cache. This parameter is useful when we create an in-memory instance from serialised value @type initial_data string """ self.docid = docid self.version = version self.format = docformat self.relation = relation self.cache_only = cache_only if initial_data != None: self.cache = initial_data self.dirty = initial_data if not self.cache_only: self._flush_cache() #inserts new entries else: self.cache = {} self.dirty = {} self.cache_reads = cache_reads if not self.cache_only: self.populate_from_database() @staticmethod def create_from_serialised(ser_str, docid = None, version = None, docformat = None, relation = None, cache_only = False, cache_reads = True): """Creates an instance of MoreInfo using serialised data as the cache content""" data = cPickle.loads(base64.b64decode(ser_str)) return MoreInfo(docid = docid, version = version, docformat = docformat, relation = relation, cache_only = cache_only, cache_reads = cache_reads, initial_data = data); def serialise_cache(self): """Returns a serialised representation of the cache""" return base64.b64encode(cPickle.dumps(self.get_cache())) def populate_from_database(self): """Retrieves all values of MoreInfo and places them in the cache""" where_str, where_args = self._generate_where_query_args() query_str = "SELECT namespace, data_key, data_value FROM bibdocmoreinfo WHERE %s" % (where_str, ) res = run_sql(query_str, where_args) if res: for row in res: namespace, data_key, data_value_ser = row data_value = cPickle.loads(data_value_ser) if not namespace in self.cache: self.cache[namespace] = {} self.cache[namespace][data_key] = data_value def _mark_dirty(self, namespace, data_key): """Marks a data key dirty - that should be saved into the database""" if not namespace in self.dirty: self.dirty[namespace] = {} self.dirty[namespace][data_key] = True def _database_get_distinct_string_list(self, column, namespace = None): """A private method reading an unique list of strings from the moreinfo database table""" where_str, where_args = self._generate_where_query_args( namespace = namespace) query_str = "SELECT DISTINCT %s FROM bibdocmoreinfo WHERE %s" % \ ( column, where_str, ) if DBG_LOG_QUERIES: from invenio.bibtask import write_message write_message("Executing query: " + query_str + " ARGS: " + repr(where_args)) print "Executing query: " + query_str + " ARGS: " + repr(where_args) res = run_sql(query_str, where_args) return (res and [x[0] for x in res]) or [] # after migrating to python 2.6, can be rewritten using x if y else z syntax: return [x[0] for x in res] if res else [] def _database_get_namespaces(self): """Read the database to discover namespaces declared in a given MoreInfo""" return self._database_get_distinct_string_list("namespace") def _database_get_keys(self, namespace): """Returns all keys assigned in a given namespace of a MoreInfo instance""" return 
self._database_get_distinct_string_list("data_key", namespace=namespace) def _database_contains_key(self, namespace, key): return self._database_read_value(namespace, key) != None def _database_save_value(self, namespace, key, value): """Write changes into the database""" #TODO: this should happen within one transaction serialised_val = cPickle.dumps(value) # on duplicate key will not work here as miltiple null values are permitted by the index if not self._database_contains_key(namespace, key): #insert new value query_parts = [] query_args = [] to_process = [(self.docid, "id_bibdoc"), (self.version, "version"), (self.format, "format"), (self.relation, "id_rel"), (str(namespace), "namespace"), (str(key), "data_key"), (str(serialised_val), "data_value")] for entry in to_process: _val_or_null(entry[0], q_str = query_parts, q_args = query_args) columns_str = ", ".join(map(lambda x: x[1], to_process)) values_str = ", ".join(query_parts) query_str = "INSERT INTO bibdocmoreinfo (%s) VALUES(%s)" % \ (columns_str, values_str) if DBG_LOG_QUERIES: from invenio.bibtask import write_message write_message("Executing query: " + query_str + " ARGS: " + repr(query_args)) print "Executing query: " + query_str + " ARGS: " + repr(query_args) run_sql(query_str, query_args) else: #Update existing value where_str, where_args = self._generate_where_query_args(namespace, key) query_str = "UPDATE bibdocmoreinfo SET data_value=%s WHERE " + where_str query_args = [str(serialised_val)] + where_args if DBG_LOG_QUERIES: from invenio.bibtask import write_message write_message("Executing query: " + query_str + " ARGS: " + repr(query_args)) print "Executing query: " + query_str + " ARGS: " + repr(query_args) run_sql(query_str, query_args ) def _database_read_value(self, namespace, key): """Reads a value directly from the database @param namespace - namespace of the data to be read @param key - key of the data to be read """ where_str, where_args = self._generate_where_query_args(namespace = namespace, data_key = key) query_str = "SELECT data_value FROM bibdocmoreinfo WHERE " + where_str res = run_sql(query_str, where_args) if DBG_LOG_QUERIES: from invenio.bibtask import write_message write_message("Executing query: " + query_str + " ARGS: " + repr(where_args) + "WITH THE RESULT: " + str(res)) s_ = "" if res: s_ = cPickle.loads(res[0][0]) print "Executing query: " + query_str + " ARGS: " + repr(where_args) + " WITH THE RESULT: " + str(s_) if res and res[0][0]: try: return cPickle.loads(res[0][0]) except: raise Exception("Error when deserialising value for %s key=%s retrieved value=%s" % (repr(self), str(key), str(res[0][0]))) return None def _database_remove_value(self, namespace, key): """Removes an entry directly in the database""" where_str, where_args = self._generate_where_query_args(namespace = namespace, data_key = key) query_str = "DELETE FROM bibdocmoreinfo WHERE " + where_str if DBG_LOG_QUERIES: from invenio.bibtask import write_message write_message("Executing query: " + query_str + " ARGS: " + repr(where_args)) print "Executing query: " + query_str + " ARGS: " + repr(where_args) run_sql(query_str, where_args) return None def _flush_cache(self): """Writes all the dirty cache entries into the database""" for namespace in self.dirty: for data_key in self.dirty[namespace]: if namespace in self.cache and data_key in self.cache[namespace]\ and not self.cache[namespace][data_key] is None: self._database_save_value(namespace, data_key, self.cache[namespace][data_key]) else: # This might happen if a value has been 
removed from the cache self._database_remove_value(namespace, data_key) self.dirty = {} def _generate_where_query_args(self, namespace = None, data_key = None): """Private method generating WHERE clause of SQL statements""" ns = [] if namespace != None: ns = [(namespace, "namespace")] dk = [] if data_key != None: dk = [(data_key, "data_key")] to_process = [(self.docid, "id_bibdoc"), (self.version, "version"), (self.format, "format"), (self.relation, "id_rel")] + \ ns + dk return _sql_generate_conjunctive_where(to_process) def set_data(self, namespace, key, value): """setting data directly in the database dictionary""" if not namespace in self.cache: self.cache[namespace] = {} self.cache[namespace][key] = value self._mark_dirty(namespace, key) if not self.cache_only: self._flush_cache() def get_data(self, namespace, key): """retrieving data from the database""" if self.cache_reads or self.cache_only: if namespace in self.cache and key in self.cache[namespace]: return self.cache[namespace][key] if not self.cache_only: # we have a permission to read from the database value = self._database_read_value(namespace, key) if value: if not namespace in self.cache: self.cache[namespace] = {} self.cache[namespace][key] = value return value return None def del_key(self, namespace, key): """retrieving data from the database""" if not namespace in self.cache: return None del self.cache[namespace][key] self._mark_dirty(namespace, key) if not self.cache_only: self._flush_cache() def contains_key(self, namespace, key): return self.get_data(namespace, key) != None # the dictionary interface -> updating the default namespace def __setitem__(self, key, value): self.set_data("", key, value) #the default value def __getitem__(self, key): return self.get_data("", key) def __delitem__(self, key): self.del_key("", key) def __contains__(self, key): return self.contains_key("", key) def __repr__(self): return "MoreInfo(docid=%s, version=%s, docformat=%s, relation=%s)" % \ (self.docid, self.version, self.format, self.relation) def delete(self): """Remove all entries associated with this MoreInfo""" self.cache = {} if not self.cache_only: where_str, query_args = self._generate_where_query_args() query_str = "DELETE FROM bibdocmoreinfo WHERE %s" % (where_str, ) if DBG_LOG_QUERIES: from invenio.bibtask import write_message write_message("Executing query: " + query_str + " ARGS: " + repr(query_args)) print "Executing query: " + query_str + " ARGS: " + repr(query_args) run_sql(query_str, query_args) def get_cache(self): """Returns the content of the cache @return The content of the MoreInfo cache @rtype dictionary {namespace: {key1: value1, ... }, namespace2: {}} """ return self.cache def get_namespaces(self): """Returns a list of namespaces present in the MoreInfo structure. If the object is permitted access to the database, the data should be always read from there. 
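# Illustrative sketch, not part of the patch: the MoreInfo cache can be serialised
# (base64-encoded pickle) and later restored without touching the database when
# cache_only=True; docid 5 is made up.
source = MoreInfo(docid=5, cache_only=True)
source["status"] = "draft"
blob = source.serialise_cache()
restored = MoreInfo.create_from_serialised(blob, docid=5, cache_only=True)
assert restored["status"] == "draft"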
Unlike when reading a particular value, we can not check if value is missing in the cache """ if self.cache_only and self.cache_reads: return self.cache.keys() return self._database_get_namespaces() def get_keys(self, namespace): """Returns a list of keys present in a given namespace""" if self.cache_only and self.cache_reads: res = [] if namespace in self.cache: res = self.cache[namespace].keys() return res else: return self._database_get_keys(namespace) def flush(self): """Flush the content into the database""" self._flush_cache() class BibDocMoreInfo(MoreInfo): """ This class wraps contextual information of the documents, such as the - comments - descriptions - flags. Such information is kept separately per every format/version instance of the corresponding document and is searialized in the database, ready to be retrieved (but not searched). @param docid: the document identifier. @type docid: integer @param more_info: a serialized version of an already existing more_info object. If not specified this information will be readed from the database, and othewise an empty dictionary will be allocated. @raise ValueError: if docid is not a positive integer. @ivar docid: the document identifier as passed to the constructor. @type docid: integer @ivar more_info: the more_info dictionary that will hold all the additional document information. @type more_info: dict of dict of dict @note: in general this class is never instanciated in client code and never used outside bibdocfile module. @note: this class will be extended in the future to hold all the new auxiliary information about a document. """ def __init__(self, docid, cache_only = False, initial_data = None): if not (type(docid) in (long, int) and docid > 0): raise ValueError("docid is not a positive integer, but %s." % docid) MoreInfo.__init__(self, docid, cache_only = cache_only, initial_data = initial_data) if 'descriptions' not in self: self['descriptions'] = {} if 'comments' not in self: self['comments'] = {} if 'flags' not in self: self['flags'] = {} if DBG_LOG_QUERIES: from invenio.bibtask import write_message write_message("Creating BibDocMoreInfo :" + repr(self["comments"])) print "Creating BibdocMoreInfo :" + repr(self["comments"]) def __repr__(self): """ @return: the canonical string representation of the C{BibDocMoreInfo}. @rtype: string """ return 'BibDocMoreInfo(%i, %s)' % (self.docid, repr(cPickle.dumps(self))) def set_flag(self, flagname, docformat, version): """ Sets a flag. @param flagname: the flag to set (see L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}). @type flagname: string @param format: the format for which the flag should set. @type format: string @param version: the version for which the flag should set: @type version: integer @raise ValueError: if the flag is not in L{CFG_BIBDOCFILE_AVAILABLE_FLAGS} """ if flagname in CFG_BIBDOCFILE_AVAILABLE_FLAGS: flags = self['flags'] if not flagname in flags: flags[flagname] = {} if not version in flags[flagname]: flags[flagname][version] = {} if not docformat in flags[flagname][version]: flags[flagname][version][docformat] = {} flags[flagname][version][docformat] = True self['flags'] = flags else: raise ValueError, "%s is not in %s" % \ (flagname, CFG_BIBDOCFILE_AVAILABLE_FLAGS) def get_comment(self, docformat, version): """ Returns the specified comment. @param format: the format for which the comment should be retrieved. @type format: string @param version: the version for which the comment should be retrieved. @type version: integer @return: the specified comment. 
@rtype: string """ try: assert(type(version) is int) docformat = normalize_format(docformat) return self['comments'].get(version, {}).get(docformat) except: register_exception() raise def get_description(self, docformat, version): """ Returns the specified description. @param format: the format for which the description should be retrieved. @type format: string @param version: the version for which the description should be retrieved. @type version: integer @return: the specified description. @rtype: string """ try: assert(type(version) is int) docformat = normalize_format(docformat) return self['descriptions'].get(version, {}).get(docformat) except: register_exception() raise def has_flag(self, flagname, docformat, version): """ Return True if the corresponding has been set. @param flagname: the name of the flag (see L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}). @type flagname: string @param format: the format for which the flag should be checked. @type format: string @param version: the version for which the flag should be checked. @type version: integer @return: True if the flag is set for the given format/version. @rtype: bool @raise ValueError: if the flagname is not in L{CFG_BIBDOCFILE_AVAILABLE_FLAGS} """ if flagname in CFG_BIBDOCFILE_AVAILABLE_FLAGS: return self['flags'].get(flagname, {}).get(version, {}).get(docformat, False) else: raise ValueError, "%s is not in %s" % (flagname, CFG_BIBDOCFILE_AVAILABLE_FLAGS) def get_flags(self, docformat, version): """ Return the list of all the enabled flags. @param format: the format for which the list should be returned. @type format: string @param version: the version for which the list should be returned. @type version: integer @return: the list of enabled flags (from L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}). @rtype: list of string """ return [flag for flag in self['flags'] if docformat in self['flags'][flag].get(version, {})] def set_comment(self, comment, docformat, version): """ Set a comment. @param comment: the comment to be set. @type comment: string @param format: the format for which the comment should be set. @type format: string @param version: the version for which the comment should be set: @type version: integer """ try: assert(type(version) is int and version > 0) docformat = normalize_format(docformat) if comment == KEEP_OLD_VALUE: comment = self.get_comment(docformat, version) or self.get_comment(docformat, version - 1) if not comment: self.unset_comment(docformat, version) return if not version in self['comments']: comments = self['comments'] comments[version] = {} self['comments'] = comments comments = self['comments'] comments[version][docformat] = comment self['comments'] = comments except: register_exception() raise def set_description(self, description, docformat, version): """ Set a description. @param description: the description to be set. @type description: string @param format: the format for which the description should be set. 
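# Illustrative sketch, not part of the patch: per (format, version) comments and flags
# handled by BibDocMoreInfo.  cache_only=True keeps the object in memory; 'STAMPED' is
# assumed here to be one of CFG_BIBDOCFILE_AVAILABLE_FLAGS.
info = BibDocMoreInfo(42, cache_only=True)
info.set_comment("Scanned copy", ".pdf", 1)
info.set_flag("STAMPED", ".pdf", 1)
print info.get_comment(".pdf", 1)            # -> 'Scanned copy'
print info.has_flag("STAMPED", ".pdf", 1)    # -> True
print info.get_flags(".pdf", 1)              # -> ['STAMPED']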
@type format: string @param version: the version for which the description should be set: @type version: integer """ try: assert(type(version) is int and version > 0) docformat = normalize_format(docformat) if description == KEEP_OLD_VALUE: description = self.get_description(docformat, version) or self.get_description(docformat, version - 1) if not description: self.unset_description(docformat, version) return descriptions = self['descriptions'] if not version in descriptions: descriptions[version] = {} descriptions[version][docformat] = description self.set_data("", 'descriptions', descriptions) except: register_exception() raise def unset_comment(self, docformat, version): """ Unset a comment. @param format: the format for which the comment should be unset. @type format: string @param version: the version for which the comment should be unset: @type version: integer """ try: assert(type(version) is int and version > 0) comments = self['comments'] del comments[version][docformat] self['comments'] = comments except KeyError: pass except: register_exception() raise def unset_description(self, docformat, version): """ Unset a description. @param format: the format for which the description should be unset. @type format: string @param version: the version for which the description should be unset: @type version: integer """ try: assert(type(version) is int and version > 0) descriptions = self['descriptions'] del descriptions[version][docformat] self['descriptions'] = descriptions except KeyError: pass except: register_exception() raise def unset_flag(self, flagname, docformat, version): """ Unset a flag. @param flagname: the flag to be unset (see L{CFG_BIBDOCFILE_AVAILABLE_FLAGS}). @type flagname: string @param format: the format for which the flag should be unset. @type format: string @param version: the version for which the flag should be unset: @type version: integer @raise ValueError: if the flag is not in L{CFG_BIBDOCFILE_AVAILABLE_FLAGS} """ if flagname in CFG_BIBDOCFILE_AVAILABLE_FLAGS: try: flags = self['flags'] del flags[flagname][version][docformat] self['flags'] = flags except KeyError: pass else: raise ValueError, "%s is not in %s" % (flagname, CFG_BIBDOCFILE_AVAILABLE_FLAGS) _bib_relation__any_value = -1 class BibRelation(object): """ A representation of a relation between documents or their particular versions """ def __init__(self, rel_type = None, bibdoc1_id = None, bibdoc2_id = None, bibdoc1_ver = None, bibdoc2_ver = None, bibdoc1_fmt = None, bibdoc2_fmt = None, rel_id = None): """ The constructor of the class representing a relation between two documents. If the more_info parameter is specified, no data is retrieved from the database and the internal dictionary is initialised with the passed value. If the more_info is not provided, the value is read from the database. In the case of non-existing record, an empty dictionary is assigned. If a version of whichever record is not specified, the resulting object desctibes a relation of all version of a given BibDoc. 
@param bibdoc1 @type bibdoc1 BibDoc @param bibdoc1_ver @type version1_ver int @param bibdoc2 @type bibdoc2 BibDco @param bibdoc2_ver @type bibdoc2_ver int @param bibdoc1_fmt format of the first document @type bibdoc1_fmt string @param bibdoc2_fmt format of the second document @type bibdoc2_fmt string @param rel_type @type rel_type string @param more_info The serialised representation of the more_info @type more_info string @param rel_id allows to specify the identifier of the newly created relation @type rel_ide unsigned int """ self.id = rel_id self.bibdoc1_id = bibdoc1_id self.bibdoc2_id = bibdoc2_id self.bibdoc1_ver = bibdoc1_ver self.bibdoc2_ver = bibdoc2_ver self.bibdoc1_fmt = bibdoc1_fmt self.bibdoc2_fmt = bibdoc2_fmt self.rel_type = rel_type if rel_id == None: self._fill_id_from_data() else: self._fill_data_from_id() self.more_info = MoreInfo(relation = self.id) def _fill_data_from_id(self): """Fill all the relation data from the relation identifier """ query = "SELECT id_bibdoc1, version1, format1, id_bibdoc2, version2, format2, rel_type FROM bibdoc_bibdoc WHERE id=%s" res = run_sql(query, (str(self.id), )) if res != None and res[0] != None: self.bibdoc1_id = res[0][0] self.bibdoc1_ver = res[0][1] self.bibdoc1_fmt = res[0][2] self.bibdoc2_id = res[0][3] self.bibdoc2_ver = res[0][4] self.bibdoc2_fmt = res[0][5] self.rel_type = res[0][6] def _fill_id_from_data(self): """Fill the relation identifier based on the data provided""" where_str, where_args = self._get_where_clauses() query = "SELECT id FROM bibdoc_bibdoc WHERE %s" % (where_str, ) res = run_sql(query, where_args) if res and res[0][0]: self.id = int(res[0][0]) def _get_value_column_mapping(self): """ Returns a list of tuples each tuple consists of a value and a name of a database column where this value should fit """ return [(self.rel_type, "rel_type"), (self.bibdoc1_id, "id_bibdoc1"), (self.bibdoc1_ver, "version1"), (self.bibdoc1_fmt, "format1"), (self.bibdoc2_id, "id_bibdoc2"), (self.bibdoc2_ver, "version2"), (self.bibdoc2_fmt, "format2")] def _get_where_clauses(self): """Private function returning part of the SQL statement identifying current relation @return @rtype tuple """ return _sql_generate_conjunctive_where(self._get_value_column_mapping()) @staticmethod def create(bibdoc1_id = None, bibdoc1_ver = None, bibdoc1_fmt = None, bibdoc2_id = None, bibdoc2_ver = None, bibdoc2_fmt = None, rel_type = ""): """ Create a relation and return instance. 
Ommiting an argument means that a particular relation concerns any value of the parameter """ # check if there is already entry corresponding to parameters existing = BibRelation.get_relations(rel_type = rel_type, bibdoc1_id = bibdoc1_id, bibdoc2_id = bibdoc2_id, bibdoc1_ver = bibdoc1_ver, bibdoc2_ver = bibdoc2_ver, bibdoc1_fmt = bibdoc1_fmt, bibdoc2_fmt = bibdoc2_fmt) if len(existing) > 0: return existing[0] # build the insert query and execute it to_process = [(rel_type, "rel_type"), (bibdoc1_id, "id_bibdoc1"), (bibdoc1_ver, "version1"), (bibdoc1_fmt, "format1"), (bibdoc2_id, "id_bibdoc2"), (bibdoc2_ver, "version2"), (bibdoc2_fmt, "format2")] values_list = [] args_list = [] columns_list = [] for entry in to_process: columns_list.append(entry[1]) if entry[0] == None: values_list.append("NULL") else: values_list.append("%s") args_list.append(entry[0]) query = "INSERT INTO bibdoc_bibdoc (%s) VALUES (%s)" % (", ".join(columns_list), ", ".join(values_list)) # print "Query: %s Args: %s" % (query, str(args_list)) rel_id = run_sql(query, args_list) return BibRelation(rel_id = rel_id) def delete(self): """ Removes a relation between objects from the database. executing the flush function on the same object will restore the relation """ where_str, where_args = self._get_where_clauses() run_sql("DELETE FROM bibdoc_bibdoc WHERE %s" % (where_str,), where_args) # kwalitee: disable=sql # removing associated MoreInfo self.more_info.delete() def get_more_info(self): return self.more_info @staticmethod def get_relations(rel_type = _bib_relation__any_value, bibdoc1_id = _bib_relation__any_value, bibdoc2_id = _bib_relation__any_value, bibdoc1_ver = _bib_relation__any_value, bibdoc2_ver = _bib_relation__any_value, bibdoc1_fmt = _bib_relation__any_value, bibdoc2_fmt = _bib_relation__any_value): """Retrieves list of relations satisfying condtions. If a parameter is specified, its value has to match exactly. 
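# Illustrative sketch, not part of the patch: linking two documents with BibRelation
# and querying the link back.  This needs a configured Invenio database; the ids and
# the 'is_extracted_from' relation type are made up for the example.
rel = BibRelation.create(bibdoc1_id=10, bibdoc2_id=11, rel_type="is_extracted_from")
rel["note"] = "figure extracted from the main PDF"   # stored in the relation's MoreInfo
found = BibRelation.get_relations(rel_type="is_extracted_from", bibdoc1_id=10)
print found[0].bibdoc2_id   # -> 11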
If a parameter is ommited, any of its values will be accepted""" to_process = [(rel_type, "rel_type"), (bibdoc1_id, "id_bibdoc1"), (bibdoc1_ver, "version1"), (bibdoc1_fmt, "format1"), (bibdoc2_id, "id_bibdoc2"), (bibdoc2_ver, "version2"), (bibdoc2_fmt, "format2")] where_str, where_args = _sql_generate_conjunctive_where( filter(lambda x: x[0] != _bib_relation__any_value, to_process)) if where_str: where_str = "WHERE " + where_str # in case of nonempty where, we need a where clause query_str = "SELECT id FROM bibdoc_bibdoc %s" % (where_str, ) # print "running query : %s with arguments %s on the object %s" % (query_str, str(where_args), repr(self)) try: res = run_sql(query_str, where_args) except: raise Exception(query_str + " " + str(where_args)) results = [] if res != None: for res_row in res: results.append(BibRelation(rel_id=res_row[0])) return results # Access to MoreInfo def set_data(self, category, key, value): """assign additional information to this relation""" self.more_info.set_data(category, key, value) def get_data(self, category, key): """read additional information assigned to this relation""" return self.more_info.get_data(category, key) #the dictionary interface allowing to set data bypassing the namespaces def __setitem__(self, key, value): self.more_info[key] = value def __getitem__(self, key): return self.more_info[key] def __contains__(self, key): return self.more_info.__contains__(key) def __repr__(self): return "BibRelation(id_bibdoc1 = %s, version1 = %s, format1 = %s, id_bibdoc2 = %s, version2 = %s, format2 = %s, rel_type = %s)" % \ (self.bibdoc1_id, self.bibdoc1_ver, self.bibdoc1_fmt, self.bibdoc2_id, self.bibdoc2_ver, self.bibdoc2_fmt, self.rel_type) def readfile(filename): """ Read a file. @param filename: the name of the file to be read. @type filename: string @return: the text contained in the file. @rtype: string @note: Returns empty string in case of any error. @note: this function is useful for quick implementation of websubmit functions. """ try: return open(filename).read() except Exception: return '' class HeadRequest(urllib2.Request): """ A request object to perform a HEAD request. """ def get_method(self): return 'HEAD' def read_cookie(cookiefile): """ Parses a cookie file and returns a string as needed for the urllib2 headers The file should respect the Netscape cookie specifications """ cookie_data = '' cfile = open(cookiefile, 'r') for line in cfile.readlines(): tokens = line.split('\t') if len(tokens) == 7: # we are on a cookie line cookie_data += '%s=%s; ' % (tokens[5], tokens[6].replace('\n', '')) cfile.close() return cookie_data def open_url(url, headers=None, head_request=False): """ Opens a URL. If headers are passed as argument, no check is performed and the URL will be opened. Otherwise checks if the URL is present in CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS and uses the headers specified in the config variable. @param url: the URL to open @type url: string @param headers: the headers to use @type headers: dictionary @param head_request: if True, perform a HEAD request, otherwise a POST request @type head_request: boolean @return: a file-like object as returned by urllib2.urlopen. """ headers_to_use = None if headers is None: for regex, headers in _CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS: if regex.match(url) is not None: headers_to_use = headers break if headers_to_use is None: # URL is not allowed. raise InvenioBibdocfileUnauthorizedURL, "%s is not an authorized " \ "external URL." 
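# Illustrative sketch, not part of the patch: open_url() above only fetches URLs that
# match an entry of _CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS, a list of
# (compiled regexp, headers dict) pairs, and sends the headers configured for the
# matching entry.  The entry and URL below are hypothetical.
import re
_allowed_example = [(re.compile(r'https?://export\.example\.org/.*'),
                     {'Accept': 'application/pdf'})]

def _headers_for_example(url):
    for regexp, headers in _allowed_example:
        if regexp.match(url) is not None:
            return headers
    raise ValueError('%s is not an authorized external URL.' % url)

print _headers_for_example('https://export.example.org/record/1.pdf')
# -> {'Accept': 'application/pdf'}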
% url else: headers_to_use = headers request_obj = head_request and HeadRequest or urllib2.Request request = request_obj(url) request.add_header('User-Agent', make_user_agent_string('bibdocfile')) for key, value in headers_to_use.items(): try: value = globals()[value['fnc']](**value['args']) except (KeyError, TypeError): pass request.add_header(key, value) return urllib2.urlopen(request) def update_modification_date_of_file(filepath, modification_date): """Update the modification time and date of the file with the modification_date @param filepath: the full path of the file that needs to be updated @type filepath: string @param modification_date: the new modification date and time @type modification_date: datetime.datetime object """ try: modif_date_in_seconds = time.mktime(modification_date.timetuple()) # try to get the time in seconds except (AttributeError, TypeError): modif_date_in_seconds = 0 if modif_date_in_seconds: statinfo = os.stat(filepath) # we need to keep the same access time os.utime(filepath, (statinfo.st_atime, modif_date_in_seconds)) #update the modification time diff --git a/modules/bibformat/lib/bibreformat.py b/modules/bibformat/lib/bibreformat.py index 9c712dcbd..9c51e122f 100644 --- a/modules/bibformat/lib/bibreformat.py +++ b/modules/bibformat/lib/bibreformat.py @@ -1,509 +1,509 @@ # -*- mode: python; coding: utf-8; -*- # # This file is part of Invenio. -# Copyright (C) 2007, 2008, 2010, 2011, 2012, 2014 CERN. +# Copyright (C) 2007, 2008, 2010, 2011, 2012, 2014, 2016 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
"""Call BibFormat engine and create HTML brief (and other) formats cache for bibliographic records.""" __revision__ = "$Id$" import os from datetime import datetime from invenio.dbquery import run_sql from invenio.intbitset import intbitset from invenio.search_engine import perform_request_search, search_pattern from invenio.bibrank_citation_searcher import get_cited_by from invenio.bibrank_citation_indexer import get_bibrankmethod_lastupdate from invenio.bibformat_dblayer import save_preformatted_record from invenio.shellutils import split_cli_ids_arg from invenio.bibfield import get_record from invenio.bibtask import task_init, \ write_message, \ task_set_option, \ task_get_option, \ task_update_progress, \ task_has_option, \ task_sleep_now_if_required from invenio.bibformat_engine import format_record_1st_pass def fetch_last_updated(fmt): select_sql = "SELECT last_updated FROM format WHERE code = %s" row = run_sql(select_sql, (fmt.lower(), )) # Fallback in case we receive None instead of a valid date last_date = row[0][0] or datetime(year=1900, month=1, day=1) return last_date def store_last_updated(fmt, iso_date): sql = "UPDATE format SET last_updated = %s " \ "WHERE code = %s AND (last_updated < %s or last_updated IS NULL)" run_sql(sql, (iso_date, fmt.lower(), iso_date)) ### run the bibreformat task bibsched scheduled ### def bibreformat_task(fmt, recids, without_fmt, process): """BibReformat main task. @param fmt: output format to use @param process: @param recids: a list of record IDs to reformat @return: None """ write_message("Processing format %s" % fmt) t1 = os.times()[4] start_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S") latest_bibrank_run = get_bibrankmethod_lastupdate('citation') def related_records(recids, recids_processed): if fmt == "HDREF" and recids: # HDREF represents the references tab # the tab needs to be recomputed not only when the record changes # but also when one of the citations changes sql = """SELECT id, modification_date FROM bibrec WHERE id in (%s)""" % ','.join(str(r) for r in recids) def check_date(mod_date): return mod_date.strftime( "%Y-%m-%d %H:%M:%S") < latest_bibrank_run rel_recids = intbitset([recid for recid, mod_date in run_sql(sql) if check_date(mod_date)]) for r in rel_recids: recids |= intbitset(get_cited_by(r)) # To not process recids twice recids -= recids_processed # Adds to the set of processed recids recids_processed += recids return recids def recid_chunker(recids): recids_processed = intbitset() chunk = intbitset() for recid in recids: if len(chunk) == 5000: for r in related_records(chunk, recids_processed): yield r recids_processed += chunk chunk = intbitset() if recid not in recids_processed: chunk.add(recid) if chunk: for r in related_records(chunk, recids_processed): yield r recIDs = list(recid_chunker(recids)) ### list of corresponding record IDs was retrieved ### now format the selected records if without_fmt: write_message("Records to be processed: %d" % len(recIDs)) write_message("Out of it records without existing cache: %d" % len(without_fmt)) else: write_message("Records to be processed: %d" % len(recIDs)) ### Initialize main loop total_rec = 0 # Total number of records tbibformat = 0 # time taken up by external call tbibupload = 0 # time taken up by external call ### Iterate over all records prepared in lists I (option) if process: total_rec_1, tbibformat_1, tbibupload_1 = iterate_over_new(recIDs, fmt) total_rec += total_rec_1 tbibformat += tbibformat_1 tbibupload += tbibupload_1 ### Store last run time if 
task_has_option("last"): write_message("storing run date to %s" % start_date) store_last_updated(fmt, start_date) ### Final statistics t2 = os.times()[4] elapsed = t2 - t1 message = "total records processed: %d" % total_rec write_message(message) message = "total processing time: %2f sec" % elapsed write_message(message) message = "Time spent on external call (os.system):" write_message(message) message = " bibformat: %2f sec" % tbibformat write_message(message) message = " bibupload: %2f sec" % tbibupload write_message(message) def check_validity_input_formats(input_formats): """Check the validity of every input format. @param input_formats: list of given formats @type input_formats: list @return: if there is any invalid input format it returns this value @rtype: string """ from invenio.search_engine import get_available_output_formats valid_formats = get_available_output_formats() # let's to extract the values of the available formats format_values = [] for aformat in valid_formats: format_values.append(aformat['value']) invalid_format = '' for aformat in input_formats: if aformat.lower() not in format_values: invalid_format = aformat.lower() break return invalid_format ### Bibreformat all selected records (using new python bibformat) ### (see iterate_over_old further down) def _update_recjson_format(recid, *args, **kwargs): """Update RECJSON cache. :param int recid: record id to process """ dummy = get_record(recid, reset_cache=True) def _update_format(recid, fmt): """Usual format update procedure, gets the formatted record and saves it. :param int recid: record id to process :param str fmt: format to update/create, i.e. 'HB' """ record, needs_2nd_pass = format_record_1st_pass(recID=recid, of=fmt, on_the_fly=True, save_missing=False) save_preformatted_record(recID=recid, of=fmt, res=record, needs_2nd_pass=needs_2nd_pass, low_priority=True) _CFG_BIBFORMAT_UPDATE_FORMAT_FUNCTIONS = {'recjson': _update_recjson_format} """Specific functions to be used for each format if needed. If not set `_update_format` will be used. """ def iterate_over_new(recIDs, fmt): """Iterate over list of IDs. @param list: the list of record IDs to format @param fmt: the output format to use @return: tuple (total number of records, time taken to format, time taken to insert) """ tbibformat = 0 # time taken up by external call tbibupload = 0 # time taken up by external call tot = len(recIDs) reformat_function = _CFG_BIBFORMAT_UPDATE_FORMAT_FUNCTIONS.get( fmt.lower(), _update_format) for count, recID in enumerate(recIDs): t1 = os.times()[4] reformat_function(recID, fmt) t2 = os.times()[4] tbibformat += t2 - t1 if count % 100 == 0: write_message(" ... formatted %s records out of %s" % (count, tot)) task_update_progress('Formatted %s out of %s' % (count, tot)) task_sleep_now_if_required(can_stop_too=True) if tot % 100 != 0: write_message(" ... 
formatted %s records out of %s" % (tot, tot)) return tot, tbibformat, tbibupload def all_records(): """Produce record IDs for all available records.""" return intbitset(run_sql("SELECT id FROM bibrec")) def outdated_caches(fmt, last_updated, chunk_size=5000): sql = """SELECT br.id FROM bibrec AS br INNER JOIN bibfmt AS bf ON bf.id_bibrec = br.id WHERE br.modification_date >= %s AND bf.format = %s AND bf.last_updated < br.modification_date AND br.id BETWEEN %s AND %s""" last_updated_str = last_updated.strftime('%Y-%m-%d %H:%M:%S') recids = intbitset() max_id = run_sql("SELECT max(id) FROM bibrec")[0][0] or 0 for start in xrange(1, max_id + 1, chunk_size): end = start + chunk_size recids += intbitset(run_sql(sql, (last_updated_str, fmt, start, end))) return recids def missing_caches(fmt, chunk_size=100000): """Produce record IDs to be formated, because their fmt cache is missing. @param fmt: format to query for @return: record IDs generator without pre-created format cache """ write_message("Querying database for records without cache...") all_recids = intbitset() max_id = run_sql("SELECT max(id) FROM bibrec")[0][0] or 0 for start in xrange(1, max_id + 1, chunk_size): end = start + chunk_size sql = "SELECT id FROM bibrec WHERE id BETWEEN %s AND %s" recids = intbitset(run_sql(sql, (start, end))) sql = """SELECT id_bibrec FROM bibfmt WHERE id_bibrec BETWEEN %s AND %s AND format = %s""" without_fmt = intbitset(run_sql(sql, (start, end, fmt))) all_recids += recids - without_fmt return all_recids def query_records(params): """Produce record IDs from given query parameters. By passing the appriopriate CLI options, we can query here for additional records. """ write_message("Querying database (records query)...") res = intbitset() if params['field'] or params['collection'] or params['pattern']: if not params['collection']: # use search_pattern() whenever possible, as it can search # even in private collections res = search_pattern(p=params['pattern'], f=params['field'], m=params['matching']) else: # use perform_request_search when '-c' argument has been # defined, as it is not supported by search_pattern() res = intbitset(perform_request_search(req=None, of='id', c=params['collection'], p=params['pattern'], f=params['field'])) return res def task_run_core(): """Run the task by fetching arguments from the BibSched task queue. This is what BibSched will be invoking via daemon call. 
""" fmts = task_get_option('format', 'HB,RECJSON') for fmt in fmts.split(','): last_updated = fetch_last_updated(fmt) write_message("last stored run date is %s" % last_updated) recids = intbitset() if task_has_option("all"): recids += all_records() if task_has_option("last"): recids += outdated_caches(fmt, last_updated) if task_has_option('ignore_without') or \ task_has_option('collection') or \ task_has_option('field') or \ task_has_option('pattern') or \ task_has_option('recids'): without_fmt = intbitset() else: without_fmt = missing_caches(fmt) recids += without_fmt cli_recids = split_cli_ids_arg(task_get_option('recids', '')) recids += cli_recids query_params = {'collection': task_get_option('collection', ''), 'field': task_get_option('field', ''), 'pattern': task_get_option('pattern', ''), 'matching': task_get_option('matching', '')} recids += query_records(query_params) bibreformat_task(fmt, recids, without_fmt, not task_has_option('noprocess')) return True def main(): """Main that construct all the bibtask.""" task_init(authorization_action='runbibformat', authorization_msg="BibReformat Task Submission", description=""" BibReformat formats the records and saves the produced outputs for later retrieval. BibReformat is usually run periodically via BibSched in order to (1) format new records in the database and to (2) reformat records for which the meta data has been modified. BibReformat has to be run manually when (3) format config files have been modified, in order to see the changes in the web interface. Although it is not necessary to run BibReformat to display formatted records in the web interface, BibReformat allows to improve serving speed by precreating the outputs. It is suggested to run BibReformat for 'HB' output. Option -m cannot be used at the same time as option -c. Option -c prevents from finding records in private collections. Examples: bibreformat Format all new or modified records (in HB and RECJSON). bibreformat -o HD Format all new or modified records in HD. bibreformat -o HD,HB Format all new or modified records in HD and HB. bibreformat -a Force reformatting all records (in HB). bibreformat -c 'Photos' Force reformatting all records in 'Photos' collection (in HB). bibreformat -c 'Photos' -o HD Force reformatting all records in 'Photos' collection in HD. bibreformat -i 15 Force reformatting record 15 (in HB). bibreformat -i 15:20 Force reformatting records 15 to 20 (in HB). bibreformat -i 15,16,17 Force reformatting records 15, 16 and 17 (in HB). bibreformat -n Show how many records are to be (re)formatted. bibreformat -n -c 'Articles' Show how many records are to be (re)formatted in 'Articles' collection. bibreformat -oHB -s1h Format all new and modified records every hour, in HB. 
""", help_specific_usage=""" -o, --formats \t Specify output format/s (default HB) -n, --noprocess \t Count records to be formatted (no processing done) Reformatting options: -a, --all \t Force reformatting all records -c, --collection \t Force reformatting records by collection -f, --field \t Force reformatting records by field -p, --pattern \t Force reformatting records by pattern -i, --id \t Force reformatting records by record id(s) --no-missing \t Ignore reformatting records without format Pattern options: -m, --matching \t Specify if pattern is exact (e), regular expression (r), \t partial (p), any of the words (o) or all of the words (a) """, version=__revision__, specific_params=("ac:f:p:lo:nm:i:", ["all", "collection=", "matching=", "field=", "pattern=", "format=", "noprocess", "id=", "no-missing"]), task_submit_check_options_fnc=task_submit_check_options, task_submit_elaborate_specific_parameter_fnc= task_submit_elaborate_specific_parameter, task_run_fnc=task_run_core) def task_submit_check_options(): """Last checks and updating on the options...""" if not (task_has_option('all') or task_has_option('collection') or task_has_option('field') or task_has_option('pattern') or task_has_option('matching') or task_has_option('recids')): task_set_option('last', 1) return True def task_submit_elaborate_specific_parameter(key, value, opts, args): # pylint: disable-msg=W0613 """ Elaborate specific CLI parameters of BibReformat. @param key: a parameter key to check @param value: a value associated to parameter X{Key} @return: True for known X{Key} else False. """ if key in ("-a", "--all"): task_set_option("all", 1) elif key in ("--no-missing", ): task_set_option("ignore_without", 1) elif key in ("-c", "--collection"): task_set_option("collection", value) elif key in ("-n", "--noprocess"): task_set_option("noprocess", 1) elif key in ("-f", "--field"): task_set_option("field", value) elif key in ("-p", "--pattern"): task_set_option("pattern", value) elif key in ("-m", "--matching"): task_set_option("matching", value) elif key in ("-o", "--format"): input_formats = value.split(',') # check the validity of the given output formats invalid_format = check_validity_input_formats(input_formats) if invalid_format: try: raise Exception('Invalid output format.') except Exception: # pylint: disable-msg=W0703 from invenio.errorlib import register_exception register_exception( prefix="The given output format '%s' is not available or " "is invalid. Please try again" % (invalid_format, ), alert_admin=True) return else: # every given format is available task_set_option("format", value) elif key in ("-i", "--id"): task_set_option("recids", value) else: return False return True ### okay, here we go: if __name__ == '__main__': main() diff --git a/modules/bibindex/lib/bibindex_engine.py b/modules/bibindex/lib/bibindex_engine.py index 7d7333af7..f4b2e2adc 100644 --- a/modules/bibindex/lib/bibindex_engine.py +++ b/modules/bibindex/lib/bibindex_engine.py @@ -1,2314 +1,2314 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. # Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, -# 2010, 2011, 2012, 2013, 2014, 2015 CERN. +# 2010, 2011, 2012, 2013, 2014, 2015, 2016 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. 
# # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ BibIndex indexing engine implementation. See bibindex executable for entry point. """ __revision__ = "$Id$" import re import sys import time import fnmatch import inspect from datetime import datetime from invenio.config import CFG_SOLR_URL from invenio.bibindex_engine_config import CFG_MAX_MYSQL_THREADS, \ CFG_MYSQL_THREAD_TIMEOUT, \ CFG_CHECK_MYSQL_THREADS, \ CFG_BIBINDEX_INDEX_TABLE_TYPE, \ CFG_BIBINDEX_ADDING_RECORDS_STARTED_STR, \ CFG_BIBINDEX_UPDATE_MESSAGE, \ CFG_BIBINDEX_UPDATE_MODE, \ CFG_BIBINDEX_TOKENIZER_TYPE, \ CFG_BIBINDEX_WASH_INDEX_TERMS, \ CFG_BIBINDEX_SPECIAL_TAGS from invenio.bibauthority_config import \ CFG_BIBAUTHORITY_CONTROLLED_FIELDS_BIBLIOGRAPHIC from invenio.bibauthority_engine import \ get_control_nos_from_recID from invenio.bibauthorid_dbinterface import get_author_canonical_ids_for_recid from invenio.search_engine import perform_request_search, \ get_index_stemming_language, \ get_synonym_terms, \ search_pattern, \ search_unit_in_bibrec from invenio.dbquery import run_sql, DatabaseError, serialize_via_marshal, \ deserialize_via_marshal, wash_table_column_name from invenio.bibindex_engine_washer import wash_index_term from invenio.bibtask import task_init, write_message, get_datetime, \ task_set_option, task_get_option, task_get_task_param, \ task_update_progress, task_sleep_now_if_required from invenio.intbitset import intbitset from invenio.errorlib import register_exception from invenio.solrutils_bibindex_indexer import solr_commit from invenio.bibindex_tokenizers.BibIndexJournalTokenizer import \ CFG_JOURNAL_TAG, \ CFG_JOURNAL_PUBINFO_STANDARD_FORM, \ CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK from invenio.bibindex_termcollectors import TermCollector from invenio.bibindex_engine_utils import load_tokenizers, \ get_all_index_names_and_column_values, \ get_index_tags, \ get_field_tags, \ get_marc_tag_indexes, \ get_nonmarc_tag_indexes, \ get_all_indexes, \ get_index_virtual_indexes, \ get_virtual_index_building_blocks, \ get_index_id_from_index_name, \ run_sql_drop_silently, \ get_min_last_updated, \ remove_inexistent_indexes, \ get_all_synonym_knowledge_bases, \ get_index_remove_stopwords, \ get_index_remove_html_markup, \ get_index_remove_latex_markup, \ filter_for_virtual_indexes, \ get_records_range_for_index, \ make_prefix, \ list_union, \ recognize_marc_tag from invenio.bibindex_termcollectors import \ TermCollector, \ NonmarcTermCollector from invenio.memoiseutils import Memoise if sys.hexversion < 0x2040000: # pylint: disable=W0622 from sets import Set as set # pylint: enable=W0622 # precompile some often-used regexp for speed reasons: re_subfields = re.compile('\$\$\w') re_datetime_shift = re.compile("([-\+]{0,1})([\d]+)([dhms])") re_prefix = re.compile('__[a-zA-Z1-9]*__') nb_char_in_line = 50 # for verbose pretty printing chunksize = 1000 # default size of chunks that the records will be treated by base_process_size = 4500 # process base size _last_word_table = None _TOKENIZERS = load_tokenizers() def list_unique(_list): """Returns a _list with duplicates removed.""" _dict = {} for e in _list: _dict[e] = 1 
return _dict.keys() # safety function for killing slow DB threads: def kill_sleepy_mysql_threads(max_threads=CFG_MAX_MYSQL_THREADS, thread_timeout=CFG_MYSQL_THREAD_TIMEOUT): """Check the number of DB threads and if there are more than MAX_THREADS of them, lill all threads that are in a sleeping state for more than THREAD_TIMEOUT seconds. (This is useful for working around the the max_connection problem that appears during indexation in some not-yet-understood cases.) If some threads are to be killed, write info into the log file. """ res = run_sql("SHOW FULL PROCESSLIST") if len(res) > max_threads: for row in res: r_id, dummy, dummy, dummy, r_command, r_time, dummy, dummy = row if r_command == "Sleep" and int(r_time) > thread_timeout: run_sql("KILL %s", (r_id, )) write_message("WARNING: too many DB threads, " + \ "killing thread %s" % r_id, verbose=1) return def get_associated_subfield_value(recID, tag, value, associated_subfield_code): """Return list of ASSOCIATED_SUBFIELD_CODE, if exists, for record RECID and TAG of value VALUE. Used by fulltext indexer only. Note: TAG must be 6 characters long (tag+ind1+ind2+sfcode), otherwise en empty string is returned. FIXME: what if many tag values have the same value but different associated_subfield_code? Better use bibrecord library for this. """ out = "" if len(tag) != 6: return out bibXXx = "bib" + tag[0] + tag[1] + "x" bibrec_bibXXx = "bibrec_" + bibXXx query = """SELECT bb.field_number, b.tag, b.value FROM %s AS b, %s AS bb WHERE bb.id_bibrec=%%s AND bb.id_bibxxx=b.id AND tag LIKE %%s%%""" % (bibXXx, bibrec_bibXXx) res = run_sql(query, (recID, tag[:-1])) field_number = -1 for row in res: if row[1] == tag and row[2] == value: field_number = row[0] if field_number > 0: for row in res: if row[0] == field_number and row[1] == tag[:-1] + associated_subfield_code: out = row[2] break return out def swap_temporary_reindex_tables(index_id, reindex_prefix="tmp_"): """Atomically swap reindexed temporary table with the original one. 
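# Illustrative sketch, not part of the patch: the reindexing helpers here create
# tmp_-prefixed copies of the idxWORD/idxPAIR/idxPHRASE tables, let the indexer fill
# them, and finally RENAME them over the production tables while dropping the old
# ones.  index_id 7 is made up; this needs the Invenio database.
index_id = 7
init_temporary_reindex_tables(index_id, reindex_prefix="tmp_")
# ... reindex all records into the tmp_idxWORD07*/tmp_idxPAIR07*/tmp_idxPHRASE07* tables here ...
swap_temporary_reindex_tables(index_id, reindex_prefix="tmp_")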
Delete the now-old one.""" write_message("Putting new tmp index tables " + \ "for id %s into production" % index_id) run_sql( "RENAME TABLE " + "idxWORD%02dR TO old_idxWORD%02dR," % (index_id, index_id) + "%sidxWORD%02dR TO idxWORD%02dR," % (reindex_prefix, index_id, index_id) + "idxWORD%02dF TO old_idxWORD%02dF," % (index_id, index_id) + "%sidxWORD%02dF TO idxWORD%02dF," % (reindex_prefix, index_id, index_id) + "idxPAIR%02dR TO old_idxPAIR%02dR," % (index_id, index_id) + "%sidxPAIR%02dR TO idxPAIR%02dR," % (reindex_prefix, index_id, index_id) + "idxPAIR%02dF TO old_idxPAIR%02dF," % (index_id, index_id) + "%sidxPAIR%02dF TO idxPAIR%02dF," % (reindex_prefix, index_id, index_id) + "idxPHRASE%02dR TO old_idxPHRASE%02dR," % (index_id, index_id) + "%sidxPHRASE%02dR TO idxPHRASE%02dR," % (reindex_prefix, index_id, index_id) + "idxPHRASE%02dF TO old_idxPHRASE%02dF," % (index_id, index_id) + "%sidxPHRASE%02dF TO idxPHRASE%02dF;" % (reindex_prefix, index_id, index_id) ) write_message("Dropping old index tables for id %s" % index_id) run_sql_drop_silently("""DROP TABLE old_idxWORD%02dR, old_idxWORD%02dF, old_idxPAIR%02dR, old_idxPAIR%02dF, old_idxPHRASE%02dR, old_idxPHRASE%02dF""" % ((index_id, )* 6) ) # kwalitee: disable=sql def init_temporary_reindex_tables(index_id, reindex_prefix="tmp_"): """Create reindexing temporary tables.""" write_message("Creating new tmp index tables for id %s" % index_id) query = """DROP TABLE IF EXISTS %sidxWORD%02dF""" % \ (wash_table_column_name(reindex_prefix), index_id) run_sql_drop_silently(query) # kwalitee: disable=sql run_sql("""CREATE TABLE %sidxWORD%02dF ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(50) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM""" % (reindex_prefix, index_id)) query = """DROP TABLE IF EXISTS %sidxWORD%02dR""" % \ (wash_table_column_name(reindex_prefix), index_id) run_sql_drop_silently(query) # kwalitee: disable=sql run_sql("""CREATE TABLE %sidxWORD%02dR ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM""" % (reindex_prefix, index_id)) query = """DROP TABLE IF EXISTS %sidxPAIR%02dF""" % \ (wash_table_column_name(reindex_prefix), index_id) run_sql_drop_silently(query) # kwalitee: disable=sql run_sql("""CREATE TABLE %sidxPAIR%02dF ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(100) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM""" % (reindex_prefix, index_id)) query = """DROP TABLE IF EXISTS %sidxPAIR%02dR""" % \ (wash_table_column_name(reindex_prefix), index_id) run_sql_drop_silently(query) # kwalitee: disable=sql run_sql("""CREATE TABLE %sidxPAIR%02dR ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM""" % (reindex_prefix, index_id)) query = """DROP TABLE IF EXISTS %sidxPHRASE%02dF""" % \ (wash_table_column_name(reindex_prefix), index_id) run_sql_drop_silently(query) # kwalitee: disable=sql run_sql("""CREATE TABLE %sidxPHRASE%02dF ( id mediumint(9) unsigned NOT NULL auto_increment, term text default NULL, hitlist longblob, PRIMARY KEY (id), KEY term (term(50)) ) ENGINE=MyISAM""" % (reindex_prefix, index_id)) query = """DROP TABLE IF EXISTS %sidxPHRASE%02dR""" % \ (wash_table_column_name(reindex_prefix), index_id) run_sql_drop_silently(query) # kwalitee: disable=sql 
    run_sql("""CREATE TABLE %sidxPHRASE%02dR (
                        id_bibrec mediumint(9) unsigned NOT NULL default '0',
                        termlist longblob,
                        type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT',
                        PRIMARY KEY (id_bibrec,type)
                        ) ENGINE=MyISAM""" % (reindex_prefix, index_id))


def remove_subfields(s):
    "Removes subfields from string, e.g. 'foo $$c bar' becomes 'foo bar'."
    return re_subfields.sub(' ', s)


def get_field_indexes(field):
    """Returns index names and ids corresponding to the given field"""
    if recognize_marc_tag(field):
        # field is actually a tag
        return get_marc_tag_indexes(field, virtual=False)
    else:
        return get_nonmarc_tag_indexes(field, virtual=False)

get_field_indexes_memoised = Memoise(get_field_indexes)


def get_index_tokenizer(index_id):
    """Returns value of a tokenizer field from idxINDEX database table
    @param index_id: id of the index
    """
    query = "SELECT tokenizer FROM idxINDEX WHERE id=%s" % index_id
    out = None
    try:
        res = run_sql(query)
        if res:
            out = _TOKENIZERS[res[0][0]]
    except DatabaseError:
        write_message(("Exception caught for SQL statement: %s; " + \
                       "column tokenizer might not exist") % query, sys.stderr)
    except KeyError:
        write_message("Exception caught: there is no such tokenizer")
        out = None
    return out


def detect_tokenizer_type(tokenizer):
    """
    Checks what is the main type of the tokenizer.
    For more information on tokenizer types take a look
    at BibIndexTokenizer class.

    @param tokenizer: instance of a tokenizer
    """
    from invenio.bibindex_tokenizers.BibIndexStringTokenizer import BibIndexStringTokenizer
    from invenio.bibindex_tokenizers.BibIndexRecJsonTokenizer import BibIndexRecJsonTokenizer
    from invenio.bibindex_tokenizers.BibIndexMultiFieldTokenizer import BibIndexMultiFieldTokenizer

    tokenizer_inheritance_tree = inspect.getmro(tokenizer.__class__)
    if BibIndexStringTokenizer in tokenizer_inheritance_tree:
        return CFG_BIBINDEX_TOKENIZER_TYPE['string']
    if BibIndexMultiFieldTokenizer in tokenizer_inheritance_tree:
        return CFG_BIBINDEX_TOKENIZER_TYPE['multifield']
    if BibIndexRecJsonTokenizer in tokenizer_inheritance_tree:
        return CFG_BIBINDEX_TOKENIZER_TYPE['recjson']
    return CFG_BIBINDEX_TOKENIZER_TYPE['unknown']


def get_last_updated_all_indexes():
    """Returns last modification date for all defined indexes"""
    query = """SELECT name, last_updated FROM idxINDEX"""
    res = run_sql(query)
    return res


def split_ranges(parse_string):
    """Parse a string and return the list of ranges."""
    recIDs = []
    ranges = parse_string.split(",")
    for arange in ranges:
        tmp_recIDs = arange.split("-")
        if len(tmp_recIDs) == 1:
            recIDs.append([int(tmp_recIDs[0]), int(tmp_recIDs[0])])
        else:
            if int(tmp_recIDs[0]) > int(tmp_recIDs[1]): # sanity check
                tmp = tmp_recIDs[0]
                tmp_recIDs[0] = tmp_recIDs[1]
                tmp_recIDs[1] = tmp
            recIDs.append([int(tmp_recIDs[0]), int(tmp_recIDs[1])])
    return recIDs


def get_word_tables(tables):
    """
    Given a list of table names it returns a list of tuples
    (index_id, index_name, index_tags).
    """
    wordTables = []
    if tables:
        for index in tables:
            index_id = get_index_id_from_index_name(index)
            if index_id:
                wordTables.append((index_id, index, get_index_tags(index)))
            else:
                write_message("Error: There is no %s words table."
% \ index, sys.stderr) return wordTables def get_date_range(var): "Returns the two dates contained as a low,high tuple" limits = var.split(",") if len(limits) == 1: low = get_datetime(limits[0]) return low, None if len(limits) == 2: low = get_datetime(limits[0]) high = get_datetime(limits[1]) return low, high return None, None def create_range_list(res): """Creates a range list from a recID select query result contained in res. The result is expected to have ascending numerical order.""" if not res: return [] row = res[0] if not row: return [] else: range_list = [[row, row]] for row in res[1:]: row_id = row if row_id == range_list[-1][1] + 1: range_list[-1][1] = row_id else: range_list.append([row_id, row_id]) return range_list def beautify_range_list(range_list): """Returns a non overlapping, maximal range list""" ret_list = [] for new in range_list: found = 0 for old in ret_list: if new[0] <= old[0] <= new[1] + 1 or new[0] - 1 <= old[1] <= new[1]: old[0] = min(old[0], new[0]) old[1] = max(old[1], new[1]) found = 1 break if not found: ret_list.append(new) return ret_list def truncate_index_table(index_name): """Properly truncate the given index.""" index_id = get_index_id_from_index_name(index_name) if index_id: write_message('Truncating %s index table in order to reindex.' % \ index_name, verbose=2) run_sql("""UPDATE idxINDEX SET last_updated='0000-00-00 00:00:00' WHERE id=%s""", (index_id, )) run_sql("TRUNCATE idxWORD%02dF" % index_id) # kwalitee: disable=sql run_sql("TRUNCATE idxWORD%02dR" % index_id) # kwalitee: disable=sql run_sql("TRUNCATE idxPHRASE%02dF" % index_id) # kwalitee: disable=sql run_sql("TRUNCATE idxPHRASE%02dR" % index_id) # kwalitee: disable=sql def update_index_last_updated(indexes, starting_time=None): """Update last_updated column of the index table in the database. Puts starting time there so that if the task was interrupted for record download, the records will be reindexed next time. @param indexes: list of indexes names """ if starting_time is None: return None for index_name in indexes: write_message("updating last_updated to %s...for %s index" % \ (starting_time, index_name), verbose=9) run_sql("UPDATE idxINDEX SET last_updated=%s WHERE name=%s", (starting_time, index_name)) def get_percentage_completed(num_done, num_total): """ Return a string containing the approx. percentage completed """ percentage_remaining = 100.0 * float(num_done) / float(num_total) if percentage_remaining: percentage_display = "(%.1f%%)" % (percentage_remaining, ) else: percentage_display = "" return percentage_display def _fill_dict_of_indexes_with_empty_sets(): """find_affected_records internal function. Creates dict: {'index_name1':set([]), ...} """ index_dict = {} tmp_all_indexes = get_all_indexes(virtual=False) for index in tmp_all_indexes: index_dict[index] = set([]) return index_dict def find_affected_records_for_index(indexes=None, recIDs=None, force_all_indexes=False): """ Function checks which records need to be changed/reindexed for given index/indexes. Makes use of hstRECORD table where different revisions of record are kept. If parameter force_all_indexes is set function will assign all recIDs to all indexes. @param indexes: names of indexes for reindexation separated by coma @param recIDs: recIDs for reindexation in form: [[range1_down, range1_up],[range2_down, range2_up]..] @param force_all_indexes: should we index all indexes? 
""" if indexes is None: indexes = [] if recIDs is None: recIDs = [] tmp_dates = dict(get_last_updated_all_indexes()) modification_dates = dict([(date, tmp_dates[date] or datetime(1000, 1, 1, 1, 1, 1)) for date in tmp_dates]) tmp_all_indexes = get_all_indexes(virtual=False) indexes = remove_inexistent_indexes(indexes, leave_virtual=False) if not indexes: return {} def _should_reindex_for_revision(index_name, revision_date): try: if modification_dates[index_name] < revision_date and \ index_name in indexes: return True return False except KeyError: return False if force_all_indexes: records_for_indexes = {} all_recIDs = [] for recIDs_range in recIDs: all_recIDs.extend(range(recIDs_range[0], recIDs_range[1]+1)) for index in indexes: records_for_indexes[index] = all_recIDs return records_for_indexes min_last_updated = get_min_last_updated(indexes)[0][0] or \ datetime(1000, 1, 1, 1, 1, 1) recIDs_info = [] for recIDs_range in recIDs: # firstly, determine which records were updated since min_last_updated: query = """SELECT id_bibrec,job_date,affected_fields FROM hstRECORD WHERE id_bibrec BETWEEN %s AND %s AND job_date > '%s'""" % \ (recIDs_range[0], recIDs_range[1], min_last_updated) res = run_sql(query) if res: recIDs_info.extend(res) # secondly, there may be newly inserted records which were # uploaded with old timestamp (via 005), so let us detect # those too, using their "real" modification_date: res = run_sql("""SELECT bibrec.id,modification_date,'' FROM bibrec, hstRECORD WHERE modification_date>%s AND bibrec.id=id_bibrec AND (SELECT COUNT(*) FROM hstRECORD WHERE id_bibrec=bibrec.id)=1""", (min_last_updated,)) if res: recIDs_info.extend(res) indexes_to_change = _fill_dict_of_indexes_with_empty_sets() for recID_info in recIDs_info: recID, revision, affected_fields = recID_info affected_fields = affected_fields.split(",") indexes_for_recID = set() for field in affected_fields: if field: field_indexes = get_field_indexes_memoised(field) or [] indexes_names = set([idx[1] for idx in field_indexes]) indexes_for_recID |= indexes_names else: # record was inserted, all fields were changed, # no specific affected fields indexes_for_recID |= set(tmp_all_indexes) indexes_for_recID_filtered = [ind for ind in indexes_for_recID if _should_reindex_for_revision(ind, revision)] for index in indexes_for_recID_filtered: indexes_to_change[index].add(recID) indexes_to_change = dict((k, list(sorted(v))) for k, v in indexes_to_change.iteritems() if v) return indexes_to_change def chunk_generator(rng): """ Splits one range into several smaller ones with respect to global chunksize variable. @param rng: range of records @type rng: list in the form: [1, 2000] """ global chunksize current_low = rng[0] current_high = rng[0] if rng[0] == None or rng[1] == None: raise StopIteration if rng[1] - rng[0] + 1 <= chunksize: yield rng else: while current_high - 1 < rng[1]: current_high += chunksize yield current_low, min(current_high - 1, rng[1]) current_low += chunksize class AbstractIndexTable(object): """ This class represents an index table in database. An index consists of three different kinds of tables: table which stores only words in db, table which stores pairs of words and table which stores whole phrases. The class represents only one table. Another instance of the class must be created in order to store different type of terms. This class is an abstract class. It contains methods to connect to db and methods which facilitate inserting/modifing/removing terms from it. 
The class also contains methods which help managing the memory. All specific methods for indexing can be found in corresponding classes for virtual and regular indexes. """ def __init__(self, index_name, table_type, table_prefix="", wash_index_terms=50): self.index_name = index_name self.index_id = get_index_id_from_index_name(index_name) self.table_type = table_type self.wash_index_terms = wash_index_terms self.table_name = wash_table_column_name(table_prefix + \ "idx" + \ table_type + \ ("%02d" % self.index_id) + "F") self.table_prefix = table_prefix self.value = {} # cache self.recIDs_in_mem = [] def put_into_db(self, mode="normal"): """Updates the current words table in the corresponding DB idxFOO table. Mode 'normal' means normal execution, mode 'emergency' means words index reverting to old state. """ write_message("%s %s wordtable flush started" % \ (self.table_name, mode)) write_message('...updating %d words into %s started' % \ (len(self.value), self.table_name)) task_update_progress("(%s:%s) flushed %d/%d words" % \ (self.table_name, self.index_name, 0, len(self.value))) self.recIDs_in_mem = beautify_range_list(self.recIDs_in_mem) tab_name = self.table_name[:-1] + "R" if mode == "normal": for group in self.recIDs_in_mem: query = """UPDATE %s SET type='TEMPORARY' WHERE id_bibrec BETWEEN %%s AND %%s AND type='CURRENT'""" % tab_name write_message(query % (group[0], group[1]), verbose=9) run_sql(query, (group[0], group[1])) nb_words_total = len(self.value) nb_words_report = int(nb_words_total / 10.0) nb_words_done = 0 for word in self.value.keys(): self.put_word_into_db(word) nb_words_done += 1 if nb_words_report != 0 and ((nb_words_done % nb_words_report) == 0): write_message('......processed %d/%d words' % \ (nb_words_done, nb_words_total)) percentage_display = get_percentage_completed(nb_words_done, nb_words_total) task_update_progress("(%s:%s) flushed %d/%d words %s" % \ (tab_name, self.index_name, nb_words_done, nb_words_total, percentage_display)) write_message('...updating %d words into %s ended' % \ (nb_words_total, tab_name)) write_message('...updating reverse table %s started' % tab_name) if mode == "normal": for group in self.recIDs_in_mem: query = """UPDATE %s SET type='CURRENT' WHERE id_bibrec BETWEEN %%s AND %%s AND type='FUTURE'""" % tab_name write_message(query % (group[0], group[1]), verbose=9) run_sql(query, (group[0], group[1])) query = """DELETE FROM %s WHERE id_bibrec BETWEEN %%s AND %%s AND type='TEMPORARY'""" % tab_name write_message(query % (group[0], group[1]), verbose=9) run_sql(query, (group[0], group[1])) write_message('End of updating wordTable into %s' % \ tab_name, verbose=9) elif mode == "emergency": for group in self.recIDs_in_mem: query = """UPDATE %s SET type='CURRENT' WHERE id_bibrec BETWEEN %%s AND %%s AND type='TEMPORARY'""" % tab_name write_message(query % (group[0], group[1]), verbose=9) run_sql(query, (group[0], group[1])) query = """DELETE FROM %s WHERE id_bibrec BETWEEN %%s AND %%s AND type='FUTURE'""" % tab_name write_message(query % (group[0], group[1]), verbose=9) run_sql(query, (group[0], group[1])) write_message('End of emergency flushing wordTable into %s' % \ tab_name, verbose=9) write_message('...updating reverse table %s ended' % tab_name) self.clean() self.recIDs_in_mem = [] write_message("%s %s wordtable flush ended" % \ (self.table_name, mode)) task_update_progress("(%s:%s) flush ended" % \ (self.table_name, self.index_name)) def put_word_into_db(self, word): """Flush a single word to the database and delete it from memory""" set 
= self.load_old_recIDs(word) if set is not None: # merge the word recIDs found in memory: hitlist_was_changed = self.merge_with_old_recIDs(word, set) if not hitlist_was_changed: # nothing to update: write_message("......... unchanged hitlist for ``%s''" % \ word, verbose=9) else: # yes there were some new words: write_message("......... updating hitlist for ``%s''" % \ word, verbose=9) - run_sql("UPDATE %s SET hitlist=%%s WHERE term=%%s" % wash_table_column_name(self.table_name), (set.fastdump(), word)) # kwalitee: disable=sql + run_sql("UPDATE %s SET hitlist=_binary %%s WHERE term=%%s" % wash_table_column_name(self.table_name), (set.fastdump(), word)) # kwalitee: disable=sql else: # the word is new, will create new set: write_message("......... inserting hitlist for ``%s''" % \ word, verbose=9) set = intbitset(self.value[word].keys()) try: - run_sql("INSERT INTO %s (term, hitlist) VALUES (%%s, %%s)" % wash_table_column_name(self.table_name), (word, set.fastdump())) # kwalitee: disable=sql + run_sql("INSERT INTO %s (term, hitlist) VALUES (%%s, _binary %%s)" % wash_table_column_name(self.table_name), (word, set.fastdump())) # kwalitee: disable=sql except Exception, e: ## We send this exception to the admin only when is not ## already reparing the problem. register_exception(prefix="Error when putting the term '%s' into db (hitlist=%s): %s\n" % (repr(word), set, e), alert_admin=(task_get_option('cmd') != 'repair')) if not set: # never store empty words run_sql("DELETE FROM %s WHERE term=%%s" % wash_table_column_name(self.table_name), (word,)) # kwalitee: disable=sql def put(self, recID, word, sign): """Keeps track of changes done during indexing and stores these changes in memory for further use. Indexing process needs this information later while filling in the database. @param recID: recID of the record we want to update in memory @param word: word we want to update @param sing: sign of the word, 1 means keep this word in database, -1 remove word from database """ value = self.value try: if self.wash_index_terms: word = wash_index_term(word, self.wash_index_terms) if value.has_key(word): # the word 'word' exist already: update sign value[word][recID] = sign else: value[word] = {recID: sign} except Exception as e: write_message("Error: Cannot put word %s with sign %d for recID %s." % \ (word, sign, recID)) def load_old_recIDs(self, word): """Load existing hitlist for the word from the database index files.""" query = "SELECT hitlist FROM %s WHERE term=%%s" % self.table_name res = run_sql(query, (word, )) if res: return intbitset(res[0][0]) else: return None def merge_with_old_recIDs(self, word, set): """Merge the system numbers stored in memory (hash of recIDs with value +1 or -1 according to whether to add/delete them) with those stored in the database index and received in set universe of recIDs for the given word. Return False in case no change was done to SET, return True in case SET was changed. """ oldset = intbitset(set) set.update_with_signs(self.value[word]) return set != oldset def clean(self): "Cleans the cache." self.value = {} class VirtualIndexTable(AbstractIndexTable): """ There are two types of indexes: virtual and regular/normal. Check WordTable class for more on normal indexes. This class represents a single index table for virtual index (see also: AbstractIndexTable). Virtual index doesn't store its own terms, it accumulates terms from other indexes. Good example of virtual index is the global index which stores terms from title, abstract, keyword, author and so on. 
This class contains methods for indexing virtual indexes. See also: run_update() """ def __init__(self, index_name, table_type, table_prefix="", wash_index_terms=50): """ Creates VirtualIndexTable instance. @param index_name: name of the index we want to reindex @param table_type: words, pairs or phrases @param table_prefix: add "tmp_" if you want to reindex to temporary table """ AbstractIndexTable.__init__(self, index_name, table_type, table_prefix, wash_index_terms) self.mode = "normal" self.dependent_indexes = dict(get_virtual_index_building_blocks(self.index_id)) def set_reindex_mode(self): """ Sets reindex mode. VirtualIndexTable will remove all its content from database and use insert_index function to repopulate it. """ self.mode = "reindex" def run_update(self, flush=10000): """ Function starts all updating processes for virtual index. It will take all information about pending changes from database from queue tables (idxWORD/PAIR/PHRASExxQ), process them and trigger appropriate indexing functions. @param flush: how many records we will put in one go into database (at most); see also: opt_flush in WordTable class """ global chunksize if self.mode == "reindex": self.clean_database() for index_id, index_name in self.dependent_indexes.iteritems(): rng = get_records_range_for_index(index_id) flush_count = 0 if not rng: continue write_message('Virtual index: %s is being reindexed for %s index' % \ (self.index_name, index_name)) chunks = chunk_generator(rng) try: while True: task_sleep_now_if_required() chunk = chunks.next() self.insert_index(index_id, chunk[0], chunk[1]) flush_count = flush_count + chunk[1] - chunk[0] + 1 self.recIDs_in_mem.append(list(chunk)) if flush_count >= flush: flush_count = 0 self.put_into_db() except StopIteration: if flush_count > 0: self.put_into_db() self.clean_queue_table(index_name) else: for index_id, index_name in self.dependent_indexes.iteritems(): query = """SELECT id_bibrec_low, id_bibrec_high, mode FROM %s WHERE index_name=%%s ORDER BY runtime ASC""" % \ (self.table_name[:-1] + "Q") entries = self.remove_duplicates(run_sql(query, (index_name, ))) if entries: write_message('Virtual index: %s is being updated for %s index' % \ (self.index_name, index_name)) for entry in entries: operation = None recID_low, recID_high, mode = entry if mode == CFG_BIBINDEX_UPDATE_MODE["Update"]: operation = self.update_index elif mode == CFG_BIBINDEX_UPDATE_MODE["Remove"]: operation = self.remove_index elif mode == CFG_BIBINDEX_UPDATE_MODE["Insert"]: operation = self.insert_index flush_count = 0 chunks = chunk_generator([recID_low, recID_high]) try: while True: task_sleep_now_if_required() chunk = chunks.next() operation(index_id, chunk[0], chunk[1]) flush_count = flush_count + chunk[1] - chunk[0] + 1 self.recIDs_in_mem.append(list(chunk)) if flush_count >= flush: flush_count = 0 self.put_into_db() except StopIteration: if flush_count > 0: self.put_into_db() self.clean_queue_table(index_name) def retrieve_new_values_from_index(self, index_id, records_range): """ Retrieves new values from dependent index for specific range of records. 
@param index_id: id of the dependent index @param records_range: the smallest and the biggest id in the range: [id_low, id_high] """ tab_name = "idx" + self.table_type + ("%02d" % index_id) + "R" query = """SELECT id_bibrec, termlist FROM %s WHERE id_bibrec BETWEEN %%s AND %%s""" % tab_name new_regular_values = run_sql(query, (records_range[0], records_range[1])) if new_regular_values: zipped = zip(*new_regular_values) new_regular_values = dict(zip(zipped[0], map(deserialize_via_marshal, zipped[1]))) else: new_regular_values = dict() return new_regular_values def retrieve_old_values(self, records_range): """ Retrieves old values from database for this virtual index for specific records range. @param records_range: the smallest and the biggest id in the range: [id_low, id_high] """ virtual_tab_name = self.table_name[:-1] + "R" query = """SELECT id_bibrec, termlist FROM %s WHERE type='CURRENT' AND id_bibrec BETWEEN %%s AND %%s""" % virtual_tab_name old_virtual_values = run_sql(query, (records_range[0], records_range[1])) if old_virtual_values: zipped = zip(*old_virtual_values) old_virtual_values = dict(zip(zipped[0], map(deserialize_via_marshal, zipped[1]))) else: old_virtual_values = dict() return old_virtual_values def update_index(self, index_id, recID_low, recID_high): """ Updates the state of virtual index for records in range: recID_low, recID_high for index specified by index_id. Function stores terms in idxWORD/PAIR/PHRASExxR tables with prefixes for specific index, for example term 'ellis' from author index will be stored in reversed table as: '__author__ellis'. It allows fast operations on only part of terms @param index_id: id of the dependent index we want to remove @param recID_low: first recID from the range of considered recIDs @param recID_high: last recID from the range of considered recIDs """ index_name = self.dependent_indexes[index_id] update_cache_for_record = self.update_cache_for_record virtual_tab_name = self.table_name[:-1] + "R" # take new values new_regular_values = self.retrieve_new_values_from_index(index_id, [recID_low, recID_high]) # take old values old_virtual_values = self.retrieve_old_values([recID_low, recID_high]) # update reversed table for recID in xrange(recID_low, recID_high + 1): new_values = new_regular_values.get(recID) or [] old_values = old_virtual_values.get(recID) or [] to_serialize = update_cache_for_record(index_name, recID, old_values, new_values) if len(to_serialize) == 0: continue run_sql("""INSERT INTO %s (id_bibrec,termlist,type) - VALUES (%%s,%%s,'FUTURE')""" % \ + VALUES (%%s,_binary %%s,'FUTURE')""" % \ wash_table_column_name(virtual_tab_name), (recID, serialize_via_marshal(to_serialize))) # kwalitee: disable=sql try: - run_sql("INSERT INTO %s (id_bibrec,termlist,type) VALUES (%%s,%%s,'CURRENT')" % wash_table_column_name(virtual_tab_name), (recID, serialize_via_marshal([]))) # kwalitee: disable=sql + run_sql("INSERT INTO %s (id_bibrec,termlist,type) VALUES (%%s,_binary %%s,'CURRENT')" % wash_table_column_name(virtual_tab_name), (recID, serialize_via_marshal([]))) # kwalitee: disable=sql except DatabaseError: pass def insert_index(self, index_id, recID_low, recID_high): """ Inserts terms from dependent index to virtual table without looking what's inside the virtual table and what terms are being added. It's faster than 'updating', but it can only be used when virtual table is free of terms from this dependent index. 
@param index_id: id of the dependent index we want to remove @param recID_low: first recID from the range of considered recIDs @param recID_high: last recID from the range of considered recIDs """ index_name = self.dependent_indexes[index_id] insert_to_cache_for_record = self.insert_to_cache_for_record virtual_tab_name = self.table_name[:-1] + "R" # take new values new_regular_values = self.retrieve_new_values_from_index(index_id, [recID_low, recID_high]) # take old values old_virtual_values = self.retrieve_old_values([recID_low, recID_high]) # update reversed table for recID in xrange(recID_low, recID_high + 1): new_values = new_regular_values.get(recID) or [] old_values = old_virtual_values.get(recID) or [] to_serialize = insert_to_cache_for_record(index_name, recID, old_values, new_values) if len(to_serialize) == 0: continue - run_sql("INSERT INTO %s (id_bibrec,termlist,type) VALUES (%%s,%%s,'FUTURE')" % wash_table_column_name(virtual_tab_name), (recID, serialize_via_marshal(to_serialize))) # kwalitee: disable=sql + run_sql("INSERT INTO %s (id_bibrec,termlist,type) VALUES (%%s,_binary %%s,'FUTURE')" % wash_table_column_name(virtual_tab_name), (recID, serialize_via_marshal(to_serialize))) # kwalitee: disable=sql try: - run_sql("INSERT INTO %s (id_bibrec,termlist,type) VALUES (%%s,%%s,'CURRENT')" % wash_table_column_name(virtual_tab_name), (recID, serialize_via_marshal([]))) # kwalitee: disable=sql + run_sql("INSERT INTO %s (id_bibrec,termlist,type) VALUES (%%s,_binary %%s,'CURRENT')" % wash_table_column_name(virtual_tab_name), (recID, serialize_via_marshal([]))) # kwalitee: disable=sql except DatabaseError: pass def remove_index(self, index_id, recID_low, recID_high): """ Removes words found in dependent index from reversed table of virtual index. Updates the state of the memory (for future removal from forward table). Takes into account that given words can be found in more that one dependent index and it won't mark these words for the removal process. 
@param index_id: id of the dependent index we want to remove @param recID_low: first recID from the range of considered recIDs @param recID_high: last recID from the range of considered recIDs """ index_name = self.dependent_indexes[index_id] remove_from_cache_for_record = self.remove_from_cache_for_record virtual_tab_name = self.table_name[:-1] + "R" # take old values old_virtual_values = self.retrieve_old_values([recID_low, recID_high]) # update reversed table for recID in xrange(recID_low, recID_high + 1): old_values = old_virtual_values.get(recID) or [] to_serialize = remove_from_cache_for_record(index_name, recID, old_values) if len(to_serialize) == 0: continue - run_sql("INSERT INTO %s (id_bibrec,termlist,type) VALUES (%%s,%%s,'FUTURE')" % wash_table_column_name(virtual_tab_name), (recID, serialize_via_marshal(to_serialize))) # kwalitee: disable=sql + run_sql("INSERT INTO %s (id_bibrec,termlist,type) VALUES (%%s,_binary %%s,'FUTURE')" % wash_table_column_name(virtual_tab_name), (recID, serialize_via_marshal(to_serialize))) # kwalitee: disable=sql try: - run_sql("INSERT INTO %s (id_bibrec,termlist,type) VALUES (%%s,%%s,'CURRENT')" % wash_table_column_name(virtual_tab_name), (recID, serialize_via_marshal([]))) # kwalitee: disable=sql + run_sql("INSERT INTO %s (id_bibrec,termlist,type) VALUES (%%s,_binary %%s,'CURRENT')" % wash_table_column_name(virtual_tab_name), (recID, serialize_via_marshal([]))) # kwalitee: disable=sql except DatabaseError: pass def update_cache_for_record(self, index_name, recID, old_values, new_values): """ Updates memory (cache) with information on what to remove/add/modify in forward table for specified record. It also returns new terms which should be indexed for given record. @param index_name: index name of dependent index @param recID: considered record @param old_values: all old values from all dependent indexes for this virtual index for recID @param new_values: new values from some dependent index which should be added """ prefix = make_prefix(index_name) put = self.put new_values_prefix = [prefix + term for term in new_values] part_values = [] tmp_old_values_prefix = [] # split old values from v.index into those with 'prefix' and those without for term in old_values: if term.startswith(prefix): term_without_prefix = re.sub(re_prefix, '', term) part_values.append(term_without_prefix) put(recID, term_without_prefix, -1) else: tmp_old_values_prefix.append(term) # remember not to remove words that occur more than once part_values = set(part_values) for value in tmp_old_values_prefix: term_without_prefix = re.sub(re_prefix, '', value) if term_without_prefix in part_values: put(recID, term_without_prefix, 1) for term_without_prefix in new_values: put(recID, term_without_prefix, 1) tmp_new_values_prefix = list(tmp_old_values_prefix) tmp_new_values_prefix.extend(new_values_prefix) return tmp_new_values_prefix def insert_to_cache_for_record(self, index_name, recID, old_values, new_values): """ Updates cache with terms which should be inserted to database. Used in insert_index function. See also: update_cache_for_record which is analogous for update_index function. """ prefix = make_prefix(index_name) append = old_values.append put = self.put for term in new_values: append(prefix + term) put(recID, term, 1) return old_values def remove_from_cache_for_record(self, index_name, recID, old_values): """ Updates information in cache with terms which should be removed from virtual table. Used in remove_index function. 
""" prefix = make_prefix(index_name) tmp_rest = [] tmp_removed = [] tmp_new_values = [] append_to_new = tmp_new_values.append append_to_rest = tmp_rest.append append_to_removed = tmp_removed.append put = self.put for term in old_values: if term.startswith(prefix): term_without_prefix = re.sub(re_prefix, '', term) append_to_removed(term_without_prefix) put(recID, term_without_prefix, -1) else: append_to_rest(re.sub(re_prefix, '', term)) append_to_new(term) to_remember = set(tmp_rest) & set(tmp_removed) for term_without_prefix in to_remember: put(recID, term_without_prefix, 1) return tmp_new_values def clean_database(self): """Removes all entries from corresponding tables in database""" query = """DELETE FROM %s""" % self.table_name run_sql(query) query = """DELETE FROM %s""" % self.table_name[:-1] + "R" run_sql(query) def clean_queue_table(self, index_name): """ Cleans queue table (i.e. idxWORD/PAIR/PHRASExxQ) for specific index. It means that function will remove all entries from db from queue table for this index. """ query = "DELETE FROM %s WHERE index_name='%s'" % \ (self.table_name[:-1].lstrip(self.table_prefix) + "Q", index_name) run_sql(query) def remove_duplicates(self, entries): """ Removes duplicates from a list of entries (taken from Queue table) in order to process a single command only once. Queue table may look like this: id (..) id_bibrec_low id_bibrec_high index_name mode ... 12 1 100 title update 13 1 100 title update We don't want to perform the same operation twice. First we want to squash the same commands into one. @param entries: list of entries taken from the database """ unique = set() return [entry for entry in entries if entry not in unique and not unique.add(entry)] def remove_dependent_index(self, index_name): """ Removes dependent index from this virtual index. It means removing all words from all records with prefix: __index_name__ from reversed table, and removing some of them from forward table if they don't appear in another dependent index. @param index_name: name of the dependent index to remove """ flush = 10000 dependent = self.dependent_indexes.values() if len(dependent) == 0: write_message("Specified index is not virtual...") return if index_name not in dependent: write_message("Dependent index already removed...") return index_id = get_index_id_from_index_name(index_name) records_range = get_records_range_for_index(index_id) write_message("Removing an index: %s" % index_name) if records_range: flush_count = 0 chunks = chunk_generator([records_range[0], records_range[1]]) try: while True: task_sleep_now_if_required() chunk = chunks.next() self.remove_index(index_id, chunk[0], chunk[1]) flush_count = flush_count + chunk[1] - chunk[0] + 1 self.recIDs_in_mem.append(chunk) if flush_count >= flush: flush_count = 0 self.put_into_db() except StopIteration: if flush_count > 0: self.put_into_db() class WordTable(AbstractIndexTable): """ This class represents a single index table of regular index (regular means it doesn't accumulates data from other indexes, but it takes data directly from metadata of records which are being indexed; for other type of index check: VirtualIndexTable). To start indexing process one need to invoke add_recIDs() method. For furher reading see description of this method. """ def __init__(self, index_name, table_type, table_prefix="", wash_index_terms=50): """Creates words table instance. 
@param index_name: the index name @param index_id: the index integer identificator @param fields_to_index: a list of fields to index @param table_type: type of the wordtable: Words, Pairs, Phrases @param table_prefix: prefix for table name, indexing will be performed on table: <>idx<>XXF @param wash_index_terms: do we wash index terms, and if yes (when >0), how many characters do we keep in the index terms; see max_char_length parameter of wash_index_term() """ AbstractIndexTable.__init__(self, index_name, table_type, table_prefix, wash_index_terms) self.tags = get_index_tags(index_name, virtual=False) self.nonmarc_tags = get_index_tags(index_name, virtual=False, tagtype="nonmarc") self.timestamp = datetime.now() self.virtual_indexes = get_index_virtual_indexes(self.index_id) self.virtual_index_update_mode = CFG_BIBINDEX_UPDATE_MODE["Update"] try: self.stemming_language = get_index_stemming_language(self.index_id) except KeyError: self.stemming_language = '' self.remove_stopwords = get_index_remove_stopwords(self.index_id) self.remove_html_markup = get_index_remove_html_markup(self.index_id) self.remove_latex_markup = get_index_remove_latex_markup(self.index_id) self.tokenizer = get_index_tokenizer(self.index_id)(self.stemming_language, self.remove_stopwords, self.remove_html_markup, self.remove_latex_markup) self.tokenizer_type = detect_tokenizer_type(self.tokenizer) self.default_tokenizer_function = self.tokenizer.get_tokenizing_function(table_type) self.special_tags = self._handle_special_tags() if self.stemming_language and self.table_name.startswith('idxWORD'): write_message('%s has stemming enabled, language %s' % (self.table_name, self.stemming_language)) def _handle_special_tags(self): """ Fills in a dict with special tags which always use the same tokenizer and this tokenizer is independent of index. """ special_tags = {} fields = self.tags + self.nonmarc_tags for tag in fields: if tag in CFG_BIBINDEX_SPECIAL_TAGS: for t in CFG_BIBINDEX_INDEX_TABLE_TYPE: if self.table_type == CFG_BIBINDEX_INDEX_TABLE_TYPE[t]: tokenizer_name = CFG_BIBINDEX_SPECIAL_TAGS[tag][t] tokenizer = _TOKENIZERS[tokenizer_name] instance = tokenizer(self.stemming_language, self.remove_stopwords, self.remove_html_markup, self.remove_latex_markup) special_tags[tag] = instance.get_tokenizing_function(self.table_type) break return special_tags def turn_off_virtual_indexes(self): """ Prevents from reindexing related virtual indexes. """ self.virtual_indexes = [] def turn_on_virtual_indexes(self): """ Turns on indexing related virtual indexes. """ self.virtual_indexes = get_index_virtual_indexes(self.index_id) def get_field(self, recID, tag): """Returns list of values of the MARC-21 'tag' fields for the record 'recID'.""" out = [] bibXXx = "bib" + tag[0] + tag[1] + "x" bibrec_bibXXx = "bibrec_" + bibXXx query = """SELECT value FROM %s AS b, %s AS bb WHERE bb.id_bibrec=%%s AND bb.id_bibxxx=b.id AND tag LIKE %%s""" % (bibXXx, bibrec_bibXXx) res = run_sql(query, (recID, tag)) for row in res: out.append(row[0]) return out def notify_virtual_indexes(self, recID_ranges): """ Informs all related virtual indexes about index change. Function leaves information about the change for each index in proper table in database (idxSOMETHINGxxQ). @param recID_ranges: low and high recIDs of ranges @type recID_ranges: list [[low_id1, high_id1], [low_id2, high_id2]...] 
""" query = """INSERT INTO %s (runtime, id_bibrec_low, id_bibrec_high, index_name, mode) VALUES (%%s, %%s, %%s, %%s, %%s)""" for index_id, index_name in self.virtual_indexes: tab_name = "idx%s%02dQ" % (self.table_type, index_id) full_query = query % tab_name for recID_range in recID_ranges: run_sql(full_query, (self.timestamp, recID_range[0], recID_range[1], self.index_name, self.virtual_index_update_mode)) def display(self): "Displays the word table." keys = self.value.keys() keys.sort() for k in keys: write_message("%s: %s" % (k, self.value[k])) def count(self): "Returns the number of words in the table." return len(self.value) def info(self): "Prints some information on the words table." write_message("The words table contains %d words." % self.count()) def lookup_words(self, word=""): "Lookup word from the words table." if not word: done = 0 while not done: try: word = raw_input("Enter word: ") done = 1 except (EOFError, KeyboardInterrupt): return if self.value.has_key(word): write_message("The word '%s' is found %d times." \ % (word, len(self.value[word]))) else: write_message("The word '%s' does not exist in the word file."\ % word) def add_recIDs(self, recIDs, opt_flush): """Fetches records which id in the recIDs range list and adds them to the wordTable. The recIDs range list is of the form: [[i1_low,i1_high],[i2_low,i2_high], ..., [iN_low,iN_high]]. """ global chunksize, _last_word_table flush_count = 0 records_done = 0 records_to_go = 0 for arange in recIDs: records_to_go = records_to_go + arange[1] - arange[0] + 1 time_started = time.time() # will measure profile time for arange in recIDs: i_low = arange[0] chunksize_count = 0 while i_low <= arange[1]: task_sleep_now_if_required() # calculate chunk group of recIDs and treat it: i_high = min(i_low + opt_flush - flush_count - 1, arange[1]) i_high = min(i_low + chunksize - chunksize_count - 1, i_high) try: self.chk_recID_range(i_low, i_high) except StandardError: if self.index_name == 'fulltext' and CFG_SOLR_URL: solr_commit() raise write_message(CFG_BIBINDEX_ADDING_RECORDS_STARTED_STR % \ (self.table_name, i_low, i_high)) if CFG_CHECK_MYSQL_THREADS: kill_sleepy_mysql_threads() percentage_display = get_percentage_completed(records_done, records_to_go) task_update_progress("(%s:%s) adding recs %d-%d %s" % (self.table_name, self.index_name, i_low, i_high, percentage_display)) self.del_recID_range(i_low, i_high) just_processed = self.add_recID_range(i_low, i_high) flush_count = flush_count + i_high - i_low + 1 chunksize_count = chunksize_count + i_high - i_low + 1 records_done = records_done + just_processed write_message(CFG_BIBINDEX_ADDING_RECORDS_STARTED_STR % \ (self.table_name, i_low, i_high)) if chunksize_count >= chunksize: chunksize_count = 0 # flush if necessary: if flush_count >= opt_flush: self.put_into_db() self.clean() if self.index_name == 'fulltext' and CFG_SOLR_URL: solr_commit() write_message("%s backing up" % (self.table_name)) flush_count = 0 self.log_progress(time_started, records_done, records_to_go) # iterate: i_low = i_high + 1 if flush_count > 0: self.put_into_db() if self.index_name == 'fulltext' and CFG_SOLR_URL: solr_commit() self.log_progress(time_started, records_done, records_to_go) self.notify_virtual_indexes(recIDs) def add_recID_range(self, recID1, recID2): """Add records from RECID1 to RECID2.""" wlist = {} self.recIDs_in_mem.append([recID1, recID2]) # special case of author indexes where we also add author # canonical IDs: if self.index_name in ('author', 'firstauthor', 'exactauthor', 
'exactfirstauthor'): for recID in range(recID1, recID2 + 1): if not wlist.has_key(recID): wlist[recID] = [] wlist[recID] = list_union(get_author_canonical_ids_for_recid(recID), wlist[recID]) marc, nonmarc = self.find_nonmarc_records(recID1, recID2) if marc: collector = TermCollector(self.tokenizer, self.tokenizer_type, self.table_type, self.tags, [recID1, recID2]) collector.set_special_tags(self.special_tags) wlist = collector.collect(marc, wlist) if nonmarc: collector = NonmarcTermCollector(self.tokenizer, self.tokenizer_type, self.table_type, self.nonmarc_tags, [recID1, recID2]) collector.set_special_tags(self.special_tags) wlist = collector.collect(nonmarc, wlist) # lookup index-time synonyms: synonym_kbrs = get_all_synonym_knowledge_bases() if synonym_kbrs.has_key(self.index_name): if len(wlist) == 0: return 0 recIDs = wlist.keys() for recID in recIDs: for word in wlist[recID]: word_synonyms = get_synonym_terms(word, synonym_kbrs[self.index_name][0], synonym_kbrs[self.index_name][1], use_memoise=True) if word_synonyms: wlist[recID] = list_union(word_synonyms, wlist[recID]) # were there some words for these recIDs found? recIDs = wlist.keys() for recID in recIDs: # was this record marked as deleted? if "DELETED" in self.get_field(recID, "980__c"): wlist[recID] = [] write_message("... record %d was declared deleted, removing its word list" % recID, verbose=9) write_message("... record %d, termlist: %s" % (recID, wlist[recID]), verbose=9) if len(wlist) == 0: return 0 # put words into reverse index table with FUTURE status: for recID in recIDs: - run_sql("INSERT INTO %sR (id_bibrec,termlist,type) VALUES (%%s,%%s,'FUTURE')" % wash_table_column_name(self.table_name[:-1]), (recID, serialize_via_marshal(wlist[recID]))) # kwalitee: disable=sql + run_sql("INSERT INTO %sR (id_bibrec,termlist,type) VALUES (%%s,_binary %%s,'FUTURE')" % wash_table_column_name(self.table_name[:-1]), (recID, serialize_via_marshal(wlist[recID]))) # kwalitee: disable=sql # ... and, for new records, enter the CURRENT status as empty: try: - run_sql("INSERT INTO %sR (id_bibrec,termlist,type) VALUES (%%s,%%s,'CURRENT')" % wash_table_column_name(self.table_name[:-1]), (recID, serialize_via_marshal([]))) # kwalitee: disable=sql + run_sql("INSERT INTO %sR (id_bibrec,termlist,type) VALUES (%%s,_binary %%s,'CURRENT')" % wash_table_column_name(self.table_name[:-1]), (recID, serialize_via_marshal([]))) # kwalitee: disable=sql except DatabaseError: # okay, it's an already existing record, no problem pass # put words into memory word list: put = self.put for recID in recIDs: for w in wlist[recID]: put(recID, w, 1) return len(recIDs) def find_nonmarc_records(self, recID1, recID2): """Divides recID range into two different tables, first one contains only recIDs of the records that are Marc type and the second one contains records of nonMarc type""" marc = range(recID1, recID2 + 1) nonmarc = [] query = """SELECT id FROM %s WHERE master_format <> 'marc' AND id BETWEEN %%s AND %%s""" % "bibrec" res = run_sql(query, (recID1, recID2)) if res: nonmarc = list(zip(*res)[0]) if len(nonmarc) == (recID2 - recID1 + 1): nonmarc = xrange(recID1, recID2 + 1) marc = [] else: for recID in nonmarc: marc.remove(recID) else: marc = xrange(recID1, recID2 + 1) return [marc, nonmarc] def log_progress(self, start, done, todo): """Calculate progress and store it. 
start: start time, done: records processed, todo: total number of records""" time_elapsed = time.time() - start # consistency check if time_elapsed == 0 or done > todo: return time_recs_per_min = done / (time_elapsed / 60.0) write_message("%d records took %.1f seconds to complete.(%1.f recs/min)"\ % (done, time_elapsed, time_recs_per_min)) if time_recs_per_min: write_message("Estimated runtime: %.1f minutes" % \ ((todo - done) / time_recs_per_min)) def put(self, recID, word, sign): """Keeps track of changes done during indexing and stores these changes in memory for further use. Indexing process needs this information later while filling in the database. @param recID: recID of the record we want to update in memory @param word: word we want to update @param sing: sign of the word, 1 means keep this word in database, -1 remove word from database """ value = self.value try: if self.wash_index_terms: word = wash_index_term(word, self.wash_index_terms) if value.has_key(word): # the word 'word' exist already: update sign value[word][recID] = sign else: value[word] = {recID: sign} except: write_message("Error: Cannot put word %s with sign %d for recID %s." % (word, sign, recID)) def del_recIDs(self, recIDs): """Fetches records which id in the recIDs range list and adds them to the wordTable. The recIDs range list is of the form: [[i1_low,i1_high],[i2_low,i2_high], ..., [iN_low,iN_high]]. """ count = 0 for arange in recIDs: task_sleep_now_if_required() self.del_recID_range(arange[0], arange[1]) count = count + arange[1] - arange[0] self.virtual_index_update_mode = CFG_BIBINDEX_UPDATE_MODE["Remove"] self.put_into_db() self.notify_virtual_indexes(recIDs) if self.index_name == 'fulltext' and CFG_SOLR_URL: solr_commit() def del_recID_range(self, low, high): """Deletes records with 'recID' system number between low and high from memory words index table.""" write_message("%s fetching existing words for records #%d-#%d started" % \ (self.table_name, low, high), verbose=3) self.recIDs_in_mem.append([low, high]) query = """SELECT id_bibrec,termlist FROM %sR as bb WHERE bb.id_bibrec BETWEEN %%s AND %%s""" % (self.table_name[:-1]) recID_rows = run_sql(query, (low, high)) for recID_row in recID_rows: recID = recID_row[0] wlist = deserialize_via_marshal(recID_row[1]) for word in wlist: self.put(recID, word, -1) write_message("%s fetching existing words for records #%d-#%d ended" % \ (self.table_name, low, high), verbose=3) def check_bad_words(self): """ Finds bad words in reverse tables. Returns True in case of bad words. """ query = """SELECT 1 FROM %sR WHERE type IN ('TEMPORARY','FUTURE') LIMIT 1""" \ % (self.table_name[:-1],) res = run_sql(query) return bool(res) def report_on_table_consistency(self): """Check reverse words index tables (e.g. idxWORD01R) for interesting states such as 'TEMPORARY' state. Prints small report (no of words, no of bad words). 
""" # find number of words: query = """SELECT COUNT(1) FROM %s""" % (self.table_name) res = run_sql(query, None, 1) if res: nb_words = res[0][0] else: nb_words = 0 # report stats: write_message("%s contains %d words" % (self.table_name, nb_words)) # find possible bad states in reverse tables: if self.check_bad_words(): write_message("EMERGENCY: %s needs to be repaired" % (self.table_name, )) else: write_message("%s is in consistent state" % (self.table_name)) def repair(self, opt_flush): """Repair the whole table""" # find possible bad states in reverse tables: if not self.check_bad_words(): return query = """SELECT id_bibrec FROM %sR WHERE type IN ('TEMPORARY','FUTURE')""" \ % (self.table_name[:-1]) res = intbitset(run_sql(query)) recIDs = create_range_list(list(res)) flush_count = 0 records_done = 0 records_to_go = 0 for arange in recIDs: records_to_go = records_to_go + arange[1] - arange[0] + 1 time_started = time.time() # will measure profile time for arange in recIDs: i_low = arange[0] chunksize_count = 0 while i_low <= arange[1]: task_sleep_now_if_required() # calculate chunk group of recIDs and treat it: i_high = min(i_low + opt_flush - flush_count - 1, arange[1]) i_high = min(i_low + chunksize - chunksize_count - 1, i_high) self.fix_recID_range(i_low, i_high) flush_count = flush_count + i_high - i_low + 1 chunksize_count = chunksize_count + i_high - i_low + 1 records_done = records_done + i_high - i_low + 1 if chunksize_count >= chunksize: chunksize_count = 0 # flush if necessary: if flush_count >= opt_flush: self.put_into_db("emergency") self.clean() flush_count = 0 self.log_progress(time_started, records_done, records_to_go) # iterate: i_low = i_high + 1 if flush_count > 0: self.put_into_db("emergency") self.log_progress(time_started, records_done, records_to_go) write_message("%s inconsistencies repaired." % self.table_name) def chk_recID_range(self, low, high): """Check if the reverse index table is in proper state""" ## check db query = """SELECT 1 FROM %sR WHERE type IN ('TEMPORARY','FUTURE') AND id_bibrec BETWEEN %%s AND %%s LIMIT 1""" % self.table_name[:-1] res = run_sql(query, (low, high), 1) if not res: write_message("%s for %d-%d is in consistent state" % (self.table_name, low, high)) return # okay, words table is consistent ## inconsistency detected! write_message("EMERGENCY: %s inconsistencies detected..." % self.table_name) error_message = "Errors found. You should check consistency of the " \ "%s - %sR tables.\nRunning 'bibindex --repair' is " \ "recommended." % (self.table_name, self.table_name[:-1]) write_message("EMERGENCY: " + error_message, stream=sys.stderr) raise StandardError(error_message) def fix_recID_range(self, low, high): """Try to fix reverse index database consistency (e.g. table idxWORD01R) in the low,high doc-id range. Possible states for a recID follow: CUR TMP FUT: very bad things have happened: warn! CUR TMP : very bad things have happened: warn! CUR FUT: delete FUT (crash before flushing) CUR : database is ok TMP FUT: add TMP to memory and del FUT from memory flush (revert to old state) TMP : very bad things have happened: warn! FUT: very bad things have happended: warn! 
""" state = {} query = "SELECT id_bibrec,type FROM %sR WHERE id_bibrec BETWEEN %%s AND %%s"\ % self.table_name[:-1] res = run_sql(query, (low, high)) for row in res: if not state.has_key(row[0]): state[row[0]] = [] state[row[0]].append(row[1]) ok = 1 # will hold info on whether we will be able to repair for recID in state.keys(): if not 'TEMPORARY' in state[recID]: if 'FUTURE' in state[recID]: if 'CURRENT' not in state[recID]: write_message("EMERGENCY: Index record %d is in inconsistent state. Can't repair it." % recID) ok = 0 else: write_message("EMERGENCY: Inconsistency in index record %d detected" % recID) query = """DELETE FROM %sR WHERE id_bibrec=%%s""" % self.table_name[:-1] run_sql(query, (recID,)) write_message("EMERGENCY: Inconsistency in record %d repaired." % recID) else: if 'FUTURE' in state[recID] and not 'CURRENT' in state[recID]: self.recIDs_in_mem.append([recID, recID]) # Get the words file query = """SELECT type,termlist FROM %sR WHERE id_bibrec=%%s""" % self.table_name[:-1] write_message(query, verbose=9) res = run_sql(query, (recID,)) for row in res: wlist = deserialize_via_marshal(row[1]) write_message("Words are %s " % wlist, verbose=9) if row[0] == 'TEMPORARY': sign = 1 else: sign = -1 for word in wlist: self.put(recID, word, sign) else: write_message("EMERGENCY: %s for %d is in inconsistent " "state. Couldn't repair it." % (self.table_name, recID), stream=sys.stderr) ok = 0 if not ok: error_message = "Unrepairable errors found. You should check " \ "consistency of the %s - %sR tables. Deleting affected " \ "TEMPORARY and FUTURE entries from these tables is " \ "recommended; see the BibIndex Admin Guide." % \ (self.table_name, self.table_name[:-1]) write_message("EMERGENCY: " + error_message, stream=sys.stderr) raise StandardError(error_message) def main(): """Main that construct all the bibtask.""" task_init(authorization_action='runbibindex', authorization_msg="BibIndex Task Submission", description="""Examples: \t%s -a -i 234-250,293,300-500 -u admin@localhost \t%s -a -w author,fulltext -M 8192 -v3 \t%s -d -m +4d -A on --flush=10000\n""" % ((sys.argv[0],) * 3), help_specific_usage=""" Indexing options: -a, --add\t\tadd or update words for selected records -d, --del\t\tdelete words for selected records -i, --id=low[-high]\t\tselect according to doc recID -m, --modified=from[,to]\tselect according to modification date -c, --collection=c1[,c2]\tselect according to collection -R, --reindex\treindex the selected indexes from scratch Repairing options: -k, --check\t\tcheck consistency for all records in the table(s) -r, --repair\t\ttry to repair all records in the table(s) Specific options: -w, --windex=w1[,w2]\tword/phrase indexes to consider (all) -M, --maxmem=XXX\tmaximum memory usage in kB (no limit) -f, --flush=NNN\t\tfull consistent table flush after NNN records (10000) --force\t\tforce indexing of all records for provided indexes -Z, --remove-dependent-index=w name of an index for removing from virtual index -l --all-virtual\t\t set of all virtual indexes; the same as: -w virtual_ind1, virtual_ind2, ... 
""", version=__revision__, specific_params=("adi:m:c:w:krRM:f:oZ:l", [ "add", "del", "id=", "modified=", "collection=", "windex=", "check", "repair", "reindex", "maxmem=", "flush=", "force", "remove-dependent-index=", "all-virtual" ]), task_stop_helper_fnc=task_stop_table_close_fnc, task_submit_elaborate_specific_parameter_fnc=task_submit_elaborate_specific_parameter, task_run_fnc=task_run_core, task_submit_check_options_fnc=task_submit_check_options) def task_submit_check_options(): """Check for options compatibility.""" if task_get_option("reindex"): if task_get_option("cmd") != "add" or task_get_option('id') or task_get_option('collection'): print >> sys.stderr, "ERROR: You can use --reindex only when adding modified record." return False return True def task_submit_elaborate_specific_parameter(key, value, opts, args): """ Given the string key it checks it's meaning, eventually using the value. Usually it fills some key in the options dict. It must return True if it has elaborated the key, False, if it doesn't know that key. eg: if key in ['-n', '--number']: self.options['number'] = value return True return False """ if key in ("-a", "--add"): task_set_option("cmd", "add") if ("-x", "") in opts or ("--del", "") in opts: raise StandardError("Can not have --add and --del at the same time!") elif key in ("-k", "--check"): task_set_option("cmd", "check") elif key in ("-r", "--repair"): task_set_option("cmd", "repair") elif key in ("-d", "--del"): task_set_option("cmd", "del") elif key in ("-i", "--id"): task_set_option('id', task_get_option('id') + split_ranges(value)) elif key in ("-m", "--modified"): task_set_option("modified", get_date_range(value)) elif key in ("-c", "--collection"): task_set_option("collection", value) elif key in ("-R", "--reindex"): task_set_option("reindex", True) elif key in ("-w", "--windex"): task_set_option("windex", value) elif key in ("-M", "--maxmem"): task_set_option("maxmem", int(value)) if task_get_option("maxmem") < base_process_size + 1000: raise StandardError("Memory usage should be higher than %d kB" % \ (base_process_size + 1000)) elif key in ("-f", "--flush"): task_set_option("flush", int(value)) elif key in ("-o", "--force"): task_set_option("force", True) elif key in ("-Z", "--remove-dependent-index",): task_set_option("remove-dependent-index", value) elif key in ("-l", "--all-virtual",): task_set_option("all-virtual", True) else: return False return True def task_stop_table_close_fnc(): """ Close tables to STOP. """ global _last_word_table if _last_word_table: _last_word_table.put_into_db() def get_recIDs_by_date_bibliographic(dates, index_name, force_all=False): """ Finds records that were modified between DATES[0] and DATES[1] for given index. If DATES is not set, then finds records that were modified since the last update of the index. 
@param wordtable_type: can be 'Words', 'Pairs' or 'Phrases' """ index_id = get_index_id_from_index_name(index_name) if not dates: query = """SELECT last_updated FROM idxINDEX WHERE id=%s""" res = run_sql(query, (index_id,)) if not res: return set([]) if not res[0][0] or force_all: dates = ("0000-00-00", None) else: dates = (res[0][0], None) if dates[1] is None: res = intbitset(run_sql("""SELECT b.id FROM bibrec AS b WHERE b.modification_date >= %s""", (dates[0],))) if index_name == 'fulltext': res |= intbitset(run_sql("""SELECT id_bibrec FROM bibrec_bibdoc JOIN bibdoc ON id_bibdoc=id WHERE text_extraction_date <= modification_date AND modification_date >= %s AND status<>'DELETED'""", (dates[0],))) elif dates[0] is None: res = intbitset(run_sql("""SELECT b.id FROM bibrec AS b WHERE b.modification_date <= %s""", (dates[1],))) if index_name == 'fulltext': res |= intbitset(run_sql("""SELECT id_bibrec FROM bibrec_bibdoc JOIN bibdoc ON id_bibdoc=id WHERE text_extraction_date <= modification_date AND modification_date <= %s AND status<>'DELETED'""", (dates[1],))) else: res = intbitset(run_sql("""SELECT b.id FROM bibrec AS b WHERE b.modification_date >= %s AND b.modification_date <= %s""", (dates[0], dates[1]))) if index_name == 'fulltext': res |= intbitset(run_sql("""SELECT id_bibrec FROM bibrec_bibdoc JOIN bibdoc ON id_bibdoc=id WHERE text_extraction_date <= modification_date AND modification_date >= %s AND modification_date <= %s AND status<>'DELETED'""", (dates[0], dates[1],))) # special case of author indexes where we need to re-index # those records that were affected by changed BibAuthorID attributions: if index_name in ('author', 'firstauthor', 'exactauthor', 'exactfirstauthor'): from invenio.bibauthorid_personid_maintenance import get_recids_affected_since # dates[1] is ignored, since BibAuthorID API does not offer upper limit search rec_list_author = intbitset(get_recids_affected_since(dates[0])) res = res | rec_list_author return set(res) def get_recIDs_by_date_authority(dates, index_name, force_all=False): """ Finds records that were modified between DATES[0] and DATES[1] for given index. If DATES is not set, then finds records that were modified since the last update of the index. Searches for bibliographic records connected to authority records that have been changed. """ index_id = get_index_id_from_index_name(index_name) index_tags = get_index_tags(index_name) if not dates: query = """SELECT last_updated FROM idxINDEX WHERE id=%s""" res = run_sql(query, (index_id,)) if not res: return set([]) if not res[0][0] or force_all: dates = ("0000-00-00", None) else: dates = (res[0][0], None) res = intbitset() for tag in index_tags: pattern = tag.replace('%', '*') matches = fnmatch.filter(CFG_BIBAUTHORITY_CONTROLLED_FIELDS_BIBLIOGRAPHIC.keys(), pattern) if not len(matches): continue for tag_match in matches: # get the type of authority record associated with this field auth_type = CFG_BIBAUTHORITY_CONTROLLED_FIELDS_BIBLIOGRAPHIC.get(tag_match) # find updated authority records of this type # dates[1] is ignored, needs dates[0] to find res now = datetime.now() auth_recIDs = search_pattern(p='980__a:' + auth_type) \ & search_unit_in_bibrec(str(dates[0]), str(now), search_type='m') # now find dependent bibliographic records for auth_recID in auth_recIDs: # get the fix authority identifier of this authority record control_nos = get_control_nos_from_recID(auth_recID) # there may be multiple control number entries! (the '035' field is repeatable!) 
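                    # --- Editorial note added for clarity (not part of the
                    # original sources): for every control number of the
                    # changed authority record, the dependent bibliographic
                    # records are looked up via the $0 subfield of the
                    # controlled field.  E.g. for a hypothetical tag_match
                    # '100__a' and control_no 'AUTHOR|(XYZ)1234', the search
                    # below is performed with p='100__0:"AUTHOR|(XYZ)1234"'.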
for control_no in control_nos: # get the bibrec IDs that refer to AUTHORITY_ID in TAG tag_0 = tag_match[:5] + '0' # possibly do the same for '4' subfields ? fieldvalue = '"' + control_no + '"' res |= search_pattern(p=tag_0 + ':' + fieldvalue) return set(res) def get_not_updated_recIDs(modified_dates, indexes, force_all=False): """Finds not updated recIDs in database for indexes. @param modified_dates: between this dates we should look for modified records @type modified_dates: [date_old, date_new] @param indexes: list of indexes @type indexes: string separated by coma @param force_all: if True all records will be taken """ found_recIDs = set() write_message(CFG_BIBINDEX_UPDATE_MESSAGE) for index in indexes: found_recIDs |= get_recIDs_by_date_bibliographic(modified_dates, index, force_all) found_recIDs |= get_recIDs_by_date_authority(modified_dates, index, force_all) return list(sorted(found_recIDs)) def get_recIDs_from_cli(indexes=[]): """ Gets recIDs ranges from CLI for indexing when user specified 'id' or 'collection' option or search for modified recIDs for provided indexes when recIDs are not specified. @param indexes: it's a list of specified indexes, which can be obtained from CLI with use of: get_indexes_from_cli() function. @type indexes: list of strings """ # need to first update idxINDEX table to find proper recIDs for reindexing if task_get_option("reindex"): for index_name in indexes: run_sql("""UPDATE idxINDEX SET last_updated='0000-00-00 00:00:00' WHERE name=%s""", (index_name,)) if task_get_option("id"): return task_get_option("id") elif task_get_option("collection"): l_of_colls = task_get_option("collection").split(",") recIDs = perform_request_search(c=l_of_colls) recIDs_range = [] for recID in recIDs: recIDs_range.append([recID, recID]) return recIDs_range elif task_get_option("cmd") == "add": recs = get_not_updated_recIDs(task_get_option("modified"), indexes, task_get_option("force")) recIDs_range = beautify_range_list(create_range_list(recs)) return recIDs_range return [] def get_indexes_from_cli(): """ Gets indexes from CLI and checks if they are valid. If indexes weren't specified function will return all known indexes. """ indexes = task_get_option("windex") all_virtual = task_get_option("all-virtual") if all_virtual: indexes = filter_for_virtual_indexes(get_all_indexes()) elif not indexes: indexes = get_all_indexes() else: indexes = indexes.split(",") indexes = remove_inexistent_indexes(indexes, leave_virtual=True) return indexes def remove_dependent_index(virtual_indexes, dependent_index): """ Removes dependent index from virtual indexes. @param virtual_indexes: names of virtual_indexes @type virtual_indexes: list of strings @param dependent_index: name of dependent index @type dependent_index: string """ if not virtual_indexes: write_message("You should specify a name of a virtual index...") return id_dependent = get_index_id_from_index_name(dependent_index) for index_name in virtual_indexes: index_id = get_index_id_from_index_name(index_name) for type_ in CFG_BIBINDEX_INDEX_TABLE_TYPE.itervalues(): vit = VirtualIndexTable(index_name, type_) vit.remove_dependent_index(dependent_index) task_sleep_now_if_required() query = """DELETE FROM idxINDEX_idxINDEX WHERE id_virtual=%s AND id_normal=%s""" run_sql(query, (index_id, id_dependent)) def should_update_virtual_indexes(): """ Decides if any virtual indexes should be updated. Decision is made based on arguments obtained from CLI. 
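Returns a true value when either the --all-virtual or the --windex option was supplied.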
""" return task_get_option("all-virtual") or task_get_option("windex") def update_virtual_indexes(virtual_indexes, reindex=False): """ Function will update all specified virtual_indexes. @param virtual_indexes: list of index names @param reindex: shall we reindex given v.indexes from scratch? """ kwargs = {} if reindex: kwargs.update({'table_prefix': 'tmp_'}) for index_name in virtual_indexes: if reindex: index_id = get_index_id_from_index_name(index_name) init_temporary_reindex_tables(index_id) for key, type_ in CFG_BIBINDEX_INDEX_TABLE_TYPE.iteritems(): kwargs.update({'wash_index_terms': CFG_BIBINDEX_WASH_INDEX_TERMS[key]}) vit = VirtualIndexTable(index_name, type_, **kwargs) vit.set_reindex_mode() vit.run_update() swap_temporary_reindex_tables(index_id) update_index_last_updated([index_name], task_get_task_param('task_starting_time')) task_sleep_now_if_required(can_stop_too=True) else: for key, type_ in CFG_BIBINDEX_INDEX_TABLE_TYPE.iteritems(): kwargs.update({'wash_index_terms': CFG_BIBINDEX_WASH_INDEX_TERMS[key]}) vit = VirtualIndexTable(index_name, type_, **kwargs) vit.run_update() task_sleep_now_if_required(can_stop_too=True) def task_run_core(): """Runs the task by fetching arguments from the BibSched task queue. This is what BibSched will be invoking via daemon call. """ global _last_word_table indexes = get_indexes_from_cli() if len(indexes) == 0: write_message("Specified indexes can't be found.") return True virtual_indexes = filter_for_virtual_indexes(indexes) regular_indexes = list(set(indexes) - set(virtual_indexes)) # check tables consistency if task_get_option("cmd") == "check": for index_name in indexes: wordTable = WordTable(index_name=index_name, table_type=CFG_BIBINDEX_INDEX_TABLE_TYPE["Words"], wash_index_terms=50) _last_word_table = wordTable wordTable.report_on_table_consistency() task_sleep_now_if_required(can_stop_too=True) wordTable = WordTable(index_name=index_name, table_type=CFG_BIBINDEX_INDEX_TABLE_TYPE["Pairs"], wash_index_terms=100) _last_word_table = wordTable wordTable.report_on_table_consistency() task_sleep_now_if_required(can_stop_too=True) wordTable = WordTable(index_name=index_name, table_type=CFG_BIBINDEX_INDEX_TABLE_TYPE["Phrases"], wash_index_terms=0) _last_word_table = wordTable wordTable.report_on_table_consistency() task_sleep_now_if_required(can_stop_too=True) _last_word_table = None return True # virtual index: remove dependent index if task_get_option("remove-dependent-index"): remove_dependent_index(indexes, task_get_option("remove-dependent-index")) return True # virtual index: update if should_update_virtual_indexes(): update_virtual_indexes(virtual_indexes, task_get_option("reindex")) if len(regular_indexes) == 0: return True # regular index: initialization for Words,Pairs,Phrases recIDs_range = get_recIDs_from_cli(regular_indexes) recIDs_for_index = find_affected_records_for_index(regular_indexes, recIDs_range, (task_get_option("force") or \ task_get_option("reindex") or \ task_get_option("cmd") == "del")) if len(recIDs_for_index.keys()) == 0: write_message("Selected indexes/recIDs are up to date.") # Let's work on single words! 
for index_name in recIDs_for_index.keys(): index_id = get_index_id_from_index_name(index_name) reindex_prefix = "" if task_get_option("reindex"): reindex_prefix = "tmp_" init_temporary_reindex_tables(index_id, reindex_prefix) wordTable = WordTable(index_name=index_name, table_type=CFG_BIBINDEX_INDEX_TABLE_TYPE["Words"], table_prefix=reindex_prefix, wash_index_terms=50) _last_word_table = wordTable wordTable.report_on_table_consistency() try: if task_get_option("cmd") == "del": if task_get_option("id") or task_get_option("collection"): wordTable.del_recIDs(recIDs_range) task_sleep_now_if_required(can_stop_too=True) else: error_message = "Missing IDs of records to delete from " \ "index %s." % wordTable.table_name write_message(error_message, stream=sys.stderr) raise StandardError(error_message) elif task_get_option("cmd") == "add": final_recIDs = beautify_range_list(create_range_list(recIDs_for_index[index_name])) wordTable.add_recIDs(final_recIDs, task_get_option("flush")) task_sleep_now_if_required(can_stop_too=True) elif task_get_option("cmd") == "repair": wordTable.repair(task_get_option("flush")) task_sleep_now_if_required(can_stop_too=True) else: error_message = "Invalid command found processing %s" % \ wordTable.table_name write_message(error_message, stream=sys.stderr) raise StandardError(error_message) except StandardError, e: write_message("Exception caught: %s" % e, sys.stderr) register_exception(alert_admin=True) if _last_word_table: _last_word_table.put_into_db() raise wordTable.report_on_table_consistency() task_sleep_now_if_required(can_stop_too=True) # Let's work on pairs now wordTable = WordTable(index_name=index_name, table_type=CFG_BIBINDEX_INDEX_TABLE_TYPE["Pairs"], table_prefix=reindex_prefix, wash_index_terms=100) _last_word_table = wordTable wordTable.report_on_table_consistency() try: if task_get_option("cmd") == "del": if task_get_option("id") or task_get_option("collection"): wordTable.del_recIDs(recIDs_range) task_sleep_now_if_required(can_stop_too=True) else: error_message = "Missing IDs of records to delete from " \ "index %s." % wordTable.table_name write_message(error_message, stream=sys.stderr) raise StandardError(error_message) elif task_get_option("cmd") == "add": final_recIDs = beautify_range_list(create_range_list(recIDs_for_index[index_name])) wordTable.add_recIDs(final_recIDs, task_get_option("flush")) task_sleep_now_if_required(can_stop_too=True) elif task_get_option("cmd") == "repair": wordTable.repair(task_get_option("flush")) task_sleep_now_if_required(can_stop_too=True) else: error_message = "Invalid command found processing %s" % \ wordTable.table_name write_message(error_message, stream=sys.stderr) raise StandardError(error_message) except StandardError, e: write_message("Exception caught: %s" % e, sys.stderr) register_exception() if _last_word_table: _last_word_table.put_into_db() raise wordTable.report_on_table_consistency() task_sleep_now_if_required(can_stop_too=True) # Let's work on phrases now wordTable = WordTable(index_name=index_name, table_type=CFG_BIBINDEX_INDEX_TABLE_TYPE["Phrases"], table_prefix=reindex_prefix, wash_index_terms=0) _last_word_table = wordTable wordTable.report_on_table_consistency() try: if task_get_option("cmd") == "del": if task_get_option("id") or task_get_option("collection"): wordTable.del_recIDs(recIDs_range) task_sleep_now_if_required(can_stop_too=True) else: error_message = "Missing IDs of records to delete from " \ "index %s." 
% wordTable.table_name write_message(error_message, stream=sys.stderr) raise StandardError(error_message) elif task_get_option("cmd") == "add": final_recIDs = beautify_range_list(create_range_list(recIDs_for_index[index_name])) wordTable.add_recIDs(final_recIDs, task_get_option("flush")) if not task_get_option("id") and not task_get_option("collection"): update_index_last_updated([index_name], task_get_task_param('task_starting_time')) task_sleep_now_if_required(can_stop_too=True) elif task_get_option("cmd") == "repair": wordTable.repair(task_get_option("flush")) task_sleep_now_if_required(can_stop_too=True) else: error_message = "Invalid command found processing %s" % \ wordTable.table_name write_message(error_message, stream=sys.stderr) raise StandardError(error_message) except StandardError, e: write_message("Exception caught: %s" % e, sys.stderr) register_exception() if _last_word_table: _last_word_table.put_into_db() raise wordTable.report_on_table_consistency() task_sleep_now_if_required(can_stop_too=True) if task_get_option("reindex"): swap_temporary_reindex_tables(index_id, reindex_prefix) update_index_last_updated([index_name], task_get_task_param('task_starting_time')) task_sleep_now_if_required(can_stop_too=True) # update modification date also for indexes that were up to date if not task_get_option("id") and not task_get_option("collection") and \ task_get_option("cmd") == "add": up_to_date = set(indexes) - set(recIDs_for_index.keys()) update_index_last_updated(list(up_to_date), task_get_task_param('task_starting_time')) _last_word_table = None return True ### okay, here we go: if __name__ == '__main__': main() diff --git a/modules/bibrank/lib/bibrank_citation_indexer.py b/modules/bibrank/lib/bibrank_citation_indexer.py index 1a11c14a4..a5bd6b3e9 100644 --- a/modules/bibrank/lib/bibrank_citation_indexer.py +++ b/modules/bibrank/lib/bibrank_citation_indexer.py @@ -1,1288 +1,1288 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 CERN. +# Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2016 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
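# This indexer matches reference strings (report numbers, journal pubinfo, DOIs, HDLs, ISBNs and record IDs) against existing records, stores the resulting citer/citee pairs in rnkCITATIONDICT (with a change log in rnkCITATIONLOG), and caches the computed citation weights in the key/value store.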
__revision__ = "$Id$" import re import time import os import sys import ConfigParser from datetime import datetime from itertools import islice from invenio.intbitset import intbitset from invenio.dbquery import run_sql from invenio.bibindex_tokenizers.BibIndexJournalTokenizer import \ CFG_JOURNAL_PUBINFO_STANDARD_FORM, \ CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK from invenio.redisutils import get_redis from invenio.search_engine import search_pattern, \ search_unit, \ get_collection_reclist from invenio.bibformat_utils import parse_tag from invenio.bibknowledge import get_kb_mappings from invenio.bibtask import write_message, task_get_option, \ task_update_progress, task_sleep_now_if_required, \ task_get_task_param from invenio.bibindex_engine_utils import get_field_tags from invenio.docextract_record import get_record from invenio.dbquery import serialize_via_marshal re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK \ = re.compile(CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK) def compute_weights(): sql = "SELECT citee, COUNT(citer) FROM rnkCITATIONDICT GROUP BY citee" weights = {} for citee, c in run_sql(sql): weights[citee] = c return weights def recids_cache(collections, cache={}): if 'valid_recids' not in cache: cache['valid_recids'] = intbitset() for coll in collections.split(','): cache['valid_recids'] += get_collection_reclist(coll) return cache['valid_recids'] def deleted_recids_cache(cache={}): if 'deleted_records' not in cache: cache['deleted_records'] = search_unit(p='DELETED', f='980', m='a') return cache['deleted_records'] def get_recids_matching_query(p, f, config, m='e'): """Return set of recIDs matching query for pattern p in field f. @param p: pattern to search for @type recID: unicode string @param f: field to search in @type recID: unicode string @param config: bibrank configuration @type recID: dict @param m: type of matching (usually 'e' for exact or 'r' for regexp) @type recID: string """ p = p.encode('utf-8') f = f.encode('utf-8') function = config.get("rank_method", "function") collections = config.get(function, 'collections') if collections: ret = search_pattern(p=p, f=f, m=m) & recids_cache(collections) else: ret = search_pattern(p=p, f=f, m=m) - deleted_recids_cache() return ret def get_citation_weight(rank_method_code, config, chunk_size=25000): """return a dictionary which is used by bibrank daemon for generating the index of sorted research results by citation information """ quick = task_get_option("quick") != "no" # id option forces re-indexing a certain range # even if there are no new recs if task_get_option("id"): # construct a range of records to index updated_recids = [] for first, last in task_get_option("id"): updated_recids += range(first, last+1) if len(updated_recids) > 10000: str_updated_recids = str(updated_recids[:10]) + ' ... ' + str(updated_recids[-10:]) else: str_updated_recids = str(updated_recids) write_message('Records to process: %s' % str_updated_recids) index_update_time = None else: bibrank_update_time = get_bibrankmethod_lastupdate(rank_method_code) if not quick: bibrank_update_time = "0000-00-00 00:00:00" write_message("bibrank: %s" % bibrank_update_time) index_update_time = get_bibindex_update_time() write_message("bibindex: %s" % index_update_time) if index_update_time > datetime.now().strftime("%Y-%m-%d %H:%M:%S"): index_update_time = "0000-00-00 00:00:00" updated_recids = get_modified_recs(bibrank_update_time, index_update_time) if len(updated_recids) > 10000: str_updated_recids = str(updated_recids[:10]) + ' ... 
' + str(updated_recids[-10:]) else: str_updated_recids = str(updated_recids) write_message("%s records to update" % str_updated_recids) if updated_recids: begin_time = time.time() try: function = config.get("rank_method", "function") config.get(function, 'collections') except ConfigParser.NoOptionError: config.set(function, 'collections', None) # Process fully the updated records weights = process_and_store(updated_recids, config, chunk_size) end_time = time.time() write_message("Total time of get_citation_weight(): %.2f sec" % (end_time - begin_time)) task_update_progress("citation analysis done") else: weights = None write_message("No new records added since last time this " "rank method was executed") return weights, index_update_time def process_and_store(recids, config, chunk_size): # Limit of # of citation we can loose in one chunk function = config.get("rank_method", "function") citation_loss_limit = int(config.get(function, "citation_loss_limit")) # If we have nothing to process # Do not update the weights dictionary modified = False # Process recent records first # The older records were most likely added by the above steps # to be reprocessed so they only have minor changes recids_iter = iter(sorted(recids, reverse=True)) # Split records to process into chunks so that we do not # fill up too much memory while True: task_sleep_now_if_required() chunk = list(islice(recids_iter, chunk_size)) if not chunk: break write_message("Processing chunk #%s to #%s" % (chunk[0], chunk[-1])) # The core work cites, refs = process_chunk(chunk, config) # Check that we haven't lost too many citations cites_diff = compute_dicts_diff(chunk, refs, cites) write_message("Citations balance %s" % cites_diff) if citation_loss_limit and cites_diff <= -citation_loss_limit: raise Exception('Lost too many references, aborting') # Store processed citations/references store_dicts(chunk, refs, cites) modified = True # Compute new weights dictionary if modified: weights = compute_weights() else: weights = None store_weights_cache(weights) return weights def store_weights_cache(weights): """Store into key/value store""" redis = get_redis() redis.set('citations_weights', serialize_via_marshal(weights)) def process_chunk(recids, config): tags = get_tags_config(config) # call the procedure that does the hard work by reading fields of # citations and references in the updated_recid's (but nothing else)! write_message("Entering get_citation_informations", verbose=9) citation_informations = get_citation_informations(recids, tags, config) write_message("Entering ref_analyzer", verbose=9) # call the analyser that uses the citation_informations to really # search x-cites-y in the coll.. 
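# ref_analyzer() returns two dicts keyed by the processed recids: citations (records citing each recid) and references (records each recid cites)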
return ref_analyzer(citation_informations, recids, tags, config) def get_bibrankmethod_lastupdate(rank_method_code): """Return the last excution date of bibrank method """ query = """SELECT DATE_FORMAT(last_updated, '%%Y-%%m-%%d %%H:%%i:%%s') FROM rnkMETHOD WHERE name =%s""" last_update_time = run_sql(query, [rank_method_code]) try: r = last_update_time[0][0] except IndexError: r = "0000-00-00 00:00:00" return r def get_bibindex_update_time(): """Return the last indexing date of the journals and report number indexes """ try: # check indexing times of `journal' and `reportnumber` # indexes, and only fetch records which have been indexed sql = "SELECT DATE_FORMAT(MIN(last_updated), " \ "'%%Y-%%m-%%d %%H:%%i:%%s') FROM idxINDEX WHERE name IN (%s,%s)" index_update_time = run_sql(sql, ('journal', 'reportnumber'), 1)[0][0] except IndexError: write_message("Not running citation indexer since journal/reportnumber" " indexes are not created yet.") index_update_time = "0000-00-00 00:00:00" return index_update_time def get_modified_recs(bibrank_method_lastupdate, indexes_lastupdate): """Get records to be updated by bibrank indexing Return the list of records which have been modified between the last execution of bibrank method and the latest journal/report index updates. The result is expected to have ascending id order. """ query = """SELECT id FROM bibrec WHERE modification_date >= %s AND modification_date < %s ORDER BY id ASC""" records = run_sql(query, (bibrank_method_lastupdate, indexes_lastupdate)) return [r[0] for r in records] def format_journal(format_string, mappings): """format the publ infostring according to the format""" def replace(char, data): return data.get(char, char) return ''.join(replace(c, mappings) for c in format_string) def get_tags_config(config): """Fetch needs config from our config file""" # Probably "citation" unless this file gets renamed function = config.get("rank_method", "function") write_message("config function %s" % function, verbose=9) tags = {} # 037a: contains (often) the "hep-ph/0501084" tag of THIS record try: tag = config.get(function, "primary_report_number") except ConfigParser.NoOptionError: tags['record_pri_number'] = None else: tags['record_pri_number'] = tagify(parse_tag(tag)) # 088a: additional short identifier for the record try: tag = config.get(function, "additional_report_number") except ConfigParser.NoOptionError: tags['record_add_number'] = None else: tags['record_add_number'] = tagify(parse_tag(tag)) # 999C5r. this is in the reference list, refers to other records. # Looks like: hep-ph/0408002 try: tag = config.get(function, "reference_via_report_number") except ConfigParser.NoOptionError: tags['refs_report_number'] = None else: tags['refs_report_number'] = tagify(parse_tag(tag)) # 999C5s. this is in the reference list, refers to other records. # Looks like: Phys.Rev.,A21,78 try: tag = config.get(function, "reference_via_pubinfo") except ConfigParser.NoOptionError: tags['refs_journal'] = None else: tags['refs_journal'] = tagify(parse_tag(tag)) # 999C5a. this is in the reference list, refers to other records. # Looks like: 10.1007/BF03170733 try: tag = config.get(function, "reference_via_doi") except ConfigParser.NoOptionError: tags['refs_doi'] = None else: tags['refs_doi'] = tagify(parse_tag(tag)) # 999C50. this is in the reference list, refers to other records. 
# Looks like: 1205 try: tag = config.get(function, "reference_via_record_id") except ConfigParser.NoOptionError: tags['refs_record_id'] = None else: tags['refs_record_id'] = tagify(parse_tag(tag)) # 999C5i. this is in the reference list, refers to other records. # Looks like: 9781439520031 try: tag = config.get(function, "reference_via_isbn") except ConfigParser.NoOptionError: tags['refs_isbn'] = None else: tags['refs_isbn'] = tagify(parse_tag(tag)) # Fields needed to construct the journals for this record try: tag = { 'pages': config.get(function, "pubinfo_journal_page"), 'year': config.get(function, "pubinfo_journal_year"), 'journal': config.get(function, "pubinfo_journal_title"), 'volume': config.get(function, "pubinfo_journal_volume"), } except ConfigParser.NoOptionError: tags['publication'] = None else: tags['publication'] = { 'pages': tagify(parse_tag(tag['pages'])), 'year': tagify(parse_tag(tag['year'])), 'journal': tagify(parse_tag(tag['journal'])), 'volume': tagify(parse_tag(tag['volume'])), } # Fields needed to lookup the DOIs tags['doi'] = get_field_tags('doi') # Fields needed to lookup the ISBN tags['isbn'] = get_field_tags('isbn') # 999C5s. A standardized way of writing a reference in the reference list. # Like: Nucl. Phys. B 710 (2000) 371 try: tags['publication_format'] = config.get(function, "pubinfo_journal_format") except ConfigParser.NoOptionError: tags['publication_format'] = CFG_JOURNAL_PUBINFO_STANDARD_FORM # Print values of tags for debugging write_message("tag values: %r" % [tags], verbose=9) return tags def get_journal_info(record, tags): """Fetch journal info from given record""" record_info = [] journals_fields = record.find_fields(tags['publication']['journal'][:5]) for field in journals_fields: # we store the tags and their values here # like c->444 y->1999 p->"journal of foo", # v->20 tagsvalues = {} try: tmp = field.get_subfield_values(tags['publication']['journal'][5])[0] except IndexError: pass else: tagsvalues["p"] = tmp try: tmp = field.get_subfield_values(tags['publication']['volume'][5])[0] except IndexError: pass else: tagsvalues["v"] = tmp try: tmp = field.get_subfield_values(tags['publication']['year'][5])[0] except IndexError: pass else: tagsvalues["y"] = tmp try: tmp = field.get_subfield_values(tags['publication']['pages'][5])[0] except IndexError: pass else: # if the page numbers have "x-y" take just x tagsvalues["c"] = tmp.split('-', 1)[0] # check if we have the required data ok = True for c in tags['publication_format']: if c in ('p', 'v', 'y', 'c'): if c not in tagsvalues: ok = False if ok: publ = format_journal(tags['publication_format'], tagsvalues) record_info += [publ] alt_volume = get_alt_volume(tagsvalues['v']) if alt_volume: tagsvalues2 = tagsvalues.copy() tagsvalues2['v'] = alt_volume publ = format_journal(tags['publication_format'], tagsvalues2) record_info += [publ] # Add codens for coden in get_kb_mappings('CODENS', value=tagsvalues['p']): tagsvalues2 = tagsvalues.copy() tagsvalues2['p'] = coden['key'] publ = format_journal(tags['publication_format'], tagsvalues2) record_info += [publ] return record_info def get_alt_volume(volume): """Get alternate volume form We handle the inversed volume letter bug Some metadata is wrong which leads to journals with the volume letter at the end. e.g. 
Phys.Rev.,51B,1 instead of Phys.Rev.,B51,1 """ alt_volume = None if re.match(ur'[a-zA-Z]\d+', volume, re.U|re.I): alt_volume = volume[1:] + volume[0] elif re.match(ur'\d+[a-zA-Z]', volume, re.U|re.I): alt_volume = volume[-1] + volume[:-1] return alt_volume def get_citation_informations(recid_list, tags, config, fetch_catchup_info=True): """Scans the collections searching references (999C5x -fields) and citations for items in the recid_list returns a 4 list of dictionaries that contains the citation information of cds records examples: [ {} {} {} {} ] [ {5: 'SUT-DP-92-70-5'}, { 93: ['astro-ph/9812088']}, { 93: ['Phys. Rev. Lett. 96 (2006) 081301'] }, {} ] NB: stuff here is for analysing new or changed records. see "ref_analyzer" for more. """ begin_time = os.times()[4] records_info = { 'report-numbers': {}, 'journals': {}, 'doi': {}, 'hdl': {}, 'isbn': {}, 'record_id': {}, } references_info = { 'report-numbers': {}, 'journals': {}, 'doi': {}, 'record_id': {}, 'isbn': {}, 'hdl': {}, } # perform quick check to see if there are some records with # reference tags, because otherwise get.cit.inf would be slow even # if there is nothing to index: for done, recid in enumerate(recid_list): if done % 10 == 0: task_sleep_now_if_required() if done % 50 == 0: mesg = "get cit.inf done %s of %s" % (done, len(recid_list)) write_message(mesg) task_update_progress(mesg) record = get_record(recid) records_info['record_id'][recid] = [unicode(recid)] function = config.get("rank_method", "function") if config.get(function, 'collections'): if recid not in recids_cache(config.get(function, 'collections')): # do not treat this record since it is not in the collections # we want to process continue elif recid in deleted_recids_cache(): # do not treat this record since it was deleted; we # skip it like this in case it was only soft-deleted # e.g. via bibedit (i.e. 
when collection tag 980 is # DELETED but other tags like report number or journal # publication info remained the same, so the calls to # get_fieldvalues() below would return old values) continue if tags['refs_report_number']: references_info['report-numbers'][recid] = [t.value for t in record.find_subfields(tags['refs_report_number'])] msg = "references_info['report-numbers'][%s] = %r" \ % (recid, references_info['report-numbers'][recid]) write_message(msg, verbose=9) if tags['refs_journal']: references_info['journals'][recid] = [] for ref in record.find_subfields(tags['refs_journal']): try: # Inspire specific parsing journal, volume, page = ref.value.split(',') except ValueError: pass else: alt_volume = get_alt_volume(volume) if alt_volume: alt_ref = ','.join([journal, alt_volume, page]) references_info['journals'][recid] += [alt_ref] references_info['journals'][recid] += [ref.value] msg = "references_info['journals'][%s] = %r" \ % (recid, references_info['journals'][recid]) write_message(msg, verbose=9) if tags['refs_doi']: references = [t.value for t in record.find_subfields(tags['refs_doi'])] dois = [] hdls = [] for ref in references: if ref.startswith("hdl:"): hdls.append(ref[4:]) elif ref.startswith("doi:"): dois.append(ref[4:]) else: dois.append(ref) references_info['doi'][recid] = dois references_info['hdl'][recid] = hdls msg = "references_info['doi'][%s] = %r" % (recid, dois) write_message(msg, verbose=9) msg = "references_info['hdl'][%s] = %r" % (recid, hdls) write_message(msg, verbose=9) if tags['refs_record_id']: references_info['record_id'][recid] = [t.value for t in record.find_subfields(tags['refs_record_id'])] msg = "references_info['record_id'][%s] = %r" \ % (recid, references_info['record_id'][recid]) write_message(msg, verbose=9) if tags['refs_isbn']: references_info['isbn'][recid] = [t.value for t in record.find_subfields(tags['refs_isbn'])] msg = "references_info['isbn'][%s] = %r" \ % (recid, references_info['isbn'][recid]) write_message(msg, verbose=9) if not fetch_catchup_info: # We do not need the extra info continue if tags['record_pri_number'] or tags['record_add_number']: records_info['report-numbers'][recid] = [] if tags['record_pri_number']: records_info['report-numbers'][recid] += [t.value for t in record.find_subfields(tags['record_pri_number'])] if tags['record_add_number']: records_info['report-numbers'][recid] += [t.value for t in record.find_subfields(tags['record_add_number'])] msg = "records_info[%s]['report-numbers'] = %r" \ % (recid, records_info['report-numbers'][recid]) write_message(msg, verbose=9) if tags['doi']: records_info['doi'][recid] = [] records_info['hdl'][recid] = [] for tag in tags['doi']: for field in record.find_fields(tag[:5]): if 'DOI' in field.get_subfield_values('2'): dois = field.get_subfield_values('a') records_info['doi'][recid].extend(dois) elif 'HDL' in field.get_subfield_values('2'): hdls = field.get_subfield_values('a') records_info['hdl'][recid].extend(hdls) msg = "records_info[%s]['doi'] = %r" \ % (recid, records_info['doi'][recid]) write_message(msg, verbose=9) msg = "records_info[%s]['hdl'] = %r" \ % (recid, records_info['hdl'][recid]) write_message(msg, verbose=9) if tags['isbn']: records_info['isbn'][recid] = [] for tag in tags['isbn']: values = [t.value for t in record.find_subfields(tag)] records_info['isbn'][recid] += values msg = "records_info[%s]['isbn'] = %r" \ % (recid, records_info['isbn'][recid]) write_message(msg, verbose=9) # get a combination of # journal vol (year) pages if tags['publication']: 
records_info['journals'][recid] = get_journal_info(record, tags) msg = "records_info[%s]['journals'] = %r" \ % (recid, records_info['journals'][recid]) write_message(msg, verbose=9) mesg = "get cit.inf done fully" write_message(mesg) task_update_progress(mesg) end_time = os.times()[4] write_message("Execution time for generating citation info " "from record: %.2f sec" % (end_time - begin_time)) return records_info, references_info def standardize_report_number(report_number): """Format the report number to a standard form. Currently we: * remove category for arxiv papers """ report_number = re.sub(ur'(?:arXiv:)?(\d{4}\.\d{4}) \[[a-zA-Z\.-]+\]', ur'arXiv:\g<1>', report_number, re.I | re.U) return report_number def ref_analyzer(citation_informations, updated_recids, tags, config): """Analyze the citation informations and calculate the citation weight and cited by list dictionary. """ citations = {} for recid in updated_recids: citations[recid] = set() references = {} for recid in updated_recids: references[recid] = set() def step(msg_prefix, recid, done, total): if done % 30 == 0: task_sleep_now_if_required() if done % 1000 == 0: mesg = "%s done %s of %s" % (msg_prefix, done, total) write_message(mesg) task_update_progress(mesg) write_message("Processing: %s" % recid, verbose=9) def add_to_cites(citer, citee): # Make sure we don't add ourselves # Workaround till we know why we are adding ourselves. if citer == citee: return citations[citee].add(citer) if citer in updated_recids: references[citer].add(citee) def add_to_refs(citer, citee): # Make sure we don't add ourselves # Workaround till we know why we are adding ourselves. if citer == citee: return if citee in updated_recids: citations[citee].add(citer) references[citer].add(citee) # dict of recid -> institute_give_publ_id records_info, references_info = citation_informations t1 = os.times()[4] # Try to find references based on 999C5r # e.g 8 -> ([astro-ph/9889],[hep-ph/768]) # meaning: rec 8 contains these in bibliography write_message("Phase 1: Report numbers references") done = 0 for thisrecid, refnumbers in references_info['report-numbers'].iteritems(): step("Report numbers references", thisrecid, done, len(references_info['report-numbers'])) done += 1 for refnumber in (r for r in refnumbers if r): field = 'reportnumber' refnumber = standardize_report_number(refnumber) # Search for "hep-th/5644654 or such" in existing records recids = get_recids_matching_query(p=refnumber, f=field, config=config) write_message("These match searching %s in %s: %s" % (refnumber, field, list(recids)), verbose=9) if not recids: insert_into_missing(thisrecid, refnumber) else: remove_from_missing(refnumber) if len(recids) > 1: store_citation_warning('multiple-matches', refnumber) msg = "Whoops: record '%d' report number value '%s' " \ "matches many records; taking only the first one. %s" % \ (thisrecid, refnumber, repr(recids)) write_message(msg, stream=sys.stderr) for recid in list(recids)[:1]: # take only the first one add_to_refs(thisrecid, recid) mesg = "done fully" write_message(mesg) task_update_progress(mesg) t2 = os.times()[4] # Try to find references based on 999C5s # e.g. Phys.Rev.Lett. 
53 (1986) 2285 write_message("Phase 2: Journal references") done = 0 for thisrecid, refs in references_info['journals'].iteritems(): step("Journal references", thisrecid, done, len(references_info['journals'])) done += 1 for reference in (r for r in refs if r): p = reference field = 'journal' # check reference value to see whether it is well formed: if not re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK.match(p): store_citation_warning('not-well-formed', p) msg = "Whoops, record '%d' reference value '%s' " \ "is not well formed; skipping it." % (thisrecid, p) write_message(msg, stream=sys.stderr) continue # skip this ill-formed value recids = get_recids_matching_query(p=p, f=field, config=config) write_message("These match searching %s in %s: %s" % (reference, field, list(recids)), verbose=9) if not recids: insert_into_missing(thisrecid, p) else: remove_from_missing(p) if len(recids) > 1: store_citation_warning('multiple-matches', p) msg = "Whoops: record '%d' reference value '%s' " \ "matches many records; taking only the first one. %s" % \ (thisrecid, p, repr(recids)) write_message(msg, stream=sys.stderr) for recid in list(recids)[:1]: # take only the first one add_to_refs(thisrecid, recid) mesg = "done fully" write_message(mesg) task_update_progress(mesg) t3 = os.times()[4] # Try to find references based on 999C5a # e.g. 10.1007/BF03170733 write_message("Phase 3: DOI references") done = 0 for thisrecid, refs in references_info['doi'].iteritems(): step("DOI references", thisrecid, done, len(references_info['doi'])) done += 1 for reference in (r for r in refs if r): p = reference field = 'doi' recids = get_recids_matching_query(p=p, f=field, config=config) write_message("These match searching %s in %s: %s" % (reference, field, list(recids)), verbose=9) if not recids: insert_into_missing(thisrecid, p) else: remove_from_missing(p) if len(recids) > 1: store_citation_warning('multiple-matches', p) msg = "Whoops: record '%d' DOI value '%s' " \ "matches many records; taking only the first one. %s" % \ (thisrecid, p, repr(recids)) write_message(msg, stream=sys.stderr) for recid in list(recids)[:1]: # take only the first one add_to_refs(thisrecid, recid) mesg = "done fully" write_message(mesg) task_update_progress(mesg) t4 = os.times()[4] # Try to find references based on 999C5a (hdl references) # e.g. 4263537/4000 write_message("Phase 4: HDL references") done = 0 for thisrecid, refs in references_info['hdl'].iteritems(): step("HDL references", thisrecid, done, len(references_info['hdl'])) done += 1 for reference in (r for r in refs if r): p = reference field = 'hdl' recids = get_recids_matching_query(p=p, f=field, config=config) write_message("These match searching %s in %s: %s" % (reference, field, list(recids)), verbose=9) if not recids: insert_into_missing(thisrecid, p) else: remove_from_missing(p) if len(recids) > 1: store_citation_warning('multiple-matches', p) msg = "Whoops: record '%d' HDL value '%s' " \ "matches many records; taking only the first one. %s" % \ (thisrecid, p, repr(recids)) write_message(msg, stream=sys.stderr) for recid in list(recids)[:1]: # take only the first one add_to_refs(thisrecid, recid) mesg = "done fully" write_message(mesg) task_update_progress(mesg) t5 = os.times()[4] # Try to find references based on 999C50 # e.g. 
1244 write_message("Phase 5: Record ID references") done = 0 for thisrecid, refs in references_info['record_id'].iteritems(): step("Record ID references", thisrecid, done, len(references_info['record_id'])) done += 1 field = "001" for recid in (r for r in refs if r): valid = get_recids_matching_query(p=recid, f=field, config=config) write_message("These match searching %s in %s: %s" % (recid, field, list(valid)), verbose=9) if valid: add_to_refs(thisrecid, valid[0]) mesg = "done fully" write_message(mesg) task_update_progress(mesg) t6 = os.times()[4] # Try to find references based on 999C5i # e.g. 978-3-942171-73-1 write_message("Phase 6: ISBN references") done = 0 for thisrecid, refs in references_info['isbn'].iteritems(): step("ISBN references", thisrecid, done, len(references_info['isbn'])) done += 1 for reference in (r for r in refs if r): p = reference field = 'isbn' recids = get_recids_matching_query(p=p, f=field, config=config) write_message("These match searching %s in %s: %s" % (reference, field, list(recids)), verbose=9) if not recids: insert_into_missing(thisrecid, p) else: remove_from_missing(p) if len(recids) > 1: store_citation_warning('multiple-matches', p) msg = "Whoops: record '%d' ISBN value '%s' " \ "matches many records; taking only the first one. %s" % \ (thisrecid, p, repr(recids)) write_message(msg, stream=sys.stderr) for recid in list(recids)[:1]: # take only the first one add_to_refs(thisrecid, recid) mesg = "done fully" write_message(mesg) task_update_progress(mesg) t7 = os.times()[4] # Search for stuff like CERN-TH-4859/87 in list of refs write_message("Phase 7: report numbers catchup") done = 0 for thisrecid, reportcodes in records_info['report-numbers'].iteritems(): step("Report numbers catchup", thisrecid, done, len(records_info['report-numbers'])) done += 1 for reportcode in (r for r in reportcodes if r): if reportcode.startswith('arXiv'): std_reportcode = standardize_report_number(reportcode) report_pattern = r'^%s( *\[[a-zA-Z.-]*\])?' % \ re.escape(std_reportcode) recids = get_recids_matching_query(p=report_pattern, f=tags['refs_report_number'], m='r', config=config) else: recids = get_recids_matching_query(p=reportcode, f=tags['refs_report_number'], config=config) for recid in recids: add_to_cites(recid, thisrecid) mesg = "done fully" write_message(mesg) task_update_progress(mesg) # Find this record's pubinfo in other records' bibliography write_message("Phase 8: journals catchup") done = 0 t8 = os.times()[4] for thisrecid, rec_journals in records_info['journals'].iteritems(): step("Journals catchup", thisrecid, done, len(records_info['journals'])) done += 1 for journal in rec_journals: journal = journal.replace("\"", "") # Search the publication string like # Phys. 
Lett., B 482 (2000) 417 in 999C5s recids = get_recids_matching_query(p=journal, f=tags['refs_journal'], config=config) write_message("These records match %s in %s: %s" % (journal, tags['refs_journal'], list(recids)), verbose=9) for recid in recids: add_to_cites(recid, thisrecid) mesg = "done fully" write_message(mesg) task_update_progress(mesg) write_message("Phase 9: DOI catchup") done = 0 t9 = os.times()[4] for thisrecid, dois in records_info['doi'].iteritems(): step("DOI catchup", thisrecid, done, len(records_info['doi'])) done += 1 for doi in dois: recids = get_recids_matching_query(p=doi, f=tags['refs_doi'], config=config) write_message("These records match %s in %s: %s" % (doi, tags['refs_doi'], list(recids)), verbose=9) for recid in recids: add_to_cites(recid, thisrecid) mesg = "done fully" write_message(mesg) task_update_progress(mesg) write_message("Phase 10: HDL catchup") done = 0 t10 = os.times()[4] for thisrecid, hdls in records_info['hdl'].iteritems(): step("HDL catchup", thisrecid, done, len(records_info['hdl'])) done += 1 for hdl in hdls: recids = get_recids_matching_query(p=hdl, f=tags['refs_doi'], config=config) write_message("These records match %s in %s: %s" % (hdl, tags['refs_doi'], list(recids)), verbose=9) for recid in recids: add_to_cites(recid, thisrecid) mesg = "done fully" write_message(mesg) task_update_progress(mesg) write_message("Phase 11: ISBN catchup") done = 0 t11 = os.times()[4] for thisrecid, isbns in records_info['isbn'].iteritems(): step("ISBN catchup", thisrecid, done, len(records_info['isbn'])) done += 1 for isbn in isbns: recids = get_recids_matching_query(p=isbn, f=tags['refs_isbn'], config=config) write_message("These records match %s in %s: %s" % (isbn, tags['refs_isbn'], list(recids)), verbose=9) for recid in recids: add_to_cites(recid, thisrecid) write_message("Phase 12: Record ID catchup") done = 0 t12 = os.times()[4] for thisrecid, record_ids in records_info['record_id'].iteritems(): step("Record ID catchup", thisrecid, done, len(records_info['record_id'])) done += 1 for record_id in record_ids: recids = get_recids_matching_query(p=record_id, f=tags['refs_record_id'], config=config) write_message("These records match %s in %s: %s" % (record_id, tags['refs_record_id'], list(recids)), verbose=9) for recid in recids: add_to_cites(recid, thisrecid) mesg = "done fully" write_message(mesg) task_update_progress(mesg) if task_get_task_param('verbose') >= 3: # Print only X first to prevent flood write_message("citation_list (x is cited by y):") write_message(dict(islice(citations.iteritems(), 10))) write_message("size: %s" % len(citations)) write_message("reference_list (x cites y):") write_message(dict(islice(references.iteritems(), 10))) write_message("size: %s" % len(references)) t13 = os.times()[4] write_message("Execution time for analyzing the citation information " "generating the dictionary:") write_message("... checking ref report numbers: %.2f sec" % (t2-t1)) write_message("... checking ref journals: %.2f sec" % (t3-t2)) write_message("... checking ref DOI: %.2f sec" % (t4-t3)) write_message("... checking ref HDL: %.2f sec" % (t5-t4)) write_message("... checking ref Record ID: %.2f sec" % (t6-t5)) write_message("... checking ref ISBN: %.2f sec" % (t7-t6)) write_message("... checking rec report numbers: %.2f sec" % (t8-t7)) write_message("... checking rec journals: %.2f sec" % (t9-t8)) write_message("... checking rec DOI: %.2f sec" % (t10-t9)) write_message("... checking rec HDL: %.2f sec" % (t11-t10)) write_message("... 
checking rec ISBN: %.2f sec" % (t12-t11)) write_message("... checking rec Record ID: %.2f sec" % (t13-t12)) write_message("... total time of ref_analyze: %.2f sec" % (t13-t1)) return citations, references def compute_refs_diff(recid, new_refs): """ Given a set of references for a record, returns how many references were added to it. The value can be negative which means the record lost citations. """ old_refs = set(row[0] for row in run_sql("""SELECT citee FROM rnkCITATIONDICT WHERE citer = %s""", [recid])) refs_to_add = new_refs - old_refs refs_to_delete = old_refs - new_refs return len(refs_to_add) - len(refs_to_delete) def compute_cites_diff(recid, new_cites): """ This function does the same thing as compute_refs_diff but with citations. """ old_cites = set(row[0] for row in run_sql("""SELECT citer FROM rnkCITATIONDICT WHERE citee = %s""", [recid])) cites_to_add = new_cites - old_cites cites_to_delete = old_cites - new_cites return len(cites_to_add) - len(cites_to_delete) def compute_dicts_diff(recids, refs, cites): """ Given the new dictionaries for references and citations, computes how many references were added or removed by comparing them to the current stored in the database. """ cites_diff = 0 for recid in recids: cites_diff += compute_refs_diff(recid, refs[recid]) cites_diff += compute_cites_diff(recid, cites[recid]) return cites_diff def store_dicts(recids, refs, cites): """Insert the reference and citation list into the database""" for recid in recids: replace_refs(recid, refs[recid]) replace_cites(recid, cites[recid]) def replace_refs(recid, new_refs): """ Given a set of references, replaces the references of given recid in the database. The changes are logged into rnkCITATIONLOG. """ old_refs = set(row[0] for row in run_sql("""SELECT citee FROM rnkCITATIONDICT WHERE citer = %s""", [recid])) refs_to_add = new_refs - old_refs refs_to_delete = old_refs - new_refs for ref in refs_to_add: write_message('adding ref %s %s' % (recid, ref), verbose=1) now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) run_sql("""INSERT INTO rnkCITATIONDICT (citer, citee, last_updated) VALUES (%s, %s, %s)""", (recid, ref, now)) run_sql("""INSERT INTO rnkCITATIONLOG (citer, citee, type, action_date) VALUES (%s, %s, %s, %s)""", (recid, ref, 'added', now)) for ref in refs_to_delete: write_message('deleting ref %s %s' % (recid, ref), verbose=1) now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) run_sql("""DELETE FROM rnkCITATIONDICT WHERE citer = %s and citee = %s""", (recid, ref)) run_sql("""INSERT INTO rnkCITATIONLOG (citer, citee, type, action_date) VALUES (%s, %s, %s, %s)""", (recid, ref, 'removed', now)) def replace_cites(recid, new_cites): """ Given a set of citations, replaces the citations of given recid in the database. The changes are logged into rnkCITATIONLOG. 
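Citers present in new_cites but not yet stored are inserted; stored citers that are no longer present are removed.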
See @replace_refs """ old_cites = set(row[0] for row in run_sql("""SELECT citer FROM rnkCITATIONDICT WHERE citee = %s""", [recid])) cites_to_add = new_cites - old_cites cites_to_delete = old_cites - new_cites for cite in cites_to_add: write_message('adding cite %s %s' % (recid, cite), verbose=1) now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) run_sql("""INSERT INTO rnkCITATIONDICT (citee, citer, last_updated) VALUES (%s, %s, %s)""", (recid, cite, now)) run_sql("""INSERT INTO rnkCITATIONLOG (citee, citer, type, action_date) VALUES (%s, %s, %s, %s)""", (recid, cite, 'added', now)) for cite in cites_to_delete: write_message('deleting cite %s %s' % (recid, cite), verbose=1) now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) run_sql("""DELETE FROM rnkCITATIONDICT WHERE citee = %s and citer = %s""", (recid, cite)) run_sql("""INSERT INTO rnkCITATIONLOG (citee, citer, type, action_date) VALUES (%s, %s, %s, %s)""", (recid, cite, 'removed', now)) def insert_into_missing(recid, report): """Mark reference string as missing. If a reference is a report number / journal / DOI but we do not have the corresping record in the database, we mark that particualar reference string as missing, by adding a row in rnkCITATIONDATAEXT. The recid represents the record containing the reference string. """ if len(report) >= 255: # Invalid report, it is too long # and does not fit in the database column # (currently varchar 255) return wasalready = run_sql("""SELECT id_bibrec FROM rnkCITATIONDATAEXT WHERE id_bibrec = %s AND extcitepubinfo = %s""", (recid, report)) if not wasalready: run_sql("""INSERT INTO rnkCITATIONDATAEXT(id_bibrec, extcitepubinfo) VALUES (%s,%s)""", (recid, report)) def remove_from_missing(report): """Remove the reference string from the missing table See @insert_into_missing""" run_sql("""DELETE FROM rnkCITATIONDATAEXT WHERE extcitepubinfo = %s""", (report,)) def print_missing(num): """ Print the contents of rnkCITATIONDATAEXT table containing external records that were cited by NUM or more internal records. NUM is by default taken from the -E command line option. """ if not num: num = task_get_option("print-extcites") write_message("Listing external papers cited by %i or more \ internal records:" % num) res = run_sql("""SELECT COUNT(id_bibrec), extcitepubinfo FROM rnkCITATIONDATAEXT GROUP BY extcitepubinfo HAVING COUNT(id_bibrec) >= %s ORDER BY COUNT(id_bibrec) DESC""", (num,)) for cnt, brec in res: print str(cnt), "\t", brec write_message("Listing done.") def tagify(parsedtag): """aux auf to make '100__a' out of ['100','','','a']""" tag = "" for t in parsedtag: if t == '': t = '_' tag += t return tag def store_citation_warning(warning_type, cit_info): """Store citation indexing warnings in the database If we encounter a problem during the citation indexing, such as multiple results for a report number, we store a warning in rnkCITATIONDATAERR """ r = run_sql("""SELECT 1 FROM rnkCITATIONDATAERR WHERE type = %s AND citinfo = %s""", (warning_type, cit_info)) if not r: run_sql("""INSERT INTO rnkCITATIONDATAERR (type, citinfo) VALUES (%s, %s)""", (warning_type, cit_info)) diff --git a/modules/bibrank/lib/bibrank_citerank_indexer.py b/modules/bibrank/lib/bibrank_citerank_indexer.py index 93377acb8..e7b3279e7 100644 --- a/modules/bibrank/lib/bibrank_citerank_indexer.py +++ b/modules/bibrank/lib/bibrank_citerank_indexer.py @@ -1,881 +1,881 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2009, 2010, 2011 CERN. +# Copyright (C) 2009, 2010, 2011, 2016 CERN. 
# # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """Implementation of different ranking methods based on the citation graph: - citation count/ time decayed citation count - pagerank / pagerank with external citations - time decayed pagerank """ # pylint: disable=E0611 import ConfigParser from math import exp import datetime import time import re import sys try: from numpy import array, ones, zeros, int32, float32, sqrt, dot import_numpy = 1 except ImportError: import_numpy = 0 if sys.hexversion < 0x2040000: # pylint: disable=W0622 from sets import Set as set # pylint: enable=W0622 from invenio.dbquery import run_sql, serialize_via_marshal from invenio.bibtask import write_message from invenio.config import CFG_ETCDIR def get_citations_from_file(filename): """gets the citation data (who cites who) from a file and returns - a dictionary of type x:{x1,x2..}, where x is cited by x1,x2.. - a dictionary of type a:{b}, where recid 'a' is asociated with an index 'b' """ cit = {} dict_of_ids = {} count = 0 try: citation_file = open(filename, "r") except StandardError: write_message("Cannot find file: %s" % filename, sys.stderr) raise StandardError for line in citation_file: tokens = line.strip().split() recid_cites = int(tokens[0]) recid_cited = int(tokens[1]) if recid_cited not in cit: cit[recid_cited] = [] #without this, duplicates might be introduced if recid_cites not in cit[recid_cited] and recid_cites != recid_cited: cit[recid_cited].append(recid_cites) if recid_cites not in dict_of_ids: dict_of_ids[recid_cites] = count count += 1 if recid_cited not in dict_of_ids: dict_of_ids[recid_cited] = count count += 1 citation_file.close() write_message("Citation data collected from file: %s" %filename, verbose=2) write_message("Ids and recids corespondace: %s" \ %str(dict_of_ids), verbose=9) write_message("Citations: %s" % str(cit), verbose=9) return cit, dict_of_ids def get_citations_from_db(): """gets the citation data (who cites who) from the rnkCITATIONDATA table, and returns: -a dictionary of type x:{x1,x2..}, where x is cited by x1,x2.. 
-a dict of type a:{b} where recid 'a' is asociated with an index 'b'""" dict_of_ids = {} cit = {} rows = run_sql("SELECT citer, citee FROM rnkCITATIONDICT") for citer, citee in rows: cit.setdefault(citee, set()).add(citer) count = 0 for item in cit: if item in cit[item]: cit[item].remove(item) if item not in dict_of_ids: dict_of_ids[item] = count count += 1 for value in cit[item]: if value not in dict_of_ids: dict_of_ids[value] = count count += 1 write_message("Citation data collected", verbose=2) write_message("Ids and recids correspondence: %s" \ % str(dict_of_ids), verbose=9) write_message("Citations: %s" % str(cit), verbose=9) return cit, dict_of_ids def construct_ref_array(cit, dict_of_ids, len_): """returns an array with the number of references that each recid has """ ref = array((), int32) ref = zeros(len_, int32) for key in cit: for value in cit[key]: ref[dict_of_ids[value]] += 1 write_message("Number of references: %s" %str(ref), verbose=9) write_message("Finished computing total number \ of references for each paper.", verbose=5) return ref def get_external_links_from_file(filename, ref, dict_of_ids): """returns a dictionary containing the number of external links for each recid external link=citation that is not in our database """ ext_links = {} #format: ext_links[dict_of_ids[recid]]=number of total external links try: external_file = open(filename, "r") except StandardError: write_message("Cannot find file: %s" % filename, sys.stderr) raise StandardError for line in external_file: tokens = line.strip().split() recid = int(tokens[0]) nr_of_external = int(tokens[1]) ext_links[dict_of_ids[recid]] = nr_of_external - ref[dict_of_ids[recid]] if ext_links[dict_of_ids[recid]] < 0: ext_links[dict_of_ids[recid]] = 0 external_file.close() write_message("External link information extracted", verbose=2) return ext_links def get_external_links_from_db_old(ref, dict_of_ids, reference_indicator): """returns a dictionary containing the number of external links for each recid external link=citation that is not in our database """ ext_links = {} reference_tag_regex = reference_indicator + "[a-z]" for recid in dict_of_ids: query = "select COUNT(DISTINCT field_number) from bibrec_bib99x \ where id_bibrec='%s' and id_bibxxx in \ (select id from bib99x where tag RLIKE '%s');" \ % (str(recid), reference_tag_regex) result_set = run_sql(query) if result_set: total_links = int(result_set[0][0]) internal_links = ref[dict_of_ids[recid]] ext_links[dict_of_ids[recid]] = total_links - internal_links if ext_links[dict_of_ids[recid]] < 0: ext_links[dict_of_ids[recid]] = 0 else: ext_links[dict_of_ids[recid]] = 0 write_message("External link information extracted", verbose=2) write_message("External links: %s" % str(ext_links), verbose=9) return ext_links def get_external_links_from_db(ref, dict_of_ids, reference_indicator): """returns a dictionary containing the number of external links for each recid external link=citation that is not in our database """ ext_links = {} dict_all_ref = {} for recid in dict_of_ids: dict_all_ref[recid] = 0 ext_links[dict_of_ids[recid]] = 0 reference_db_id = reference_indicator[0:2] reference_tag_regex = reference_indicator + "[a-z]" tag_list = run_sql("select id from bib" + reference_db_id + \ "x where tag RLIKE %s", (reference_tag_regex, )) tag_set = set() for tag in tag_list: tag_set.add(tag[0]) ref_list = run_sql("select id_bibrec, id_bibxxx, field_number from \ bibrec_bib" + reference_db_id + "x group by \ id_bibrec, field_number") for item in ref_list: recid = int(item[0]) 
id_bib = int(item[1]) if recid in dict_of_ids and id_bib in tag_set: dict_all_ref[recid] += 1 for recid in dict_of_ids: total_links = dict_all_ref[recid] internal_links = ref[dict_of_ids[recid]] ext_links[dict_of_ids[recid]] = total_links - internal_links if ext_links[dict_of_ids[recid]] < 0: ext_links[dict_of_ids[recid]] = 0 write_message("External link information extracted", verbose=2) write_message("External links: %s" % str(ext_links), verbose=9) return ext_links def avg_ext_links_with_0(ext_links): """returns the average number of external links per paper including in the counting the papers with 0 external links""" total = 0.0 for item in ext_links: total += ext_links[item] avg_ext = total/len(ext_links) write_message("The average number of external links per paper (including \ papers with 0 external links) is: %s" % str(avg_ext), verbose=3) return avg_ext def avg_ext_links_without_0(ext_links): """returns the average number of external links per paper excluding in the counting the papers with 0 external links""" count = 0.0 total = 0.0 for item in ext_links: if ext_links[item] != 0: count += 1 total += ext_links[item] avg_ext = total/count write_message("The average number of external links per paper (excluding \ papers with 0 external links) is: %s" % str(avg_ext), verbose=3) return avg_ext def leaves(ref): """returns the number of papers that do not cite any other paper""" nr_of_leaves = 0 for i in ref: if i == 0: nr_of_leaves += 1 write_message("The number of papers that do not cite \ any other papers: %s" % str(leaves), verbose=3) return nr_of_leaves def get_dates_from_file(filename, dict_of_ids): """Returns the year of the publication for each paper. In case the year is not in the db, the year of the submission is taken""" dates = {} # the format is: dates[dict_of_ids[recid]] = year try: dates_file = open(filename, "r") except StandardError: write_message("Cannot find file: %s" % filename, sys.stderr) raise StandardError for line in dates_file: tokens = line.strip().split() recid = int(tokens[0]) year = int(tokens[1]) dates[dict_of_ids[recid]] = year dates_file.close() write_message("Dates extracted", verbose=2) write_message("Dates dictionary %s" % str(dates), verbose=9) return dates def get_dates_from_db(dict_of_ids, publication_year_tag, creation_date_tag): """Returns the year of the publication for each paper. 
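The result is keyed by the matrix index from dict_of_ids rather than by recid.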
In case the year is not in the db, the year of the submission is taken""" current_year = int(datetime.datetime.now().strftime("%Y")) publication_year_db_id = publication_year_tag[0:2] creation_date_db_id = creation_date_tag[0:2] total = 0 count = 0 dict_of_dates = {} for recid in dict_of_ids: dict_of_dates[recid] = 0 date_list = run_sql("select id, tag, value from bib" + \ publication_year_db_id + "x where tag=%s", \ (publication_year_tag, )) date_dict = {} for item in date_list: date_dict[int(item[0])] = item[2] pattern = re.compile('.*(\d{4}).*') date_list = run_sql("select id_bibrec, id_bibxxx, field_number \ from bibrec_bib" + publication_year_db_id +"x") for item in date_list: recid = int(item[0]) id_ = int(item[1]) if id_ in date_dict and recid in dict_of_dates: reg = pattern.match(date_dict[id_]) if reg: date = int(reg.group(1)) if date > 1000 and date <= current_year: dict_of_dates[recid] = date total += date count += 1 not_covered = [] for recid in dict_of_dates: if dict_of_dates[recid] == 0: not_covered.append(recid) date_list = run_sql("select id, tag, value from bib" + \ creation_date_db_id + "x where tag=%s", \ (creation_date_tag, )) date_dict = {} for item in date_list: date_dict[int(item[0])] = item[2] date_list = run_sql("select id_bibrec, id_bibxxx, field_number \ from bibrec_bib" + creation_date_db_id + "x") for item in date_list: recid = int(item[0]) id_ = int(item[1]) if id_ in date_dict and recid in not_covered: date = int(str(date_dict[id_])[0:4]) if date > 1000 and date <= current_year: dict_of_dates[recid] = date total += date count += 1 dates = {} med = total/count for recid in dict_of_dates: if dict_of_dates[recid] == 0: dates[dict_of_ids[recid]] = med else: dates[dict_of_ids[recid]] = dict_of_dates[recid] write_message("Dates extracted", verbose=2) write_message("Dates dictionary %s" % str(dates), verbose=9) return dates def construct_sparse_matrix(cit, ref, dict_of_ids, len_, damping_factor): """returns several structures needed in the calculation of the PAGERANK method using this structures, we don't need to keep the full matrix in the memory""" sparse = {} for item in cit: for value in cit[item]: sparse[(dict_of_ids[item], dict_of_ids[value])] = \ damping_factor * 1.0/ref[dict_of_ids[value]] semi_sparse = [] for j in range(len_): if ref[j] == 0: semi_sparse.append(j) semi_sparse_coeficient = damping_factor/len_ #zero_coeficient = (1-damping_factor)/len_ write_message("Sparse information calculated", verbose=3) return sparse, semi_sparse, semi_sparse_coeficient def construct_sparse_matrix_ext(cit, ref, ext_links, dict_of_ids, alpha, beta): """if x doesn't cite anyone: cites everyone : 1/len_ -- should be used! 
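# Illustrative sketch of the sparse representation built by
# construct_sparse_matrix() above: only the non-zero transition entries are
# kept in a dict keyed by (row, column); papers with no references ("dangling"
# nodes) are listed separately and their uniform contribution d/N is folded in
# during the iteration.  The small citation sample below is hypothetical.
d = 0.85                              # damping factor
cit = {1: {2, 3}, 2: {3}}
dict_of_ids = {1: 0, 2: 1, 3: 2}
ref = [0, 1, 2]                       # out-degree per dense index
n = len(dict_of_ids)

sparse = {}
for citee, citers in cit.items():
    for citer in citers:
        # mass flowing from the citing paper j to the cited paper i: d / outdeg(j)
        sparse[(dict_of_ids[citee], dict_of_ids[citer])] = d / ref[dict_of_ids[citer]]

dangling = [j for j in range(n) if ref[j] == 0]
dangling_coef = d / n
print(sparse)                   # {(0, 1): 0.85, (0, 2): 0.425, (1, 2): 0.425}
print(dangling, dangling_coef)  # [0] 0.2833...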
returns several structures needed in the calculation of the PAGERANK_EXT method""" len_ = len(dict_of_ids) sparse = {} semi_sparse = {} sparse[0, 0] = 1.0 - alpha for j in range(len_): sparse[j+1, 0] = alpha/(len_) if j not in ext_links: sparse[0, j+1] = beta/(len_ + beta) else: if ext_links[j] == 0: sparse[0, j+1] = beta/(len_ + beta) else: aux = beta * ext_links[j] if ref[j] == 0: sparse[0, j+1] = aux/(aux + len_) else: sparse[0, j+1] = aux/(aux + ref[j]) if ref[j] == 0: semi_sparse[j+1] = (1.0 - sparse[0, j + 1])/len_ for item in cit: for value in cit[item]: sparse[(dict_of_ids[item] + 1, dict_of_ids[value] + 1)] = \ (1.0 - sparse[0, dict_of_ids[value] + 1])/ref[dict_of_ids[value]] #for i in range(len_ + 1): # a = "" # for j in range (len_ + 1): # if (i,j) in sparse: # a += str(sparse[(i,j)]) + "\t" # else: # a += "0\t" # print a #print semi_sparse write_message("Sparse information calculated", verbose=3) return sparse, semi_sparse def construct_sparse_matrix_time(cit, ref, dict_of_ids, \ damping_factor, date_coef): """returns several structures needed in the calculation of the PAGERANK_time method using this structures, we don't need to keep the full matrix in the memory""" len_ = len(dict_of_ids) sparse = {} for item in cit: for value in cit[item]: sparse[(dict_of_ids[item], dict_of_ids[value])] = damping_factor * \ date_coef[dict_of_ids[value]]/ref[dict_of_ids[value]] semi_sparse = [] for j in range(len_): if ref[j] == 0: semi_sparse.append(j) semi_sparse_coeficient = damping_factor/len_ #zero_coeficient = (1-damping_factor)/len_ write_message("Sparse information calculated", verbose=3) return sparse, semi_sparse, semi_sparse_coeficient def statistics_on_sparse(sparse): """returns the number of papers that cite themselves""" count_diag = 0 for (i, j) in sparse.keys(): if i == j: count_diag += 1 write_message("The number of papers that cite themselves: %s" % \ str(count_diag), verbose=3) return count_diag def pagerank(conv_threshold, check_point, len_, sparse, \ semi_sparse, semi_sparse_coef): """the core function of the PAGERANK method returns an array with the ranks coresponding to each recid""" weights_old = ones((len_), float32) # initial weights weights_new = array((), float32) converged = False nr_of_check_points = 0 difference = len_ while not converged: nr_of_check_points += 1 for step in (range(check_point)): weights_new = zeros((len_), float32) for (i, j) in sparse.keys(): weights_new[i] += sparse[(i, j)]*weights_old[j] semi_total = 0.0 for j in semi_sparse: semi_total += weights_old[j] weights_new = weights_new + semi_sparse_coef * semi_total + \ (1.0/len_ - semi_sparse_coef) * sum(weights_old) if step == check_point - 1: diff = weights_new - weights_old difference = sqrt(dot(diff, diff))/len_ write_message("Finished step: %s, %s " \ %(str(check_point*(nr_of_check_points-1) + step), \ str(difference)), verbose=5) weights_old = weights_new.copy() converged = (difference < conv_threshold) write_message("PageRank calculated for all recids finnished in %s steps. 
\ The threshold was %s" % (str(nr_of_check_points), str(difference)),\ verbose=2) return weights_old def pagerank_ext(conv_threshold, check_point, len_, sparse, semi_sparse): """the core function of the PAGERANK_EXT method returns an array with the ranks coresponding to each recid""" weights_old = array((), float32) weights_old = ones((len_), float32) weights_new = array((), float32) converged = False nr_of_check_points = 0 difference = len_ while not converged: nr_of_check_points += 1 for step in (range(check_point)): weights_new = zeros((len_), float32) for (i, j) in sparse.keys(): weights_new[i] += sparse[(i, j)]*weights_old[j] total_sum = 0.0 for j in semi_sparse: total_sum += semi_sparse[j]*weights_old[j] weights_new[1:len_] = weights_new[1:len_] + total_sum if step == check_point - 1: diff = weights_new - weights_old difference = sqrt(dot(diff, diff))/len_ write_message("Finished step: %s, %s " \ % (str(check_point*(nr_of_check_points-1) + step), \ str(difference)), verbose=5) weights_old = weights_new.copy() converged = (difference < conv_threshold) write_message("PageRank calculated for all recids finnished in %s steps. \ The threshold was %s" % (str(nr_of_check_points), \ str(difference)), verbose=2) #return weights_old[1:len_]/(len_ - weights_old[0]) return weights_old[1:len_] def pagerank_time(conv_threshold, check_point, len_, \ sparse, semi_sparse, semi_sparse_coeficient, date_coef): """the core function of the PAGERANK_TIME method: pageRank + time decay returns an array with the ranks coresponding to each recid""" weights_old = array((), float32) weights_old = ones((len_), float32) # initial weights weights_new = array((), float32) converged = False nr_of_check_points = 0 difference = len_ while not converged: nr_of_check_points += 1 for step in (range(check_point)): weights_new = zeros((len_), float32) for (i, j) in sparse.keys(): weights_new[i] += sparse[(i, j)]*weights_old[j] semi_total = 0.0 for j in semi_sparse: semi_total += weights_old[j]*date_coef[j] zero_total = 0.0 for i in range(len_): zero_total += weights_old[i]*date_coef[i] #dates = array(date_coef.keys()) #zero_total = dot(weights_old, dates) weights_new = weights_new + semi_sparse_coeficient * semi_total + \ (1.0/len_ - semi_sparse_coeficient) * zero_total if step == check_point - 1: diff = weights_new - weights_old difference = sqrt(dot(diff, diff))/len_ write_message("Finished step: %s, %s " \ % (str(check_point*(nr_of_check_points-1) + step), \ str(difference)), verbose=5) weights_old = weights_new.copy() converged = (difference < conv_threshold) write_message("PageRank calculated for all recids finnished in %s steps.\ The threshold was %s" % (str(nr_of_check_points), \ str(difference)), verbose=2) return weights_old def citation_rank_time(cit, dict_of_ids, date_coef, dates, decimals): """returns a dictionary recid:weight based on the total number of citations as function of time""" dict_of_ranks = {} for key in dict_of_ids: if key in cit: dict_of_ranks[key] = 0 for recid in cit[key]: dict_of_ranks[key] += date_coef[dict_of_ids[recid]] dict_of_ranks[key] = round(dict_of_ranks[key], decimals) \ + dates[dict_of_ids[key]]* pow(10, 0-4-decimals) else: dict_of_ranks[key] = dates[dict_of_ids[key]]* pow(10, 0-4-decimals) write_message("Citation rank calculated", verbose=2) return dict_of_ranks def get_ranks(weights, dict_of_ids, mult, dates, decimals): """returns a dictionary recid:value, where value is the weight of the recid paper; the second order is the reverse time order, from recent to past""" dict_of_ranks 
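# Illustrative sketch of the power iteration performed by pagerank() above: the
# new weight of node i sums the sparse transition entries, every node receives
# the dangling-node mass (d/N per unit) plus the teleport term (1-d)/N, and the
# change is checked every 'check_point' steps via the L2 norm divided by N.
# numpy is assumed, as in the module; the default parameters are illustrative only.
import numpy as np

def pagerank_sketch(sparse, dangling, d, n, conv_threshold=1e-6, check_point=10):
    w_old = np.ones(n, dtype=np.float32)
    dangling_coef = d / n
    converged = False
    while not converged:
        for step in range(check_point):
            w_new = np.zeros(n, dtype=np.float32)
            for (i, j), value in sparse.items():
                w_new[i] += value * w_old[j]
            dangling_mass = sum(w_old[j] for j in dangling)
            w_new += dangling_coef * dangling_mass + (1.0 / n - dangling_coef) * w_old.sum()
            if step == check_point - 1:
                diff = w_new - w_old
                converged = np.sqrt(diff.dot(diff)) / n < conv_threshold
            w_old = w_new.copy()
    return w_old

# e.g. pagerank_sketch({(0, 1): 0.85, (0, 2): 0.425, (1, 2): 0.425}, [0], 0.85, 3)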
= {} for item in dict_of_ids: dict_of_ranks[item] = round(weights[dict_of_ids[item]]* mult, decimals)\ + dates[dict_of_ids[item]]* pow(10, 0-4-decimals) #dict_of_ranks[item] = weights[dict_of_ids[item]] return dict_of_ranks def sort_weights(dict_of_ranks): """sorts the recids based on weights(first order) and on dates(second order)""" ranks_by_citations = sorted(dict_of_ranks.keys(), lambda x, y: \ cmp(dict_of_ranks[y], dict_of_ranks[x])) return ranks_by_citations def normalize_weights(dict_of_ranks): """the weights should be normalized to 100, so they woun't be different from the weights from other ranking methods""" max_weight = 0.0 for recid in dict_of_ranks: weight = dict_of_ranks[recid] if weight > max_weight: max_weight = weight for recid in dict_of_ranks: dict_of_ranks[recid] = round(dict_of_ranks[recid] * 100.0/max_weight, 3) def write_first_ranks_to_file(ranks_by_citations, dict_of_ranks, \ nr_of_ranks, filename): """Writes the first n results of the ranking method into a file""" try: ranks_file = open(filename, "w") except StandardError: write_message("Problems with file: %s" % filename, sys.stderr) raise StandardError for i in range(nr_of_ranks): ranks_file.write(str(i+1) + "\t" + str(ranks_by_citations[i]) + \ "\t" + str(dict_of_ranks[ranks_by_citations[i]]) + "\n") ranks_file.close() write_message("The first %s pairs recid:rank in the ranking order \ are written into this file: %s" % (nr_of_ranks, filename), verbose=2) def del_rank_method_data(rank_method_code): """Delete the data for a rank method from rnkMETHODDATA table""" id_ = run_sql("SELECT id from rnkMETHOD where name=%s", (rank_method_code, )) run_sql("DELETE FROM rnkMETHODDATA WHERE id_rnkMETHOD=%s", (id_[0][0], )) def into_db(dict_of_ranks, rank_method_code): """Writes into the rnkMETHODDATA table the ranking results""" method_id = run_sql("SELECT id from rnkMETHOD where name=%s", \ (rank_method_code, )) del_rank_method_data(rank_method_code) serialized_data = serialize_via_marshal(dict_of_ranks) method_id_str = str(method_id[0][0]) run_sql("INSERT INTO rnkMETHODDATA(id_rnkMETHOD, relevance_data) \ - VALUES(%s, %s) ", (method_id_str, serialized_data, )) + VALUES(%s, _binary %s) ", (method_id_str, serialized_data, )) date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) run_sql("UPDATE rnkMETHOD SET last_updated=%s WHERE name=%s", \ (date, rank_method_code)) write_message("Finished writing the ranks into rnkMETHOD table", verbose=5) def run_pagerank(cit, dict_of_ids, len_, ref, damping_factor, \ conv_threshold, check_point, dates): """returns the final form of the ranks when using pagerank method""" write_message("Running the PageRank method", verbose=5) sparse, semi_sparse, semi_sparse_coeficient = \ construct_sparse_matrix(cit, ref, dict_of_ids, len_, damping_factor) weights = pagerank(conv_threshold, check_point, len_, \ sparse, semi_sparse, semi_sparse_coeficient) dict_of_ranks = get_ranks(weights, dict_of_ids, 1, dates, 2) return dict_of_ranks def run_pagerank_ext(cit, dict_of_ids, ref, ext_links, \ conv_threshold, check_point, alpha, beta, dates): """returns the final form of the ranks when using pagerank_ext method""" write_message("Running the PageRank with external links method", verbose=5) len_ = len(dict_of_ids) sparse, semi_sparse = construct_sparse_matrix_ext(cit, ref, \ ext_links, dict_of_ids, alpha, beta) weights = pagerank_ext(conv_threshold, check_point, \ len_ + 1, sparse, semi_sparse) dict_of_ranks = get_ranks(weights, dict_of_ids, 1, dates, 2) return dict_of_ranks def run_pagerank_time(cit, 
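# Illustrative sketch of the rank post-processing above: the final value is the
# rounded weight plus a tiny year-based term that only breaks ties in favour of
# more recent papers; ranks are then ordered descending and scaled so the best
# paper gets 100 (normalize_weights).  Weights and years are hypothetical.
weights = {10: 0.52, 11: 0.52, 12: 0.13}   # recid -> raw weight
dates = {10: 1999, 11: 2015, 12: 2010}     # recid -> publication year
decimals, mult = 2, 1

ranks = dict((recid, round(w * mult, decimals) + dates[recid] * 10 ** (-4 - decimals))
             for recid, w in weights.items())

ordered = sorted(ranks, key=ranks.get, reverse=True)   # [11, 10, 12]: the year breaks the tie
best = max(ranks.values())
normalized = dict((recid, round(value * 100.0 / best, 3)) for recid, value in ranks.items())
print(ordered, normalized)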
dict_of_ids, len_, ref, damping_factor, \ conv_threshold, check_point, date_coef, dates): """returns the final form of the ranks when using pagerank + time decay method""" write_message("Running the PageRank_time method", verbose=5) sparse, semi_sparse, semi_sparse_coeficient = \ construct_sparse_matrix_time(cit, ref, dict_of_ids, \ damping_factor, date_coef) weights = pagerank_time(conv_threshold, check_point, len_, \ sparse, semi_sparse, semi_sparse_coeficient, date_coef) dict_of_ranks = get_ranks(weights, dict_of_ids, 100000, dates, 2) return dict_of_ranks def run_citation_rank_time(cit, dict_of_ids, date_coef, dates): """returns the final form of the ranks when using citation count as function of time method""" write_message("Running the citation rank with time decay method", verbose=5) dict_of_ranks = citation_rank_time(cit, dict_of_ids, date_coef, dates, 2) return dict_of_ranks def spearman_rank_correlation_coef(rank1, rank2, len_): """rank1 and rank2 are arrays containing the recids in the ranking order returns the corelation coeficient (-1 <= c <= 1) between 2 rankings the closec c is to 1, the more correlated are the two ranking methods""" total = 0 for i in range(len_): rank_value = rank2.index(rank1[i]) total += (i - rank_value)*(i - rank_value) return 1 - (6.0 * total) / (len_*(len_*len_ - 1)) def remove_loops(cit, dates, dict_of_ids): """when using time decay, new papers that are part of a loop are accumulating a lot of fake weight""" new_cit = {} for recid in cit: new_cit[recid] = [] for cited_by in cit[recid]: if dates[dict_of_ids[cited_by]] >= dates[dict_of_ids[recid]]: if cited_by in cit: if recid not in cit[cited_by]: new_cit[recid].append(cited_by) else: write_message("Loop removed: %s <-> %s" \ %(cited_by, recid), verbose=9) else: new_cit[recid].append(cited_by) else: write_message("Loop removed: %s <-> %s" \ %(cited_by, recid), verbose=9) write_message("Simple loops removed", verbose=5) return new_cit def calculate_time_weights(len_, time_decay, dates): """calculates the time coeficients for each paper""" current_year = int(datetime.datetime.now().strftime("%Y")) date_coef = {} for j in range(len_): date_coef[j] = exp(time_decay*(dates[j] - current_year)) write_message("Time weights calculated", verbose=5) write_message("Time weights: %s" % str(date_coef), verbose=9) return date_coef def get_dates(function, config, dict_of_ids): """returns a dictionary containing the year of publishing for each paper""" try: file_for_dates = config.get(function, "file_with_dates") dates = get_dates_from_file(file_for_dates, dict_of_ids) except (ConfigParser.NoOptionError, StandardError), err: write_message("If you want to read the dates from file set up the \ 'file_for_dates' variable in the config file [%s]" %err, verbose=3) try: publication_year_tag = config.get(function, "publication_year_tag") dummy = int(publication_year_tag[0:3]) except (ConfigParser.NoOptionError, StandardError): write_message("You need to set up correctly the publication_year_tag \ in the cfg file", sys.stderr) raise Exception try: creation_date_tag = config.get(function, "creation_date_tag") dummy = int(creation_date_tag[0:3]) except (ConfigParser.NoOptionError, StandardError): write_message("You need to set up correctly the creation_date_tag \ in the cfg file", sys.stderr) raise Exception dates = get_dates_from_db(dict_of_ids, publication_year_tag, \ creation_date_tag) return dates def citerank(rank_method_code): """new ranking method based on the citation graph""" write_message("Running rank method: %s" % 
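# Illustrative sketch of two helpers above.  spearman_rank_correlation_coef()
# computes rho = 1 - 6*sum(d_i^2) / (n*(n^2 - 1)) for two orderings of the same
# recids, and calculate_time_weights() assigns each paper exp(decay*(year - now))
# so older papers are damped exponentially.  The sample data is hypothetical.
import math

def spearman(rank1, rank2):
    n = len(rank1)
    total = sum((i - rank2.index(recid)) ** 2 for i, recid in enumerate(rank1))
    return 1 - (6.0 * total) / (n * (n * n - 1))

print(spearman([1, 2, 3, 4], [1, 2, 3, 4]))   #  1.0, identical rankings
print(spearman([1, 2, 3, 4], [4, 3, 2, 1]))   # -1.0, reversed rankings

current_year = 2016
time_decay = 0.5
dates = [2016, 2010, 2000]                    # publication year per dense index
date_coef = [math.exp(time_decay * (year - current_year)) for year in dates]
print(date_coef)                              # [1.0, ~0.0498, ~0.000335]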
rank_method_code, verbose=0) if not import_numpy: write_message('The numpy package could not be imported. \ This package is compulsory for running the citerank methods.') return try: file_ = CFG_ETCDIR + "/bibrank/" + rank_method_code + ".cfg" config = ConfigParser.ConfigParser() config.readfp(open(file_)) except StandardError: write_message("Cannot find configuration file: %s" % file_, sys.stderr) raise StandardError # the file for citations needs to have the following format: #each line needs to be x[tab]y, where x cites y; x,y are recids function = config.get("rank_method", "function") try: file_for_citations = config.get(function, "file_with_citations") cit, dict_of_ids = get_citations_from_file(file_for_citations) except (ConfigParser.NoOptionError, StandardError), err: write_message("If you want to read the citation data from file set up \ the file_for_citations parameter in the config file [%s]" %err, verbose=2) cit, dict_of_ids = get_citations_from_db() len_ = len(dict_of_ids.keys()) write_message("Number of nodes(papers) to rank : %s" % str(len_), verbose=3) if len_ == 0: write_message("No citation data found, nothing to be done.") return try: method = config.get(function, "citerank_method") except ConfigParser.NoOptionError, err: write_message("Exception: %s " %err, sys.stderr) raise Exception write_message("Running %s method." % method, verbose=2) dates = get_dates(function, config, dict_of_ids) if method == "citation_time": try: time_decay = float(config.get(function, "time_decay")) except (ConfigParser.NoOptionError, ValueError), err: write_message("Exception: %s" % err, sys.stderr) raise Exception date_coef = calculate_time_weights(len_, time_decay, dates) #cit = remove_loops(cit, dates, dict_of_ids) dict_of_ranks = \ run_citation_rank_time(cit, dict_of_ids, date_coef, dates) else: try: conv_threshold = float(config.get(function, "conv_threshold")) check_point = int(config.get(function, "check_point")) damping_factor = float(config.get(function, "damping_factor")) write_message("Parameters: d = %s, conv_threshold = %s, \ check_point = %s" %(str(damping_factor), \ str(conv_threshold), str(check_point)), verbose=5) except (ConfigParser.NoOptionError, StandardError), err: write_message("Exception: %s" % err, sys.stderr) raise Exception if method == "pagerank_classic": ref = construct_ref_array(cit, dict_of_ids, len_) use_ext_cit = "" try: use_ext_cit = config.get(function, "use_external_citations") write_message("Pagerank will use external citations: %s" \ %str(use_ext_cit), verbose=5) except (ConfigParser.NoOptionError, StandardError), err: write_message("%s" % err, verbose=2) if use_ext_cit == "yes": try: ext_citation_file = config.get(function, "ext_citation_file") ext_links = get_external_links_from_file(ext_citation_file, ref, dict_of_ids) except (ConfigParser.NoOptionError, StandardError): write_message("If you want to read the external citation \ data from file set up the ext_citation_file parameter in the config. file", \ verbose=3) try: reference_tag = config.get(function, "ext_reference_tag") dummy = int(reference_tag[0:3]) except (ConfigParser.NoOptionError, StandardError): write_message("You need to set up correctly the \ reference_tag in the cfg file", sys.stderr) raise Exception ext_links = get_external_links_from_db(ref, \ dict_of_ids, reference_tag) avg = avg_ext_links_with_0(ext_links) if avg < 1: write_message("This method can't be ran. There is not \ enough information about the external citation. 
Hint: check the reference tag", \ sys.stderr) raise Exception avg_ext_links_without_0(ext_links) try: alpha = float(config.get(function, "ext_alpha")) beta = float(config.get(function, "ext_beta")) except (ConfigParser.NoOptionError, StandardError), err: write_message("Exception: %s" % err, sys.stderr) raise Exception dict_of_ranks = run_pagerank_ext(cit, dict_of_ids, ref, \ ext_links, conv_threshold, check_point, alpha, beta, dates) else: dict_of_ranks = run_pagerank(cit, dict_of_ids, len_, ref, \ damping_factor, conv_threshold, check_point, dates) elif method == "pagerank_time": try: time_decay = float(config.get(function, "time_decay")) write_message("Parameter: time_decay = %s" \ %str(time_decay), verbose=5) except (ConfigParser.NoOptionError, StandardError), err: write_message("Exception: %s" % err, sys.stderr) raise Exception date_coef = calculate_time_weights(len_, time_decay, dates) cit = remove_loops(cit, dates, dict_of_ids) ref = construct_ref_array(cit, dict_of_ids, len_) dict_of_ranks = run_pagerank_time(cit, dict_of_ids, len_, ref, \ damping_factor, conv_threshold, check_point, date_coef, dates) else: write_message("Error: Unknown ranking method. \ Please check the ranking_method parameter in the config. file.", sys.stderr) raise Exception try: filename_ranks = config.get(function, "output_ranks_to_filename") max_ranks = config.get(function, "output_rank_limit") if not max_ranks.isdigit(): max_ranks = len_ else: max_ranks = int(max_ranks) if max_ranks > len_: max_ranks = len_ ranks = sort_weights(dict_of_ranks) write_message("Ranks: %s" % str(ranks), verbose=9) write_first_ranks_to_file(ranks, dict_of_ranks, \ max_ranks, filename_ranks) except (ConfigParser.NoOptionError, StandardError): write_message("If you want the ranks to be printed in a file you have \ to set output_ranks_to_filename and output_rank_limit \ parameters in the configuration file", verbose=3) normalize_weights(dict_of_ranks) into_db(dict_of_ranks, rank_method_code) diff --git a/modules/bibrank/lib/bibrank_tag_based_indexer.py b/modules/bibrank/lib/bibrank_tag_based_indexer.py index 239795a45..1ef052bbe 100644 --- a/modules/bibrank/lib/bibrank_tag_based_indexer.py +++ b/modules/bibrank/lib/bibrank_tag_based_indexer.py @@ -1,504 +1,504 @@ # -*- coding: utf-8 -*- # Ranking of records using different parameters and methods. # This file is part of Invenio. -# Copyright (C) 2004, 2005, 2006, 2007, 2008, 2010, 2011, 2012 CERN. +# Copyright (C) 2004, 2005, 2006, 2007, 2008, 2010, 2011, 2012, 2016 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
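# Illustrative recap of the citerank() driver shown above: each rank method has
# a .cfg whose [rank_method] section names the function section, and that
# section's options select the algorithm (citation_time, pagerank_classic or
# pagerank_time) and its parameters.  The file content and section name below
# are hypothetical; the module itself reads the same option names with the
# Python 2 ConfigParser.
import configparser

CFG = """
[rank_method]
function = citerank_pagerank

[citerank_pagerank]
citerank_method = pagerank_classic
damping_factor = 0.85
conv_threshold = 0.0001
check_point = 10
"""

config = configparser.ConfigParser()
config.read_string(CFG)
function = config.get("rank_method", "function")
method = config.get(function, "citerank_method")
if method == "citation_time":
    pass   # citation counts weighted by exp(time_decay * (year - now))
else:      # pagerank_classic / pagerank_time
    damping_factor = float(config.get(function, "damping_factor"))
    conv_threshold = float(config.get(function, "conv_threshold"))
    check_point = int(config.get(function, "check_point"))
    print(method, damping_factor, conv_threshold, check_point)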
import sys import time import traceback import ConfigParser from invenio.config import \ CFG_SITE_LANG, \ CFG_ETCDIR from invenio.search_engine import perform_request_search from invenio.bibrank_citation_indexer import get_citation_weight, print_missing from invenio.bibrank_downloads_indexer import * from invenio.dbquery import run_sql, serialize_via_marshal, deserialize_via_marshal, \ wash_table_column_name, get_table_update_time from invenio.bibtask import task_get_option, write_message, task_sleep_now_if_required from invenio.bibindex_engine import create_range_list from invenio.intbitset import intbitset options = {} def download_weight_filtering_user_repair_exec (): """Repair download weight filtering user ranking method""" write_message("Repairing for this ranking method is not defined. Skipping.") return def download_weight_total_repair_exec(): """Repair download weight total ranking method""" write_message("Repairing for this ranking method is not defined. Skipping.") return def file_similarity_by_times_downloaded_repair_exec(): """Repair file similarity by times downloaded ranking method""" write_message("Repairing for this ranking method is not defined. Skipping.") return def single_tag_rank_method_repair_exec(): """Repair single tag ranking method""" write_message("Repairing for this ranking method is not defined. Skipping.") return def citation_exec(rank_method_code, name, config): """Rank method for citation analysis""" #first check if this is a specific task if task_get_option("cmd") == "print-missing": num = task_get_option("num") print_missing(num) else: dic, index_update_time = get_citation_weight(rank_method_code, config) if dic: if task_get_option("id") or task_get_option("collection") or \ task_get_option("modified"): # user have asked to citation-index specific records # only, so we should not update citation indexer's # last run time stamp information index_update_time = None intoDB(dic, index_update_time, rank_method_code) else: write_message("No need to update the indexes for citations.") def download_weight_filtering_user(run): return bibrank_engine(run) def download_weight_total(run): return bibrank_engine(run) def file_similarity_by_times_downloaded(run): return bibrank_engine(run) def download_weight_filtering_user_exec (rank_method_code, name, config): """Ranking by number of downloads per User. 
Only one full Text Download is taken in account for one specific userIP address""" begin_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time1 = time.time() dic = fromDB(rank_method_code) last_updated = get_lastupdated(rank_method_code) keys = new_downloads_to_index(last_updated) filter_downloads_per_hour(keys, last_updated) dic = get_download_weight_filtering_user(dic, keys) intoDB(dic, begin_date, rank_method_code) time2 = time.time() return {"time":time2-time1} def download_weight_total_exec(rank_method_code, name, config): """rankink by total number of downloads without check the user ip if users downloads 3 time the same full text document it has to be count as 3 downloads""" begin_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time1 = time.time() dic = fromDB(rank_method_code) last_updated = get_lastupdated(rank_method_code) keys = new_downloads_to_index(last_updated) filter_downloads_per_hour(keys, last_updated) dic = get_download_weight_total(dic, keys) intoDB(dic, begin_date, rank_method_code) time2 = time.time() return {"time":time2-time1} def file_similarity_by_times_downloaded_exec(rank_method_code, name, config): """update dictionnary {recid:[(recid, nb page similarity), ()..]}""" begin_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time1 = time.time() dic = fromDB(rank_method_code) last_updated = get_lastupdated(rank_method_code) keys = new_downloads_to_index(last_updated) filter_downloads_per_hour(keys, last_updated) dic = get_file_similarity_by_times_downloaded(dic, keys) intoDB(dic, begin_date, rank_method_code) time2 = time.time() return {"time":time2-time1} def single_tag_rank_method_exec(rank_method_code, name, config): """Creating the rank method data""" begin_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) rnkset = {} rnkset_old = fromDB(rank_method_code) rnkset_new = single_tag_rank(config) rnkset = union_dicts(rnkset_old, rnkset_new) intoDB(rnkset, begin_date, rank_method_code) def single_tag_rank(config): """Connect the given tag with the data from the kb file given""" write_message("Loading knowledgebase file", verbose=9) kb_data = {} records = [] write_message("Reading knowledgebase file: %s" % \ config.get(config.get("rank_method", "function"), "kb_src")) input = open(config.get(config.get("rank_method", "function"), "kb_src"), 'r') data = input.readlines() for line in data: if not line[0:1] == "#": kb_data[string.strip((string.split(string.strip(line), "---"))[0])] = (string.split(string.strip(line), "---"))[1] write_message("Number of lines read from knowledgebase file: %s" % len(kb_data)) tag = config.get(config.get("rank_method", "function"), "tag") tags = config.get(config.get("rank_method", "function"), "check_mandatory_tags").split(", ") if tags == ['']: tags = "" records = [] for (recids, recide) in options["recid_range"]: task_sleep_now_if_required(can_stop_too=True) write_message("......Processing records #%s-%s" % (recids, recide)) recs = run_sql("SELECT id_bibrec, value FROM bib%sx, bibrec_bib%sx WHERE tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s" % (tag[0:2], tag[0:2]), (tag, recids, recide)) valid = intbitset(trailing_bits=1) valid.discard(0) for key in tags: newset = intbitset() newset += [recid[0] for recid in (run_sql("SELECT id_bibrec FROM bib%sx, bibrec_bib%sx WHERE id_bibxxx=id AND tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s" % (tag[0:2], tag[0:2]), (key, recids, recide)))] valid.intersection_update(newset) if tags: recs = filter(lambda x: x[0] in valid, 
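# Illustrative sketch of single_tag_rank() above: a knowledge-base file maps
# tag values to scores, one "value---score" entry per line with '#' comments,
# and each record receives the score of its tag value (0 if the value is not in
# the knowledge base).  The file content and records are hypothetical.
KB = """\
# journal quality knowledge base
Phys. Rev. D---0.9
Nucl. Phys. B---0.8
"""

kb_data = {}
for line in KB.splitlines():
    if line and not line.startswith("#"):
        value, score = line.split("---", 1)
        kb_data[value.strip()] = score.strip()

records = [(77, "Phys. Rev. D"), (78, "Unknown Journal")]   # (recid, tag value)
rnkset = {}
for recid, value in records:
    rnkset[recid] = float(kb_data[value]) if value in kb_data else 0
print(rnkset)   # {77: 0.9, 78: 0}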
recs) records = records + list(recs) write_message("Number of records found with the necessary tags: %s" % len(records)) records = filter(lambda x: x[0] in options["validset"], records) rnkset = {} for key, value in records: if kb_data.has_key(value): if not rnkset.has_key(key): rnkset[key] = float(kb_data[value]) else: if kb_data.has_key(rnkset[key]) and float(kb_data[value]) > float((rnkset[key])[1]): rnkset[key] = float(kb_data[value]) else: rnkset[key] = 0 write_message("Number of records available in rank method: %s" % len(rnkset)) return rnkset def get_lastupdated(rank_method_code): """Get the last time the rank method was updated""" res = run_sql("SELECT rnkMETHOD.last_updated FROM rnkMETHOD WHERE name=%s", (rank_method_code, )) if res: return res[0][0] else: # raise Exception("Is this the first run? Please do a complete update.") return "1970-01-01 00:00:00" def intoDB(dic, date, rank_method_code): """Insert the rank method data into the database""" mid = run_sql("SELECT id from rnkMETHOD where name=%s", (rank_method_code, )) del_rank_method_codeDATA(rank_method_code) serdata = serialize_via_marshal(dic) midstr = str(mid[0][0]) - run_sql("INSERT INTO rnkMETHODDATA(id_rnkMETHOD, relevance_data) VALUES (%s,%s)", (midstr, serdata,)) + run_sql("INSERT INTO rnkMETHODDATA(id_rnkMETHOD, relevance_data) VALUES (%s,_binary %s)", (midstr, serdata,)) if date: run_sql("UPDATE rnkMETHOD SET last_updated=%s WHERE name=%s", (date, rank_method_code)) def fromDB(rank_method_code): """Get the data for a rank method""" id = run_sql("SELECT id from rnkMETHOD where name=%s", (rank_method_code, )) if not id: return {} res = run_sql("SELECT relevance_data FROM rnkMETHODDATA WHERE id_rnkMETHOD=%s", (id[0][0], )) if res: return deserialize_via_marshal(res[0][0]) else: return {} def del_rank_method_codeDATA(rank_method_code): """Delete the data for a rank method""" id = run_sql("SELECT id from rnkMETHOD where name=%s", (rank_method_code, )) run_sql("DELETE FROM rnkMETHODDATA WHERE id_rnkMETHOD=%s", (id[0][0], )) def del_recids(rank_method_code, range_rec): """Delete some records from the rank method""" id = run_sql("SELECT id from rnkMETHOD where name=%s", (rank_method_code, )) res = run_sql("SELECT relevance_data FROM rnkMETHODDATA WHERE id_rnkMETHOD=%s", (id[0][0], )) if res: rec_dict = deserialize_via_marshal(res[0][0]) write_message("Old size: %s" % len(rec_dict)) for (recids, recide) in range_rec: for i in range(int(recids), int(recide)): if rec_dict.has_key(i): del rec_dict[i] write_message("New size: %s" % len(rec_dict)) begin_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) intoDB(rec_dict, begin_date, rank_method_code) else: write_message("Create before deleting!") def union_dicts(dict1, dict2): "Returns union of the two dicts." 
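# Illustrative sketch of the substantive change in this hunk: the ranking data
# is a marshal-serialized Python dict, i.e. a binary blob, and the _binary
# character-set introducer in front of the placeholder keeps MySQL from trying
# to interpret the blob as text.  The table and columns are those used above;
# the connection handling is left out (the real code goes through run_sql()
# and serialize_via_marshal()/deserialize_via_marshal()).
import marshal

dic = {77: 0.9, 78: 0.2}
serdata = marshal.dumps(dic)            # binary blob to store

query = ("INSERT INTO rnkMETHODDATA(id_rnkMETHOD, relevance_data) "
         "VALUES (%s, _binary %s)")     # _binary marks the second literal as binary
params = ("42", serdata)
# cursor.execute(query, params)         # with a MySQLdb-style cursor

print(marshal.loads(serdata) == dic)    # True: the blob round-trips byte-exactly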
union_dict = {} for (key, value) in dict1.iteritems(): union_dict[key] = value for (key, value) in dict2.iteritems(): union_dict[key] = value return union_dict def rank_method_code_statistics(rank_method_code): """Print statistics""" method = fromDB(rank_method_code) max = ('', -999999) maxcount = 0 min = ('', 999999) mincount = 0 for (recID, value) in method.iteritems(): if value < min and value > 0: min = value if value > max: max = value for (recID, value) in method.iteritems(): if value == min: mincount += 1 if value == max: maxcount += 1 write_message("Showing statistic for selected method") write_message("Method name: %s" % getName(rank_method_code)) write_message("Short name: %s" % rank_method_code) write_message("Last run: %s" % get_lastupdated(rank_method_code)) write_message("Number of records: %s" % len(method)) write_message("Lowest value: %s - Number of records: %s" % (min, mincount)) write_message("Highest value: %s - Number of records: %s" % (max, maxcount)) write_message("Divided into 10 sets:") for i in range(1, 11): setcount = 0 distinct_values = {} lower = -1.0 + ((float(max + 1) / 10)) * (i - 1) upper = -1.0 + ((float(max + 1) / 10)) * i for (recID, value) in method.iteritems(): if value >= lower and value <= upper: setcount += 1 distinct_values[value] = 1 write_message("Set %s (%s-%s) %s Distinct values: %s" % (i, lower, upper, len(distinct_values), setcount)) def check_method(rank_method_code): write_message("Checking rank method...") if len(fromDB(rank_method_code)) == 0: write_message("Rank method not yet executed, please run it to create the necessary data.") else: if len(add_recIDs_by_date(rank_method_code)) > 0: write_message("Records modified, update recommended") else: write_message("No records modified, update not necessary") def load_config(method): filename = CFG_ETCDIR + "/bibrank/" + method + ".cfg" config = ConfigParser.ConfigParser() try: config.readfp(open(filename)) except StandardError: write_message("Cannot find configuration file: %s" % filename, sys.stderr) raise return config def bibrank_engine(run): """Run the indexing task. Return 1 in case of success and 0 in case of failure. """ startCreate = time.time() options["run"] = [] options["run"].append(run) for rank_method_code in options["run"]: task_sleep_now_if_required(can_stop_too=True) cfg_name = getName(rank_method_code) write_message("Running rank method: %s." 
% cfg_name) config = load_config(rank_method_code) cfg_short = rank_method_code cfg_function = config.get("rank_method", "function") + "_exec" cfg_repair_function = config.get("rank_method", "function") + "_repair_exec" cfg_name = getName(cfg_short) options["validset"] = get_valid_range(rank_method_code) if task_get_option("collection"): l_of_colls = string.split(task_get_option("collection"), ", ") recIDs = perform_request_search(c=l_of_colls) recIDs_range = [] for recID in recIDs: recIDs_range.append([recID, recID]) options["recid_range"] = recIDs_range elif task_get_option("id"): options["recid_range"] = task_get_option("id") elif task_get_option("modified"): options["recid_range"] = add_recIDs_by_date(rank_method_code, task_get_option("modified")) elif task_get_option("last_updated"): options["recid_range"] = add_recIDs_by_date(rank_method_code) else: write_message("No records specified, updating all", verbose=2) min_id = run_sql("SELECT min(id) from bibrec")[0][0] max_id = run_sql("SELECT max(id) from bibrec")[0][0] options["recid_range"] = [[min_id, max_id]] if task_get_option("quick") == "no": write_message("Recalculate parameter not used, parameter ignored.", verbose=9) if task_get_option("cmd") == "del": del_recids(cfg_short, options["recid_range"]) elif task_get_option("cmd") == "add": func_object = globals().get(cfg_function) func_object(rank_method_code, cfg_name, config) elif task_get_option("cmd") == "stat": rank_method_code_statistics(rank_method_code) elif task_get_option("cmd") == "check": check_method(rank_method_code) elif task_get_option("cmd") == "print-missing": func_object = globals().get(cfg_function) func_object(rank_method_code, cfg_name, config) elif task_get_option("cmd") == "repair": func_object = globals().get(cfg_repair_function) func_object() else: write_message("Invalid command found processing %s" % rank_method_code, sys.stderr) raise StandardError if task_get_option("verbose"): showtime((time.time() - startCreate)) return 1 def get_valid_range(rank_method_code): """Return a range of records""" write_message("Getting records from collections enabled for rank method.", verbose=9) res = run_sql("SELECT collection.name FROM collection, collection_rnkMETHOD, rnkMETHOD WHERE collection.id=id_collection and id_rnkMETHOD=rnkMETHOD.id and rnkMETHOD.name=%s", (rank_method_code, )) l_of_colls = [] for coll in res: l_of_colls.append(coll[0]) if len(l_of_colls) > 0: recIDs = perform_request_search(c=l_of_colls) else: recIDs = [] valid = intbitset() valid += recIDs return valid def add_recIDs_by_date(rank_method_code, dates=""): """Return recID range from records modified between DATES[0] and DATES[1]. If DATES is not set, then add records modified since the last run of the ranking method RANK_METHOD_CODE. 
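# Illustrative sketch of the dispatch convention used by bibrank_engine()
# above: the .cfg names a function, and the engine looks up "<function>_exec"
# (or "<function>_repair_exec" for --repair) among the module globals and calls
# it.  The stub below is a hypothetical stand-in for the real *_exec functions.
def citation_exec(rank_method_code, name, config):
    print("ranking %s" % rank_method_code)

def dispatch(cfg_function, rank_method_code, cfg_name, config):
    func_object = globals().get(cfg_function + "_exec")
    if func_object is None:
        raise Exception("unknown rank method function: %s" % cfg_function)
    return func_object(rank_method_code, cfg_name, config)

dispatch("citation", "cit", "citation", config=None)   # -> "ranking cit"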
""" if not dates: dates = (get_lastupdated(rank_method_code), '') if dates[0] is None: dates = ("0000-00-00 00:00:00", '') query = """SELECT b.id FROM bibrec AS b WHERE b.modification_date >= %s""" if dates[1]: query += " and b.modification_date <= %s" query += " ORDER BY b.id ASC""" if dates[1]: res = run_sql(query, (dates[0], dates[1])) else: res = run_sql(query, (dates[0], )) alist = create_range_list([row[0] for row in res]) if not alist: write_message("No new records added since last time method was run") return alist def getName(rank_method_code, ln=CFG_SITE_LANG, type='ln'): """Returns the name of the method if it exists""" try: rnkid = run_sql("SELECT id FROM rnkMETHOD where name=%s", (rank_method_code, )) if rnkid: rnkid = str(rnkid[0][0]) res = run_sql("SELECT value FROM rnkMETHODNAME where type=%s and ln=%s and id_rnkMETHOD=%s", (type, ln, rnkid)) if not res: res = run_sql("SELECT value FROM rnkMETHODNAME WHERE ln=%s and id_rnkMETHOD=%s and type=%s", (CFG_SITE_LANG, rnkid, type)) if not res: return rank_method_code return res[0][0] else: raise Exception except Exception: write_message("Cannot run rank method, either given code for method is wrong, or it has not been added using the webinterface.") raise Exception def single_tag_rank_method(run): return bibrank_engine(run) def showtime(timeused): """Show time used for method""" write_message("Time used: %d second(s)." % timeused, verbose=9) def citation(run): return bibrank_engine(run) # Hack to put index based sorting here, but this is very similar to tag #based method and should re-use a lot of this code, so better to have here #than separate # def index_term_count_exec(rank_method_code, name, config): """Creating the rank method data""" write_message("Recreating index weighting data") begin_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) # we must recalculate these every time for all records, since the # weighting of a record is determined by the index entries of _other_ # records rnkset = calculate_index_term_count(config) intoDB(rnkset, begin_date, rank_method_code) def calculate_index_term_count(config): """Calculate the weight of a record set based on number of enries of a tag from the record in another index...useful for authority files""" records = [] if config.has_section("index_term_count"): index = config.get("index_term_count","index_table_name") tag = config.get("index_term_count","index_term_value_from_tag") # check against possible SQL injection: dummy = get_table_update_time(index) tag = wash_table_column_name(tag) else: raise Exception("Config file " + config + " does not have index_term_count section") return() task_sleep_now_if_required(can_stop_too=True) write_message("......Processing all records") query = "SELECT id_bibrec, value FROM bib%sx, bibrec_bib%sx WHERE tag=%%s AND id_bibxxx=id" % \ (tag[0:2], tag[0:2]) # we checked that tag is safe records = list(run_sql(query, (tag,))) write_message("Number of records found with the necessary tags: %s" % len(records)) rnkset = {} for key, value in records: hits = 0 if len(value): query = "SELECT hitlist from %s where term = %%s" % index # we checked that index is a table row = run_sql(query, (value,)) if row and row[0] and row[0][0]: #has to be prepared for corrupted data! 
try: hits = len(intbitset(row[0][0])) except: hits = 0 rnkset[key] = hits write_message("Number of records available in rank method: %s" % len(rnkset)) return rnkset def index_term_count(run): return bibrank_engine(run) diff --git a/modules/bibrank/lib/bibrank_word_indexer.py b/modules/bibrank/lib/bibrank_word_indexer.py index df252476f..5ee0cfe5b 100644 --- a/modules/bibrank/lib/bibrank_word_indexer.py +++ b/modules/bibrank/lib/bibrank_word_indexer.py @@ -1,1184 +1,1184 @@ # This file is part of Invenio. -# Copyright (C) 2004, 2005, 2006, 2007, 2008, 2010, 2011, 2014 CERN. +# Copyright (C) 2004, 2005, 2006, 2007, 2008, 2010, 2011, 2014, 2016 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. __revision__ = "$Id$" import sys import time import urllib import math import re import ConfigParser from invenio.config import \ CFG_SITE_LANG, \ CFG_ETCDIR, \ CFG_SITE_URL from invenio.search_engine import perform_request_search, wash_index_term from invenio.dbquery import run_sql, DatabaseError, serialize_via_marshal, deserialize_via_marshal from invenio.bibindex_engine_stemmer import is_stemmer_available_for_language, stem from invenio.bibindex_engine_stopwords import is_stopword from invenio.bibindex_engine import beautify_range_list, \ kill_sleepy_mysql_threads, create_range_list from invenio.bibtask import write_message, task_get_option, task_update_progress, \ task_update_status, task_sleep_now_if_required from invenio.intbitset import intbitset from invenio.errorlib import register_exception from invenio.textutils import strip_accents options = {} # global variable to hold task options # safety parameters concerning DB thread-multiplication problem: CFG_CHECK_MYSQL_THREADS = 0 # to check or not to check the problem? CFG_MAX_MYSQL_THREADS = 50 # how many threads (connections) we consider as still safe CFG_MYSQL_THREAD_TIMEOUT = 20 # we'll kill threads that were sleeping for more than X seconds # override urllib's default password-asking behaviour: class MyFancyURLopener(urllib.FancyURLopener): def prompt_user_passwd(self, host, realm): # supply some dummy credentials by default return ("mysuperuser", "mysuperpass") def http_error_401(self, url, fp, errcode, errmsg, headers): # do not bother with protected pages raise IOError, (999, 'unauthorized access') return None #urllib._urlopener = MyFancyURLopener() nb_char_in_line = 50 # for verbose pretty printing chunksize = 1000 # default size of chunks that the records will be treated by base_process_size = 4500 # process base size # Dictionary merging functions def dict_union(list1, list2): "Returns union of the two dictionaries." 
union_dict = {} for (e, count) in list1.iteritems(): union_dict[e] = count for (e, count) in list2.iteritems(): if not union_dict.has_key(e): union_dict[e] = count else: union_dict[e] = (union_dict[e][0] + count[0], count[1]) #for (e, count) in list2.iteritems(): # list1[e] = (list1.get(e, (0, 0))[0] + count[0], count[1]) #return list1 return union_dict # tagToFunctions mapping. It offers an indirection level necesary for # indexing fulltext. The default is get_words_from_phrase tagToWordsFunctions = {} def get_words_from_phrase(phrase, weight, lang="", chars_punctuation=r"[\.\,\:\;\?\!\"]", chars_alphanumericseparators=r"[1234567890\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~]", split=str.split): "Returns list of words from phrase 'phrase'." words = {} phrase = strip_accents(phrase) phrase = phrase.lower() #Getting rid of strange characters phrase = re.sub("é", 'e', phrase) phrase = re.sub("è", 'e', phrase) phrase = re.sub("à", 'a', phrase) phrase = re.sub(" ", ' ', phrase) phrase = re.sub("«", ' ', phrase) phrase = re.sub("»", ' ', phrase) phrase = re.sub("ê", ' ', phrase) phrase = re.sub("&", ' ', phrase) if phrase.find(" -1: #Most likely html, remove html code phrase = re.sub("(?s)<[^>]*>|&#?\w+;", ' ', phrase) #removes http links phrase = re.sub("(?s)http://[^( )]*", '', phrase) phrase = re.sub(chars_punctuation, ' ', phrase) #By doing this like below, characters standing alone, like c a b is not added to the inedx, but when they are together with characters like c++ or c$ they are added. for word in split(phrase): if options["remove_stopword"] == "True" and not is_stopword(word) and check_term(word, 0): if lang and lang !="none" and options["use_stemming"]: word = stem(word, lang) if not words.has_key(word): words[word] = (0, 0) else: if not words.has_key(word): words[word] = (0, 0) words[word] = (words[word][0] + weight, 0) elif options["remove_stopword"] == "True" and not is_stopword(word): phrase = re.sub(chars_alphanumericseparators, ' ', word) for word_ in split(phrase): if lang and lang !="none" and options["use_stemming"]: word_ = stem(word_, lang) if word_: if not words.has_key(word_): words[word_] = (0,0) words[word_] = (words[word_][0] + weight, 0) return words class WordTable: "A class to hold the words table." def __init__(self, tablename, fields_to_index, separators="[^\s]"): "Creates words table instance." self.tablename = tablename self.recIDs_in_mem = [] self.fields_to_index = fields_to_index self.separators = separators self.value = {} def get_field(self, recID, tag): """Returns list of values of the MARC-21 'tag' fields for the record 'recID'.""" out = [] bibXXx = "bib" + tag[0] + tag[1] + "x" bibrec_bibXXx = "bibrec_" + bibXXx query = """SELECT value FROM %s AS b, %s AS bb WHERE bb.id_bibrec=%s AND bb.id_bibxxx=b.id AND tag LIKE '%s'""" % (bibXXx, bibrec_bibXXx, recID, tag); res = run_sql(query) for row in res: out.append(row[0]) return out def clean(self): "Cleans the words table." self.value={} def put_into_db(self, mode="normal"): """Updates the current words table in the corresponding DB rnkWORD table. Mode 'normal' means normal execution, mode 'emergency' means words index reverting to old state. 
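# Illustrative sketch of the word-weight bookkeeping above: each field value is
# tokenized and every word accumulates a (weight, 0) tuple; dict_union() then
# merges the per-field dictionaries by adding the first component.  The
# tokenizer here is deliberately trimmed down (no stopwords, stemming, accent
# or HTML stripping, which the real get_words_from_phrase() handles).
import re

def words_from_phrase(phrase, weight):
    words = {}
    for word in re.sub(r"[\.\,\:\;\?\!\"]", " ", phrase.lower()).split():
        count = words.get(word, (0, 0))
        words[word] = (count[0] + weight, 0)
    return words

def dict_union_sketch(d1, d2):
    union = dict(d1)
    for word, count in d2.items():
        if word in union:
            union[word] = (union[word][0] + count[0], count[1])
        else:
            union[word] = count
    return union

title = words_from_phrase("Neutrino oscillations, revisited", 10)
abstract = words_from_phrase("neutrino mass and oscillations", 1)
print(dict_union_sketch(title, abstract))
# {'neutrino': (11, 0), 'oscillations': (11, 0), 'revisited': (10, 0), 'mass': (1, 0), 'and': (1, 0)}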
""" write_message("%s %s wordtable flush started" % (self.tablename,mode)) write_message('...updating %d words into %sR started' % \ (len(self.value), self.tablename[:-1])) task_update_progress("%s flushed %d/%d words" % (self.tablename, 0, len(self.value))) self.recIDs_in_mem = beautify_range_list(self.recIDs_in_mem) if mode == "normal": for group in self.recIDs_in_mem: query = """UPDATE %sR SET type='TEMPORARY' WHERE id_bibrec BETWEEN '%d' AND '%d' AND type='CURRENT'""" % \ (self.tablename[:-1], group[0], group[1]) write_message(query, verbose=9) run_sql(query) nb_words_total = len(self.value) nb_words_report = int(nb_words_total/10) nb_words_done = 0 for word in self.value.keys(): self.put_word_into_db(word, self.value[word]) nb_words_done += 1 if nb_words_report!=0 and ((nb_words_done % nb_words_report) == 0): write_message('......processed %d/%d words' % (nb_words_done, nb_words_total)) task_update_progress("%s flushed %d/%d words" % (self.tablename, nb_words_done, nb_words_total)) write_message('...updating %d words into %s ended' % \ (nb_words_total, self.tablename), verbose=9) #if options["verbose"]: # write_message('...updating reverse table %sR started' % self.tablename[:-1]) if mode == "normal": for group in self.recIDs_in_mem: query = """UPDATE %sR SET type='CURRENT' WHERE id_bibrec BETWEEN '%d' AND '%d' AND type='FUTURE'""" % \ (self.tablename[:-1], group[0], group[1]) write_message(query, verbose=9) run_sql(query) query = """DELETE FROM %sR WHERE id_bibrec BETWEEN '%d' AND '%d' AND type='TEMPORARY'""" % \ (self.tablename[:-1], group[0], group[1]) write_message(query, verbose=9) run_sql(query) write_message('End of updating wordTable into %s' % self.tablename, verbose=9) elif mode == "emergency": write_message("emergency") for group in self.recIDs_in_mem: query = """UPDATE %sR SET type='CURRENT' WHERE id_bibrec BETWEEN '%d' AND '%d' AND type='TEMPORARY'""" % \ (self.tablename[:-1], group[0], group[1]) write_message(query, verbose=9) run_sql(query) query = """DELETE FROM %sR WHERE id_bibrec BETWEEN '%d' AND '%d' AND type='FUTURE'""" % \ (self.tablename[:-1], group[0], group[1]) write_message(query, verbose=9) run_sql(query) write_message('End of emergency flushing wordTable into %s' % self.tablename, verbose=9) #if options["verbose"]: # write_message('...updating reverse table %sR ended' % self.tablename[:-1]) self.clean() self.recIDs_in_mem = [] write_message("%s %s wordtable flush ended" % (self.tablename, mode)) task_update_progress("%s flush ended" % (self.tablename)) def load_old_recIDs(self,word): """Load existing hitlist for the word from the database index files.""" query = "SELECT hitlist FROM %s WHERE term=%%s" % self.tablename res = run_sql(query, (word,)) if res: return deserialize_via_marshal(res[0][0]) else: return None def merge_with_old_recIDs(self,word,recIDs, set): """Merge the system numbers stored in memory (hash of recIDs with value[0] > 0 or -1 according to whether to add/delete them) with those stored in the database index and received in set universe of recIDs for the given word. Return 0 in case no change was done to SET, return 1 in case SET was changed. 
""" set_changed_p = 0 for recID,sign in recIDs.iteritems(): if sign[0] == -1 and set.has_key(recID): # delete recID if existent in set and if marked as to be deleted del set[recID] set_changed_p = 1 elif sign[0] > -1 and not set.has_key(recID): # add recID if not existent in set and if marked as to be added set[recID] = sign set_changed_p = 1 elif sign[0] > -1 and sign[0] != set[recID][0]: set[recID] = sign set_changed_p = 1 return set_changed_p def put_word_into_db(self, word, recIDs, split=str.split): """Flush a single word to the database and delete it from memory""" set = self.load_old_recIDs(word) #write_message("%s %s" % (word, self.value[word])) if set is not None: # merge the word recIDs found in memory: options["modified_words"][word] = 1 if not self.merge_with_old_recIDs(word, recIDs, set): # nothing to update: write_message("......... unchanged hitlist for ``%s''" % word, verbose=9) pass else: # yes there were some new words: write_message("......... updating hitlist for ``%s''" % word, verbose=9) - run_sql("UPDATE %s SET hitlist=%%s WHERE term=%%s" % self.tablename, + run_sql("UPDATE %s SET hitlist=_binary %%s WHERE term=%%s" % self.tablename, (serialize_via_marshal(set), word)) else: # the word is new, will create new set: write_message("......... inserting hitlist for ``%s''" % word, verbose=9) set = self.value[word] if len(set) > 0: #new word, add to list options["modified_words"][word] = 1 try: - run_sql("INSERT INTO %s (term, hitlist) VALUES (%%s, %%s)" % self.tablename, + run_sql("INSERT INTO %s (term, hitlist) VALUES (%%s, _binary %%s)" % self.tablename, (word, serialize_via_marshal(set))) except Exception, e: ## FIXME: This is for debugging encoding errors register_exception(prefix="Error when putting the term '%s' into db (hitlist=%s): %s\n" % (repr(word), set, e), alert_admin=True) if not set: # never store empty words run_sql("DELETE from %s WHERE term=%%s" % self.tablename, (word,)) del self.value[word] def display(self): "Displays the word table." keys = self.value.keys() keys.sort() for k in keys: write_message("%s: %s" % (k, self.value[k])) def count(self): "Returns the number of words in the table." return len(self.value) def info(self): "Prints some information on the words table." write_message("The words table contains %d words." % self.count()) def lookup_words(self, word=""): "Lookup word from the words table." if not word: done = 0 while not done: try: word = raw_input("Enter word: ") done = 1 except (EOFError, KeyboardInterrupt): return if self.value.has_key(word): write_message("The word '%s' is found %d times." \ % (word, len(self.value[word]))) else: write_message("The word '%s' does not exist in the word file."\ % word) def update_last_updated(self, rank_method_code, starting_time=None): """Update last_updated column of the index table in the database. Puts starting time there so that if the task was interrupted for record download, the records will be reindexed next time.""" if starting_time is None: return None write_message("updating last_updated to %s..." % starting_time, verbose=9) return run_sql("UPDATE rnkMETHOD SET last_updated=%s WHERE name=%s", (starting_time, rank_method_code,)) def add_recIDs(self, recIDs): """Fetches records which id in the recIDs arange list and adds them to the wordTable. The recIDs arange list is of the form: [[i1_low,i1_high],[i2_low,i2_high], ..., [iN_low,iN_high]]. 
""" global chunksize flush_count = 0 records_done = 0 records_to_go = 0 for arange in recIDs: records_to_go = records_to_go + arange[1] - arange[0] + 1 time_started = time.time() # will measure profile time for arange in recIDs: i_low = arange[0] chunksize_count = 0 while i_low <= arange[1]: # calculate chunk group of recIDs and treat it: i_high = min(i_low+task_get_option("flush")-flush_count-1,arange[1]) i_high = min(i_low+chunksize-chunksize_count-1, i_high) try: self.chk_recID_range(i_low, i_high) except StandardError, e: write_message("Exception caught: %s" % e, sys.stderr) register_exception() task_update_status("ERROR") sys.exit(1) write_message("%s adding records #%d-#%d started" % \ (self.tablename, i_low, i_high)) if CFG_CHECK_MYSQL_THREADS: kill_sleepy_mysql_threads() task_update_progress("%s adding recs %d-%d" % (self.tablename, i_low, i_high)) self.del_recID_range(i_low, i_high) just_processed = self.add_recID_range(i_low, i_high) flush_count = flush_count + i_high - i_low + 1 chunksize_count = chunksize_count + i_high - i_low + 1 records_done = records_done + just_processed write_message("%s adding records #%d-#%d ended " % \ (self.tablename, i_low, i_high)) if chunksize_count >= chunksize: chunksize_count = 0 # flush if necessary: if flush_count >= task_get_option("flush"): self.put_into_db() self.clean() write_message("%s backing up" % (self.tablename)) flush_count = 0 self.log_progress(time_started,records_done,records_to_go) # iterate: i_low = i_high + 1 if flush_count > 0: self.put_into_db() self.log_progress(time_started,records_done,records_to_go) def add_recIDs_by_date(self, dates=""): """Add recIDs modified between DATES[0] and DATES[1]. If DATES is not set, then add records modified since the last run of the ranking method. """ if not dates: write_message("Using the last update time for the rank method") query = """SELECT last_updated FROM rnkMETHOD WHERE name='%s' """ % options["current_run"] res = run_sql(query) if not res: return if not res[0][0]: dates = ("0000-00-00",'') else: dates = (res[0][0],'') query = """SELECT b.id FROM bibrec AS b WHERE b.modification_date >= '%s'""" % dates[0] if dates[1]: query += "and b.modification_date <= '%s'" % dates[1] query += " ORDER BY b.id ASC""" res = run_sql(query) alist = create_range_list([row[0] for row in res]) if not alist: write_message( "No new records added. %s is up to date" % self.tablename) else: self.add_recIDs(alist) return alist def add_recID_range(self, recID1, recID2): """Add records from RECID1 to RECID2.""" wlist = {} normalize = {} self.recIDs_in_mem.append([recID1,recID2]) # secondly fetch all needed tags: for (tag, weight, lang) in self.fields_to_index: if tag in tagToWordsFunctions.keys(): get_words_function = tagToWordsFunctions[tag] else: get_words_function = get_words_from_phrase bibXXx = "bib" + tag[0] + tag[1] + "x" bibrec_bibXXx = "bibrec_" + bibXXx query = """SELECT bb.id_bibrec,b.value FROM %s AS b, %s AS bb WHERE bb.id_bibrec BETWEEN %d AND %d AND bb.id_bibxxx=b.id AND tag LIKE '%s'""" % (bibXXx, bibrec_bibXXx, recID1, recID2, tag) res = run_sql(query) nb_total_to_read = len(res) verbose_idx = 0 # for verbose pretty printing for row in res: recID, phrase = row if recID in options["validset"]: if not wlist.has_key(recID): wlist[recID] = {} new_words = get_words_function(phrase, weight, lang) # ,self.separators wlist[recID] = dict_union(new_words,wlist[recID]) # were there some words for these recIDs found? 
if len(wlist) == 0: return 0 recIDs = wlist.keys() for recID in recIDs: # was this record marked as deleted? if "DELETED" in self.get_field(recID, "980__c"): wlist[recID] = {} write_message("... record %d was declared deleted, removing its word list" % recID, verbose=9) write_message("... record %d, termlist: %s" % (recID, wlist[recID]), verbose=9) # put words into reverse index table with FUTURE status: for recID in recIDs: - run_sql("INSERT INTO %sR (id_bibrec,termlist,type) VALUES (%%s,%%s,'FUTURE')" % self.tablename[:-1], + run_sql("INSERT INTO %sR (id_bibrec,termlist,type) VALUES (%%s,_binary %%s,'FUTURE')" % self.tablename[:-1], (recID, serialize_via_marshal(wlist[recID]))) # ... and, for new records, enter the CURRENT status as empty: try: - run_sql("INSERT INTO %sR (id_bibrec,termlist,type) VALUES (%%s,%%s,'CURRENT')" % self.tablename[:-1], + run_sql("INSERT INTO %sR (id_bibrec,termlist,type) VALUES (%%s,_binary %%s,'CURRENT')" % self.tablename[:-1], (recID, serialize_via_marshal([]))) except DatabaseError: # okay, it's an already existing record, no problem pass # put words into memory word list: put = self.put for recID in recIDs: for (w, count) in wlist[recID].iteritems(): put(recID, w, count) return len(recIDs) def log_progress(self, start, done, todo): """Calculate progress and store it. start: start time, done: records processed, todo: total number of records""" time_elapsed = time.time() - start # consistency check if time_elapsed == 0 or done > todo: return time_recs_per_min = done/(time_elapsed/60.0) write_message("%d records took %.1f seconds to complete.(%1.f recs/min)"\ % (done, time_elapsed, time_recs_per_min)) if time_recs_per_min: write_message("Estimated runtime: %.1f minutes" % \ ((todo-done)/time_recs_per_min)) def put(self, recID, word, sign): "Adds/deletes a word to the word list." try: word = wash_index_term(word) if self.value.has_key(word): # the word 'word' exist already: update sign self.value[word][recID] = sign # PROBLEM ? else: self.value[word] = {recID: sign} except: write_message("Error: Cannot put word %s with sign %d for recID %s." % (word, sign, recID)) def del_recIDs(self, recIDs): """Fetches records which id in the recIDs range list and adds them to the wordTable. The recIDs range list is of the form: [[i1_low,i1_high],[i2_low,i2_high], ..., [iN_low,iN_high]]. """ count = 0 for range in recIDs: self.del_recID_range(range[0],range[1]) count = count + range[1] - range[0] self.put_into_db() def del_recID_range(self, low, high): """Deletes records with 'recID' system number between low and high from memory words index table.""" write_message("%s fetching existing words for records #%d-#%d started" % \ (self.tablename, low, high), verbose=3) self.recIDs_in_mem.append([low,high]) query = """SELECT id_bibrec,termlist FROM %sR as bb WHERE bb.id_bibrec BETWEEN '%d' AND '%d'""" % (self.tablename[:-1], low, high) recID_rows = run_sql(query) for recID_row in recID_rows: recID = recID_row[0] wlist = deserialize_via_marshal(recID_row[1]) for word in wlist: self.put(recID, word, (-1, 0)) write_message("%s fetching existing words for records #%d-#%d ended" % \ (self.tablename, low, high), verbose=3) def check_bad_words(self): """ Finds bad words in reverse tables. Returns the number of bad words. """ query = """SELECT count(1) FROM %sR WHERE type IN ('TEMPORARY','FUTURE')""" % (self.tablename[:-1]) res = run_sql(query) return res[0][0] def report_on_table_consistency(self): """Check reverse words index tables (e.g. 
rnkWORD01R) for interesting states such as 'TEMPORARY' state. Prints small report (no of words, no of bad words). """ # find number of words: query = """SELECT COUNT(*) FROM %s""" % (self.tablename) res = run_sql(query, None, 1) if res: nb_words = res[0][0] else: nb_words = 0 # report stats: write_message("%s contains %d words" % (self.tablename, nb_words)) # find possible bad states in reverse tables: nb_bad_words = self.check_bad_words() if nb_bad_words: write_message("EMERGENCY: %s needs to repair %d of %d index records" % (self.tablename, nb_bad_words, nb_words)) else: write_message("%s is in consistent state" % (self.tablename)) def repair(self): """Repair the whole table""" # find possible bad states in reverse tables: if self.check_bad_words() == 0: return query = """SELECT id_bibrec FROM %sR WHERE type in ('TEMPORARY','FUTURE')""" \ % (self.tablename[:-1]) res = intbitset(run_sql(query)) recIDs = create_range_list(list(res)) flush_count = 0 records_done = 0 records_to_go = 0 for range in recIDs: records_to_go = records_to_go + range[1] - range[0] + 1 time_started = time.time() # will measure profile time for range in recIDs: i_low = range[0] chunksize_count = 0 while i_low <= range[1]: # calculate chunk group of recIDs and treat it: i_high = min(i_low+task_get_option("flush")-flush_count-1,range[1]) i_high = min(i_low+chunksize-chunksize_count-1, i_high) try: self.fix_recID_range(i_low, i_high) except StandardError, e: write_message("Exception caught: %s" % e, sys.stderr) register_exception() task_update_status("ERROR") sys.exit(1) flush_count = flush_count + i_high - i_low + 1 chunksize_count = chunksize_count + i_high - i_low + 1 records_done = records_done + i_high - i_low + 1 if chunksize_count >= chunksize: chunksize_count = 0 # flush if necessary: if flush_count >= task_get_option("flush"): self.put_into_db("emergency") self.clean() flush_count = 0 self.log_progress(time_started,records_done,records_to_go) # iterate: i_low = i_high + 1 if flush_count > 0: self.put_into_db("emergency") self.log_progress(time_started,records_done,records_to_go) write_message("%s inconsistencies repaired." % self.tablename) def chk_recID_range(self, low, high): """Check if the reverse index table is in proper state""" ## check db query = """SELECT COUNT(*) FROM %sR WHERE type <> 'CURRENT' AND id_bibrec BETWEEN '%d' AND '%d'""" % (self.tablename[:-1], low, high) res = run_sql(query, None, 1) if res[0][0]==0: write_message("%s for %d-%d is in consistent state"%(self.tablename,low,high)) return # okay, words table is consistent ## inconsistency detected! write_message("EMERGENCY: %s inconsistencies detected..." % self.tablename) write_message("""EMERGENCY: Errors found. You should check consistency of the %s - %sR tables.\nRunning 'bibrank --repair' is recommended.""" \ % (self.tablename, self.tablename[:-1])) raise StandardError def fix_recID_range(self, low, high): """Try to fix reverse index database consistency (e.g. table rnkWORD01R) in the low,high doc-id range. Possible states for a recID follow: CUR TMP FUT: very bad things have happened: warn! CUR TMP : very bad things have happened: warn! CUR FUT: delete FUT (crash before flushing) CUR : database is ok TMP FUT: add TMP to memory and del FUT from memory flush (revert to old state) TMP : very bad things have happened: warn! FUT: very bad things have happended: warn! 
""" state = {} query = "SELECT id_bibrec,type FROM %sR WHERE id_bibrec BETWEEN '%d' AND '%d'"\ % (self.tablename[:-1], low, high) res = run_sql(query) for row in res: if not state.has_key(row[0]): state[row[0]]=[] state[row[0]].append(row[1]) ok = 1 # will hold info on whether we will be able to repair for recID in state.keys(): if not 'TEMPORARY' in state[recID]: if 'FUTURE' in state[recID]: if 'CURRENT' not in state[recID]: write_message("EMERGENCY: Index record %d is in inconsistent state. Can't repair it" % recID) ok = 0 else: write_message("EMERGENCY: Inconsistency in index record %d detected" % recID) query = """DELETE FROM %sR WHERE id_bibrec='%d'""" % (self.tablename[:-1], recID) run_sql(query) write_message("EMERGENCY: Inconsistency in index record %d repaired." % recID) else: if 'FUTURE' in state[recID] and not 'CURRENT' in state[recID]: self.recIDs_in_mem.append([recID,recID]) # Get the words file query = """SELECT type,termlist FROM %sR WHERE id_bibrec='%d'""" % (self.tablename[:-1], recID) write_message(query, verbose=9) res = run_sql(query) for row in res: wlist = deserialize_via_marshal(row[1]) write_message("Words are %s " % wlist, verbose=9) if row[0] == 'TEMPORARY': sign = 1 else: sign = -1 for word in wlist: self.put(recID, word, wlist[word]) else: write_message("EMERGENCY: %s for %d is in inconsistent state. Couldn't repair it." % (self.tablename, recID)) ok = 0 if not ok: write_message("""EMERGENCY: Unrepairable errors found. You should check consistency of the %s - %sR tables. Deleting affected TEMPORARY and FUTURE entries from these tables is recommended; see the BibIndex Admin Guide. (The repairing procedure is similar for bibrank word indexes.)""" % (self.tablename, self.tablename[:-1])) raise StandardError def word_index(run): """Run the indexing task. The row argument is the BibSched task queue row, containing if, arguments, etc. Return 1 in case of success and 0 in case of failure. 
""" global languages max_recid = 0 res = run_sql("SELECT max(id) FROM bibrec") if res and res[0][0]: max_recid = int(res[0][0]) options["run"] = [] options["run"].append(run) for rank_method_code in options["run"]: task_sleep_now_if_required(can_stop_too=True) method_starting_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) write_message("Running rank method: %s" % getName(rank_method_code)) try: file = CFG_ETCDIR + "/bibrank/" + rank_method_code + ".cfg" config = ConfigParser.ConfigParser() config.readfp(open(file)) except StandardError, e: write_message("Cannot find configurationfile: %s" % file, sys.stderr) raise StandardError options["current_run"] = rank_method_code options["modified_words"] = {} options["table"] = config.get(config.get("rank_method", "function"), "table") options["use_stemming"] = config.get(config.get("rank_method","function"),"stemming") options["remove_stopword"] = config.get(config.get("rank_method","function"),"stopword") tags = get_tags(config) #get the tags to include options["validset"] = get_valid_range(rank_method_code) #get the records from the collections the method is enabled for function = config.get("rank_method","function") wordTable = WordTable(options["table"], tags) wordTable.report_on_table_consistency() try: if task_get_option("cmd") == "del": if task_get_option("id"): wordTable.del_recIDs(task_get_option("id")) task_sleep_now_if_required(can_stop_too=True) elif task_get_option("collection"): l_of_colls = task_get_option("collection").split(",") recIDs = perform_request_search(c=l_of_colls) recIDs_range = [] for recID in recIDs: recIDs_range.append([recID,recID]) wordTable.del_recIDs(recIDs_range) task_sleep_now_if_required(can_stop_too=True) else: write_message("Missing IDs of records to delete from index %s.", wordTable.tablename, sys.stderr) raise StandardError elif task_get_option("cmd") == "add": if task_get_option("id"): wordTable.add_recIDs(task_get_option("id")) task_sleep_now_if_required(can_stop_too=True) elif task_get_option("collection"): l_of_colls = task_get_option("collection").split(",") recIDs = perform_request_search(c=l_of_colls) recIDs_range = [] for recID in recIDs: recIDs_range.append([recID,recID]) wordTable.add_recIDs(recIDs_range) task_sleep_now_if_required(can_stop_too=True) elif task_get_option("last_updated"): wordTable.add_recIDs_by_date("") # only update last_updated if run via automatic mode: wordTable.update_last_updated(rank_method_code, method_starting_time) task_sleep_now_if_required(can_stop_too=True) elif task_get_option("modified"): wordTable.add_recIDs_by_date(task_get_option("modified")) task_sleep_now_if_required(can_stop_too=True) else: wordTable.add_recIDs([[0,max_recid]]) task_sleep_now_if_required(can_stop_too=True) elif task_get_option("cmd") == "repair": wordTable.repair() check_rnkWORD(options["table"]) task_sleep_now_if_required(can_stop_too=True) elif task_get_option("cmd") == "check": check_rnkWORD(options["table"]) options["modified_words"] = {} task_sleep_now_if_required(can_stop_too=True) elif task_get_option("cmd") == "stat": rank_method_code_statistics(options["table"]) task_sleep_now_if_required(can_stop_too=True) else: write_message("Invalid command found processing %s" % \ wordTable.tablename, sys.stderr) raise StandardError update_rnkWORD(options["table"], options["modified_words"]) task_sleep_now_if_required(can_stop_too=True) except StandardError, e: register_exception(alert_admin=True) write_message("Exception caught: %s" % e, sys.stderr) sys.exit(1) 
wordTable.report_on_table_consistency() # We are done. State it in the database, close and quit return 1 def get_tags(config): """Get the tags that should be used creating the index and each tag's parameter""" tags = [] function = config.get("rank_method","function") i = 1 shown_error = 0 #try: if 1: while config.has_option(function,"tag%s"% i): tag = config.get(function, "tag%s" % i) tag = tag.split(",") tag[1] = int(tag[1].strip()) tag[2] = tag[2].strip() #check if stemmer for language is available if config.get(function, "stemming") and stem("information", "en") != "inform": if shown_error == 0: write_message("Warning: Stemming not working. Please check it out!") shown_error = 1 elif tag[2] and tag[2] != "none" and config.get(function,"stemming") and not is_stemmer_available_for_language(tag[2]): write_message("Warning: Stemming not available for language '%s'." % tag[2]) tags.append(tag) i += 1 #except Exception: # write_message("Could not read data from configuration file, please check for errors") # raise StandardError return tags def get_valid_range(rank_method_code): """Returns which records are valid for this rank method, according to which collections it is enabled for.""" #if options["verbose"] >=9: # write_message("Getting records from collections enabled for rank method.") #res = run_sql("SELECT collection.name FROM collection,collection_rnkMETHOD,rnkMETHOD WHERE collection.id=id_collection and id_rnkMETHOD=rnkMETHOD.id and rnkMETHOD.name='%s'" % rank_method_code) #l_of_colls = [] #for coll in res: # l_of_colls.append(coll[0]) #if len(l_of_colls) > 0: # recIDs = perform_request_search(c=l_of_colls) #else: # recIDs = [] valid = intbitset(trailing_bits=1) valid.discard(0) #valid.addlist(recIDs) return valid def check_term(term, termlength): """Check if term contains not allowed characters, or for any other reasons for not using this term.""" try: if len(term) <= termlength: return False reg = re.compile(r"[1234567890\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~]") if re.search(reg, term): return False term = str.replace(term, "-", "") term = str.replace(term, ".", "") term = str.replace(term, ",", "") if int(term): return False except StandardError, e: pass return True def check_rnkWORD(table): """Checks for any problems in rnkWORD tables.""" i = 0 errors = {} termslist = run_sql("SELECT term FROM %s" % table) N = run_sql("select max(id_bibrec) from %sR" % table[:-1])[0][0] write_message("Checking integrity of rank values in %s" % table) terms = map(lambda x: x[0], termslist) while i < len(terms): query_params = () for j in range(i, ((i+5000)< len(terms) and (i+5000) or len(terms))): query_params += (terms[j],) terms_docs = run_sql("SELECT term, hitlist FROM %s WHERE term IN (%s)" % (table, (len(query_params)*"%s,")[:-1]), query_params) for (t, hitlist) in terms_docs: term_docs = deserialize_via_marshal(hitlist) if (term_docs.has_key("Gi") and term_docs["Gi"][1] == 0) or not term_docs.has_key("Gi"): write_message("ERROR: Missing value for term: %s (%s) in %s: %s" % (t, repr(t), table, len(term_docs))) errors[t] = 1 i += 5000 write_message("Checking integrity of rank values in %sR" % table[:-1]) i = 0 while i < N: docs_terms = run_sql("SELECT id_bibrec, termlist FROM %sR WHERE id_bibrec>=%s and id_bibrec<=%s" % (table[:-1], i, i+5000)) for (j, termlist) in docs_terms: termlist = deserialize_via_marshal(termlist) for (t, tf) in termlist.iteritems(): if tf[1] == 0 and not errors.has_key(t): errors[t] = 1 write_message("ERROR: Gi missing for record %s and term: %s 
(%s) in %s" % (j,t,repr(t), table)) terms_docs = run_sql("SELECT term, hitlist FROM %s WHERE term=%%s" % table, (t,)) termlist = deserialize_via_marshal(terms_docs[0][1]) i += 5000 if len(errors) == 0: write_message("No direct errors found, but nonconsistent data may exist.") else: write_message("%s errors found during integrity check, repair and rebalancing recommended." % len(errors)) options["modified_words"] = errors def rank_method_code_statistics(table): """Shows some statistics about this rank method.""" maxID = run_sql("select max(id) from %s" % table) maxID = maxID[0][0] terms = {} Gi = {} write_message("Showing statistics of terms in index:") write_message("Important: For the 'Least used terms', the number of terms is shown first, and the number of occurences second.") write_message("Least used terms---Most important terms---Least important terms") i = 0 while i < maxID: terms_docs=run_sql("SELECT term, hitlist FROM %s WHERE id>= %s and id < %s" % (table, i, i + 10000)) for (t, hitlist) in terms_docs: term_docs=deserialize_via_marshal(hitlist) terms[len(term_docs)] = terms.get(len(term_docs), 0) + 1 if term_docs.has_key("Gi"): Gi[t] = term_docs["Gi"] i=i + 10000 terms=terms.items() terms.sort(lambda x, y: cmp(y[1], x[1])) Gi=Gi.items() Gi.sort(lambda x, y: cmp(y[1], x[1])) for i in range(0, 20): write_message("%s/%s---%s---%s" % (terms[i][0],terms[i][1], Gi[i][0],Gi[len(Gi) - i - 1][0])) def update_rnkWORD(table, terms): """Updates rnkWORDF and rnkWORDR with Gi and Nj values. For each term in rnkWORDF, a Gi value for the term is added. And for each term in each document, the Nj value for that document is added. In rnkWORDR, the Gi value for each term in each document is added. For description on how things are computed, look in the hacking docs. table - name of forward index to update terms - modified terms""" zero_division_msg = """\ ERROR: %s captured. This might be caused by not enough balanced indexes. Please, schedule a regular, e.g. weekly, rebalancing of the word similarity ranking indexes, by using e.g. 
"bibrank -f50000 -R -wwrd -s14d -LSunday" as recommended in %s/help/admin/howto-run""" stime = time.time() Gi = {} Nj = {} N = run_sql("select count(id_bibrec) from %sR" % table[:-1])[0][0] if len(terms) == 0 and task_get_option("quick") == "yes": write_message("No terms to process, ending...") return "" elif task_get_option("quick") == "yes": #not used -R option, fast calculation (not accurate) write_message("Beginning post-processing of %s terms" % len(terms)) #Locating all documents related to the modified/new/deleted terms, if fast update, #only take into account new/modified occurences write_message("Phase 1: Finding records containing modified terms") terms = terms.keys() i = 0 while i < len(terms): terms_docs = get_from_forward_index(terms, i, (i+5000), table) for (t, hitlist) in terms_docs: term_docs = deserialize_via_marshal(hitlist) if term_docs.has_key("Gi"): del term_docs["Gi"] for (j, tf) in term_docs.iteritems(): if (task_get_option("quick") == "yes" and tf[1] == 0) or task_get_option("quick") == "no": Nj[j] = 0 write_message("Phase 1: ......processed %s/%s terms" % ((i+5000>len(terms) and len(terms) or (i+5000)), len(terms))) i += 5000 write_message("Phase 1: Finished finding records containing modified terms") #Find all terms in the records found in last phase write_message("Phase 2: Finding all terms in affected records") records = Nj.keys() i = 0 while i < len(records): docs_terms = get_from_reverse_index(records, i, (i + 5000), table) for (j, termlist) in docs_terms: doc_terms = deserialize_via_marshal(termlist) for (t, tf) in doc_terms.iteritems(): Gi[t] = 0 write_message("Phase 2: ......processed %s/%s records " % ((i+5000>len(records) and len(records) or (i+5000)), len(records))) i += 5000 write_message("Phase 2: Finished finding all terms in affected records") else: #recalculate max_id = run_sql("SELECT MAX(id) FROM %s" % table) max_id = max_id[0][0] write_message("Beginning recalculation of %s terms" % max_id) terms = [] i = 0 while i < max_id: terms_docs = get_from_forward_index_with_id(i, (i+5000), table) for (t, hitlist) in terms_docs: Gi[t] = 0 term_docs = deserialize_via_marshal(hitlist) if term_docs.has_key("Gi"): del term_docs["Gi"] for (j, tf) in term_docs.iteritems(): Nj[j] = 0 write_message("Phase 1: ......processed %s/%s terms" % ((i+5000)>max_id and max_id or (i+5000), max_id)) i += 5000 write_message("Phase 1: Finished finding which records contains which terms") write_message("Phase 2: Jumping over..already done in phase 1 because of -R option") terms = Gi.keys() Gi = {} i = 0 if task_get_option("quick") == "no": #Calculating Fi and Gi value for each term write_message("Phase 3: Calculating importance of all affected terms") while i < len(terms): terms_docs = get_from_forward_index(terms, i, (i+5000), table) for (t, hitlist) in terms_docs: term_docs = deserialize_via_marshal(hitlist) if term_docs.has_key("Gi"): del term_docs["Gi"] Fi = 0 Gi[t] = 1 for (j, tf) in term_docs.iteritems(): Fi += tf[0] for (j, tf) in term_docs.iteritems(): if tf[0] != Fi: Gi[t] = Gi[t] + ((float(tf[0]) / Fi) * math.log(float(tf[0]) / Fi) / math.log(2)) / math.log(N) write_message("Phase 3: ......processed %s/%s terms" % ((i+5000>len(terms) and len(terms) or (i+5000)), len(terms))) i += 5000 write_message("Phase 3: Finished calculating importance of all affected terms") else: #Using existing Gi value instead of calculating a new one. Missing some accurancy. 
write_message("Phase 3: Getting approximate importance of all affected terms") while i < len(terms): terms_docs = get_from_forward_index(terms, i, (i+5000), table) for (t, hitlist) in terms_docs: term_docs = deserialize_via_marshal(hitlist) if term_docs.has_key("Gi"): Gi[t] = term_docs["Gi"][1] elif len(term_docs) == 1: Gi[t] = 1 else: Fi = 0 Gi[t] = 1 for (j, tf) in term_docs.iteritems(): Fi += tf[0] for (j, tf) in term_docs.iteritems(): if tf[0] != Fi: Gi[t] = Gi[t] + ((float(tf[0]) / Fi) * math.log(float(tf[0]) / Fi) / math.log(2)) / math.log(N) write_message("Phase 3: ......processed %s/%s terms" % ((i+5000>len(terms) and len(terms) or (i+5000)), len(terms))) i += 5000 write_message("Phase 3: Finished getting approximate importance of all affected terms") write_message("Phase 4: Calculating normalization value for all affected records and updating %sR" % table[:-1]) records = Nj.keys() i = 0 while i < len(records): #Calculating the normalization value for each document, and adding the Gi value to each term in each document. docs_terms = get_from_reverse_index(records, i, (i + 5000), table) for (j, termlist) in docs_terms: doc_terms = deserialize_via_marshal(termlist) try: for (t, tf) in doc_terms.iteritems(): if Gi.has_key(t): Nj[j] = Nj.get(j, 0) + math.pow(Gi[t] * (1 + math.log(tf[0])), 2) Git = int(math.floor(Gi[t]*100)) if Git >= 0: Git += 1 doc_terms[t] = (tf[0], Git) else: Nj[j] = Nj.get(j, 0) + math.pow(tf[1] * (1 + math.log(tf[0])), 2) Nj[j] = 1.0 / math.sqrt(Nj[j]) Nj[j] = int(Nj[j] * 100) if Nj[j] >= 0: Nj[j] += 1 - run_sql("UPDATE %sR SET termlist=%%s WHERE id_bibrec=%%s" % table[:-1], + run_sql("UPDATE %sR SET termlist=_binary %%s WHERE id_bibrec=%%s" % table[:-1], (serialize_via_marshal(doc_terms), j)) except (ZeroDivisionError, OverflowError), e: ## This is to try to isolate division by zero errors. write_message(zero_division_msg % (e, CFG_SITE_URL), stream=sys.stderr) register_exception(prefix=zero_division_msg % (e, CFG_SITE_URL), alert_admin=True) write_message("Phase 4: ......processed %s/%s records" % ((i+5000>len(records) and len(records) or (i+5000)), len(records))) i += 5000 write_message("Phase 4: Finished calculating normalization value for all affected records and updating %sR" % table[:-1]) write_message("Phase 5: Updating %s with new normalization values" % table) i = 0 terms = Gi.keys() while i < len(terms): #Adding the Gi value to each term, and adding the normalization value to each term in each document. 
terms_docs = get_from_forward_index(terms, i, (i+5000), table) for (t, hitlist) in terms_docs: try: term_docs = deserialize_via_marshal(hitlist) if term_docs.has_key("Gi"): del term_docs["Gi"] for (j, tf) in term_docs.iteritems(): if Nj.has_key(j): term_docs[j] = (tf[0], Nj[j]) Git = int(math.floor(Gi[t]*100)) if Git >= 0: Git += 1 term_docs["Gi"] = (0, Git) - run_sql("UPDATE %s SET hitlist=%%s WHERE term=%%s" % table, + run_sql("UPDATE %s SET hitlist=_binary %%s WHERE term=%%s" % table, (serialize_via_marshal(term_docs), t)) except (ZeroDivisionError, OverflowError), e: write_message(zero_division_msg % (e, CFG_SITE_URL), stream=sys.stderr) register_exception(prefix=zero_division_msg % (e, CFG_SITE_URL), alert_admin=True) write_message("Phase 5: ......processed %s/%s terms" % ((i+5000>len(terms) and len(terms) or (i+5000)), len(terms))) i += 5000 write_message("Phase 5: Finished updating %s with new normalization values" % table) write_message("Time used for post-processing: %.1fmin" % ((time.time() - stime) / 60)) write_message("Finished post-processing") def get_from_forward_index(terms, start, stop, table): terms_docs = () for j in range(start, (stop < len(terms) and stop or len(terms))): terms_docs += run_sql("SELECT term, hitlist FROM %s WHERE term=%%s" % table, (terms[j],)) return terms_docs def get_from_forward_index_with_id(start, stop, table): terms_docs = run_sql("SELECT term, hitlist FROM %s WHERE id BETWEEN %s AND %s" % (table, start, stop)) return terms_docs def get_from_reverse_index(records, start, stop, table): current_recs = "%s" % records[start:stop] current_recs = current_recs[1:-1] docs_terms = run_sql("SELECT id_bibrec, termlist FROM %sR WHERE id_bibrec IN (%s)" % (table[:-1], current_recs)) return docs_terms #def test_word_separators(phrase="hep-th/0101001"): #"""Tests word separating policy on various input.""" #print "%s:" % phrase #gwfp = get_words_from_phrase(phrase) #for (word, count) in gwfp.iteritems(): #print "\t-> %s - %s" % (word, count) def getName(methname, ln=CFG_SITE_LANG, type='ln'): """Returns the name of the rank method, either in default language or given language. methname = short name of the method ln - the language to get the name in type - which name "type" to get.""" try: rnkid = run_sql("SELECT id FROM rnkMETHOD where name='%s'" % methname) if rnkid: rnkid = str(rnkid[0][0]) res = run_sql("SELECT value FROM rnkMETHODNAME where type='%s' and ln='%s' and id_rnkMETHOD=%s" % (type, ln, rnkid)) if not res: res = run_sql("SELECT value FROM rnkMETHODNAME WHERE ln='%s' and id_rnkMETHOD=%s and type='%s'" % (CFG_SITE_LANG, rnkid, type)) if not res: return methname return res[0][0] else: raise Exception except Exception, e: write_message("Cannot run rank method, either given code for method is wrong, or it has not been added using the webinterface.") raise Exception def word_similarity(run): """Call correct method""" return word_index(run) diff --git a/modules/bibsort/lib/bibsort_engine.py b/modules/bibsort/lib/bibsort_engine.py index 6d0d9d959..9294ccf81 100644 --- a/modules/bibsort/lib/bibsort_engine.py +++ b/modules/bibsort/lib/bibsort_engine.py @@ -1,946 +1,946 @@ # -*- mode: python; coding: utf-8; -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012 CERN. +# Copyright (C) 2010, 2011, 2012, 2016 CERN. 
# # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """BibSort Engine""" import sys import time from invenio.dateutils import datetime, strftime from invenio.dbquery import deserialize_via_marshal, \ serialize_via_marshal, run_sql, Error from invenio.search_engine import get_field_tags, search_pattern from invenio.intbitset import intbitset from invenio.bibtask import write_message, task_update_progress, \ task_sleep_now_if_required from invenio.config import CFG_BIBSORT_BUCKETS, CFG_CERN_SITE from invenio.bibsort_washer import BibSortWasher, \ InvenioBibSortWasherNotImplementedError import invenio.template websearch_templates = invenio.template.load('websearch') #The space distance between elements, to make inserts faster CFG_BIBSORT_WEIGHT_DISTANCE = 8 def get_bibsort_methods_details(method_list = None): """Returns the id, definition, and washer for the methods in method_list. If no method_list is specified: we get all the data from bsrMETHOD table""" bibsort_methods = {} errors = False results = [] if not method_list: try: results = run_sql("SELECT id, name, definition, washer \ FROM bsrMETHOD") except Error, err: write_message("The error: [%s] occured while trying to read " \ "the bibsort data from the database." \ %err, stream=sys.stderr) return {}, True if not results: write_message("The bsrMETHOD table is empty.") return {}, errors else: for method in method_list: try: res = run_sql("""SELECT id, name, definition, washer \ FROM bsrMETHOD where name = %s""", (method, )) except Error, err: write_message("The error: [%s] occured while trying to get " \ "the bibsort data from the database for method %s." \ %(err, method), stream=sys.stderr) errors = True if not res: write_message("No information for method: %s." 
% method) else: results.append(res[0]) for item in results: bibsort_methods.setdefault(item[1], {})['id'] = item[0] bibsort_methods[item[1]]['definition'] = item[2] bibsort_methods[item[1]]['washer'] = item[3] return bibsort_methods, errors def get_all_recids(including_deleted=True):#6.68s on cdsdev """Returns a list of all records available in the system""" res = run_sql("SELECT id FROM bibrec") if not res: return intbitset([]) all_recs = intbitset(res) if not including_deleted: # we want to exclude deleted records if CFG_CERN_SITE: deleted = search_pattern(p='980__:"DELETED" OR 980__:"DUMMY"') else: deleted = search_pattern(p='980__:"DELETED"') all_recs.difference_update(deleted) return all_recs def get_max_recid(): """Returns the max id in bibrec - good approximation for the total number of records""" try: return run_sql("SELECT MAX(id) FROM bibrec")[0][0] except IndexError: return 0 def _get_values_from_marc_tag(tag, recids): '''Finds the value for a specific tag''' digits = tag[0:2] try: intdigits = int(digits) if intdigits < 0 or intdigits > 99: raise ValueError except ValueError: # invalid tag value asked for write_message('You have asked for an invalid tag value ' \ '[tag=%s; value=%s].' %(tag, intdigits), verbose=5) return [] bx = "bib%sx" % digits bibx = "bibrec_bib%sx" % digits max_recid = get_max_recid() if len(recids) == 1: to_append = '= %s' query_params = [recids.tolist()[0]] elif len(recids) < max_recid/3: # if we have less then one third of the records # use IN #This realy depends on how large the repository is.. to_append = 'IN %s' query_params = [tuple(recids)] else: # mysql might crush with big queries, better use BETWEEN to_append = 'BETWEEN %s AND %s' query_params = [1, max_recid] query = 'SELECT bibx.id_bibrec, bx.value \ FROM %s AS bx, %s AS bibx \ WHERE bibx.id_bibrec %s \ AND bx.id = bibx.id_bibxxx \ AND bx.tag LIKE %%s' % (bx, bibx, to_append) query_params.append(tag) res = run_sql(query, tuple(query_params)) return res def get_data_for_definition_marc(tags, recids): '''Having a list of tags and a list of recids, it returns a dictionary with the values correspondig to the tags''' #x = all_recids; [get_fieldvalues(recid, '037__a') for recid in x] #user: 140s, sys: 21s, total: 160s - cdsdev if isinstance(recids, (int, long)): recids = intbitset([recids, ]) # for each recid we need only one value #on which we sort, so we can stop looking for a value # as soon as we find one tag_index = 0 field_data_dict = {} while len(recids) > 0 and tag_index < len(tags): write_message('%s records queried for values for tags %s.' 
\ %(len(recids), tags), verbose=5) res = _get_values_from_marc_tag(tags[tag_index], recids) res_dict = dict(res) #field_data_dict.update(res_dict) #we can not use this, because res_dict might contain recids #that are already in field_data_dict, and we should not overwrite their value field_data_dict = dict(res_dict, **field_data_dict) #there might be keys that we do not want (ex: using 'between') #so we should remove them res_dict_keys = intbitset(res_dict.keys()) recids_not_needed = res_dict_keys.difference(recids) for recid in recids_not_needed: del field_data_dict[recid] #update the recids to contain only the recid that do not have values yet recids.difference_update(res_dict_keys) tag_index += 1 return field_data_dict def get_data_for_definition_rnk(method_name, rnk_name): '''Returns the dictionary with data for method_name ranking method''' try: res = run_sql('SELECT d.relevance_data \ from rnkMETHODDATA d, rnkMETHOD r WHERE \ d.id_rnkMETHOD = r.id AND \ r.name = %s', (rnk_name, )) if res and res[0]: write_message('Data extracted from table rnkMETHODDATA for sorting method %s' \ %method_name, verbose=5) return deserialize_via_marshal(res[0][0]) except Error, err: write_message("No data could be found for sorting method %s. " \ "The following errror occured: [%s]" \ %(method_name, err), stream=sys.stderr) return {} def get_data_for_definition_bibrec(column_name, recids_copy): '''Having a column_name and a list of recids, it returns a dictionary mapping each recids with its correspondig value from the column''' dict_column = {} for recid in recids_copy: creation_date = run_sql('SELECT %s from bibrec WHERE id = %%s' %column_name, (recid, ))[0][0] new_creation_date = datetime(creation_date.year,creation_date.month,creation_date.day, \ creation_date.hour,creation_date.minute, creation_date.second) dict_column[recid] = new_creation_date.strftime('%Y%m%d%H%M%S') return dict_column def get_field_data(recids, method_name, definition): """Returns the data associated with the definition for recids. The returned dictionary will contain ONLY the recids for which a value has been found in the database. """ recids_copy = recids.copy() #if we are dealing with a MARC definition if definition.startswith('MARC'): tags = definition.replace('MARC:', '').replace(' ', '').strip().split(',') if not tags: write_message('No MARC tags found for method %s.' \ %method_name, verbose=5) return {} write_message('The following MARC tags will be queried: %s' %tags, \ verbose=5) return get_data_for_definition_marc(tags, recids_copy) #if we are dealing with tags (ex: author, title) elif definition.startswith('FIELD'): tags = get_field_tags(definition.replace('FIELD:', '').strip()) if not tags: write_message('No tags found for method %s.' 
\ %method_name, verbose=5) return {} write_message('The following tags will be queried: %s' %tags, verbose=5) return get_data_for_definition_marc(tags, recids_copy) # if we are dealing with ranking data elif definition.startswith('RNK'): rnk_name = definition.replace('RNK:', '').strip() return get_data_for_definition_rnk(method_name, rnk_name) # if we are looking into bibrec table elif definition.startswith('BIBREC'): column_name = definition.replace('BIBREC:', '').strip() return get_data_for_definition_bibrec(column_name, recids_copy) else: write_message("The definition %s for method % could not be recognized" \ %(definition, method_name), stream=sys.stderr) return {} def apply_washer(data_dict, washer): '''The values are filtered using the washer function''' if not washer: return if washer.strip() == 'NOOP': return washer = washer.split(':')[0]#in case we have a locale defined try: method = BibSortWasher(washer) write_message('Washer method found: %s' %washer, verbose=5) for recid in data_dict: new_val = method.get_transformed_value(data_dict[recid]) data_dict[recid] = new_val except InvenioBibSortWasherNotImplementedError, err: write_message("Washer %s is not implemented [%s]." \ %(washer, err), stream=sys.stderr) def locale_for_sorting(washer): """Identifies if any specific locale should be used, and it returns it""" if washer.find(":") > -1: lang = washer[washer.index(':')+1:] return websearch_templates.tmpl_localemap.get(lang, websearch_templates.tmpl_default_locale) return None def run_sorting_method(recids, method_name, method_id, definition, washer): """Does the actual sorting for the method_name for all the records in the database""" run_sorting_for_rnk = False if definition.startswith('RNK'): run_sorting_for_rnk = True field_data_dictionary = get_field_data(recids, method_name, definition) if not field_data_dictionary: write_message("POSSIBLE ERROR: The sorting method --%s-- has no data!" \ %method_name) return True apply_washer(field_data_dictionary, washer) #do we have any locale constraint? 
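A short illustration of the washer string convention handled by apply_washer() and locale_for_sorting() above, using a hypothetical washer name: the part before the colon selects the BibSortWasher transformation (or NOOP for none), the part after it names a language that is mapped to a locale for collation.

    washer = "sort_alphanumerically:fr"     # hypothetical bsrMETHOD.washer value
    washer_name = washer.split(':')[0]      # "sort_alphanumerically" -> BibSortWasher(washer_name)
    lang = washer[washer.index(':') + 1:]   # "fr", looked up in websearch_templates.tmpl_localemap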
sorting_locale = locale_for_sorting(washer) sorted_data_list, sorted_data_dict = \ sort_dict(field_data_dictionary, CFG_BIBSORT_WEIGHT_DISTANCE, run_sorting_for_rnk, sorting_locale) executed = write_to_methoddata_table(method_id, field_data_dictionary, \ sorted_data_dict, sorted_data_list) if not executed: return False if CFG_BIBSORT_BUCKETS > 1: bucket_dict, bucket_last_rec_dict = split_into_buckets(sorted_data_list, len(sorted_data_list)) for idx in bucket_dict: executed = write_to_buckets_table(method_id, idx, bucket_dict[idx], \ sorted_data_dict[bucket_last_rec_dict[idx]]) if not executed: return False else: executed = write_to_buckets_table(method_id, 1, intbitset(sorted_data_list), \ sorted_data_list[-1]) if not executed: return False return True def write_to_methoddata_table(id_method, data_dict, data_dict_ordered, data_list_sorted, update_timestamp=True): """Serialize the date and write it to the bsrMETHODDATA""" write_message('Starting serializing the data..', verbose=5) serialized_data_dict = serialize_via_marshal(data_dict) serialized_data_dict_ordered = serialize_via_marshal(data_dict_ordered) serialized_data_list_sorted = serialize_via_marshal(data_list_sorted) write_message('Serialization completed.', verbose=5) date = strftime("%Y-%m-%d %H:%M:%S", time.localtime()) if not update_timestamp: try: date = run_sql('SELECT last_updated from bsrMETHODDATA WHERE id_bsrMETHOD = %s', (id_method, ))[0][0] except IndexError: pass # keep the generated date write_message("Starting writing the data for method_id=%s " \ "to the database (table bsrMETHODDATA)" %id_method, verbose=5) try: write_message('Deleting old data..', verbose=5) run_sql("DELETE FROM bsrMETHODDATA WHERE id_bsrMETHOD = %s", (id_method, )) write_message('Inserting new data..', verbose=5) run_sql("INSERT into bsrMETHODDATA \ (id_bsrMETHOD, data_dict, data_dict_ordered, data_list_sorted, last_updated) \ - VALUES (%s, %s, %s, %s, %s)", \ + VALUES (%s, _binary %s, _binary %s, _binary %s, %s)", \ (id_method, serialized_data_dict, serialized_data_dict_ordered, \ serialized_data_list_sorted, date, )) except Error, err: write_message("The error [%s] occured when inserting new bibsort data "\ "into bsrMETHODATA table" %err, sys.stderr) return False write_message('Writing to the bsrMETHODDATA successfully completed.', \ verbose=5) return True def write_to_buckets_table(id_method, bucket_no, bucket_data, bucket_last_value, update_timestamp=True): """Serialize the date and write it to the bsrMEHODDATA_BUCKETS""" write_message('Writing the data for bucket number %s for ' \ 'method_id=%s to the database' \ %(bucket_no, id_method), verbose=5) write_message('Serializing data for bucket number %s' %bucket_no, verbose=5) serialized_bucket_data = bucket_data.fastdump() date = strftime("%Y-%m-%d %H:%M:%S", time.localtime()) if not update_timestamp: try: date = run_sql('SELECT last_updated from bsrMETHODDATABUCKET WHERE id_bsrMETHOD = %s and bucket_no = %s', \ (id_method, bucket_no))[0][0] except IndexError: pass # keep the generated date try: write_message('Deleting old data.', verbose=5) run_sql("DELETE FROM bsrMETHODDATABUCKET \ WHERE id_bsrMETHOD = %s AND bucket_no = %s", \ (id_method, bucket_no, )) write_message('Inserting new data.', verbose=5) run_sql("INSERT into bsrMETHODDATABUCKET \ (id_bsrMETHOD, bucket_no, bucket_data, bucket_last_value, last_updated) \ - VALUES (%s, %s, %s, %s, %s)", \ + VALUES (%s, %s, _binary %s, %s, %s)", \ (id_method, bucket_no, serialized_bucket_data, bucket_last_value, date, )) except Error, err: 
write_message("The error [%s] occured when inserting new bibsort data " \ "into bsrMETHODATA_BUCKETS table" %err, sys.stderr) return False write_message('Writing to bsrMETHODDATABUCKET for ' \ 'bucket number %s completed.' %bucket_no, verbose=5) return True def split_into_buckets(sorted_data_list, data_size): """The sorted_data_list is split into equal buckets. Returns a dictionary containing the buckets and a dictionary containing the last record in each bucket""" write_message("Starting splitting the data into %s buckets." \ %CFG_BIBSORT_BUCKETS, verbose=5) bucket_dict = {} bucket_last_rec_dict = {} step = data_size/CFG_BIBSORT_BUCKETS i = 0 for i in xrange(CFG_BIBSORT_BUCKETS - 1): bucket_dict[i+1] = intbitset(sorted_data_list[i*step:i*step+step]) bucket_last_rec_dict[i+1] = sorted_data_list[i*step+step-1] write_message("Bucket %s done." %(i+1), verbose=5) #last bucket contains all the remaining data bucket_dict[CFG_BIBSORT_BUCKETS] = intbitset(sorted_data_list[(i+1)*step:]) bucket_last_rec_dict[CFG_BIBSORT_BUCKETS] = sorted_data_list[-1] write_message("Bucket %s done." %CFG_BIBSORT_BUCKETS, verbose=5) write_message("Splitting completed.", verbose=5) return bucket_dict, bucket_last_rec_dict def sort_dict(dictionary, spacing=1, run_sorting_for_rnk=False, sorting_locale=None): """Sorting a dictionary. Returns a list of sorted recids and also a dictionary containing the recid: weight weight = index * spacing""" #10Mil records dictionary -> 36.9s write_message("Starting sorting the dictionary " \ "containing all the data..", verbose=5) sorted_records_dict_with_id = {} if sorting_locale: import locale orig_locale = locale.getlocale(locale.LC_ALL) try: locale.setlocale(locale.LC_ALL, sorting_locale) except locale.Error: try: locale.setlocale(locale.LC_ALL, sorting_locale + '.UTF8') except locale.Error: write_message("Setting locale to %s is not working.. 
ignoring locale") sorted_records_list = sorted(dictionary, key=dictionary.__getitem__, cmp=locale.strcoll, reverse=False) locale.setlocale(locale.LC_ALL, orig_locale) else: sorted_records_list = sorted(dictionary, key=dictionary.__getitem__, reverse=False) if run_sorting_for_rnk: #for ranking, we can keep the actual values associated with the recids return sorted_records_list, dictionary else: index = 1 for recid in sorted_records_list: sorted_records_dict_with_id[recid] = index * spacing index += 1 write_message("Dictionary sorted.", verbose=5) return sorted_records_list, sorted_records_dict_with_id def get_modified_or_inserted_recs(method_list): """Returns a list of recids that have been inserted or modified since the last update of the bibsort methods in method_list method_list should already contain a list of methods that SHOULD be updated, if it contains new methods, an error will be thrown""" if not method_list: #just to be on the safe side return 0 try: query = "SELECT min(d.last_updated) from bsrMETHODDATA d, bsrMETHOD m \ WHERE m.name in (%s) AND d.id_bsrMETHOD = m.id" % \ ("%s," * len(method_list))[:-1] last_updated = str(run_sql(query, tuple(method_list))[0][0]) except Error, err: write_message("Error when trying to get the last_updated date " \ "from bsrMETHODDATA: [%s]" %err, sys.stderr) return 0 recids = [] try: results = run_sql("SELECT id from bibrec \ where modification_date >= %s", (last_updated, )) if results: recids = [result[0] for result in results] except Error, err: write_message("Error when trying to get the list of " \ "modified records: [%s]" %err, sys.stderr) return 0 return recids def get_rnk_methods(bibsort_methods): """Returns the list of bibsort methods (names) that are RNK methods""" return [method for method in bibsort_methods if \ bibsort_methods[method]['definition'].startswith('RNK')] def get_modified_non_rnk_methods(non_rnk_method_list): """Returns 2 lists of non RNK methods: updated_ranking_methods = non RNK methods that need to be updated inserted_ranking_methods = non RNK methods, that have no data yet, so rebalancing should run on them""" updated_ranking_methods = [] inserted_ranking_methods = [] for method in non_rnk_method_list: try: dummy = str(run_sql('SELECT d.last_updated \ FROM bsrMETHODDATA d, bsrMETHOD m \ WHERE m.id = d.id_bsrMETHOD \ AND m.name=%s', (method, ))[0][0]) updated_ranking_methods.append(method) except IndexError: #method is not in bsrMETHODDATA -> is new inserted_ranking_methods.append(method) return updated_ranking_methods, inserted_ranking_methods def get_modified_rnk_methods(rnk_method_list, bibsort_methods): """Returns the list of RNK methods that have been recently modified, so they will need to have their bibsort data updated""" updated_ranking_methods = [] deleted_ranking_methods = [] for method in rnk_method_list: method_name = bibsort_methods[method]['definition'].replace('RNK:', '').strip() try: last_updated_rnk = str(run_sql('SELECT last_updated \ FROM rnkMETHOD \ WHERE name = %s', (method_name, ))[0][0]) except IndexError: write_message("The method %s could not be found in rnkMETHOD" \ %(method_name), stream=sys.stderr) #this method does not exist in rnkMETHOD, #it might have been a mistype or it might have been deleted deleted_ranking_methods.append(method) if method not in deleted_ranking_methods: try: last_updated_bsr = str(run_sql('SELECT d.last_updated \ FROM bsrMETHODDATA d, bsrMETHOD m \ WHERE m.id = d.id_bsrMETHOD \ AND m.name=%s', (method, ))[0][0]) if last_updated_rnk >= last_updated_bsr: # rnk data has 
been updated after bibsort ran updated_ranking_methods.append(method) else: write_message("The method %s has not been updated "\ "since the last run of bibsort." %method) except IndexError: write_message("The method %s could not be found in bsrMETHODDATA" \ %(method)) # that means that the bibsort never run on this method, so let's run it updated_ranking_methods.append(method) return updated_ranking_methods, deleted_ranking_methods def delete_bibsort_data_for_method(method_id): """This method will delete all data asociated with a method from bibsort tables (except bsrMETHOD). Returns False in case some error occured, True otherwise""" try: run_sql("DELETE FROM bsrMETHODDATA WHERE id_bsrMETHOD = %s", (method_id, )) run_sql("DELETE FROM bsrMETHODDATABUCKET WHERE id_bsrMETHOD = %s", (method_id, )) except: return False return True def delete_all_data_for_method(method_id): """This method will delete all data asociated with a method from bibsort tables. Returns False in case some error occured, True otherwise""" method_name = 'method name' try: run_sql("DELETE FROM bsrMETHODDATA WHERE id_bsrMETHOD = %s", (method_id, )) run_sql("DELETE FROM bsrMETHODDATABUCKET WHERE id_bsrMETHOD = %s", (method_id, )) run_sql("DELETE FROM bsrMETHODNAME WHERE id_bsrMETHOD = %s", (method_id, )) run_sql("DELETE FROM bsrMETHOD WHERE id = %s", (method_id, )) method_name = run_sql("SELECT name from bsrMETHOD WHERE id = %s", (method_id, ))[0][0] except Error: return False except IndexError: return True if method_name:# the method has not been deleted return False return True def add_sorting_method(method_name, method_definition, method_treatment): """This method will add a new sorting method in the database and update the config file""" try: run_sql("INSERT INTO bsrMETHOD(name, definition, washer) \ VALUES (%s, %s, %s)", (method_name, method_definition, method_treatment)) except Error: return False return True def update_bibsort_tables(recids, method, update_timestamp = True): """Updates the data structures for sorting method: method for the records in recids""" res = run_sql("SELECT id, definition, washer \ from bsrMETHOD where name = %s", (method, )) if res and res[0]: method_id = res[0][0] definition = res[0][1] washer = res[0][2] else: write_message('No sorting method called %s could be found ' \ 'in bsrMETHOD table.' %method, sys.stderr) return False res = run_sql("SELECT data_dict, data_dict_ordered, data_list_sorted \ FROM bsrMETHODDATA where id_bsrMETHOD = %s", (method_id, )) if res and res[0]: data_dict = deserialize_via_marshal(res[0][0]) data_dict_ordered = {} data_list_sorted = [] else: write_message('No data could be found for the sorting method %s.' \ %method) return False #since this case should have been treated earlier #get the values for the recids that need to be recalculated field_data = get_field_data(recids, method, definition) if not field_data: write_message("Possible error: the method %s has no data for records %s." 
\ %(method, str(recids))) else: apply_washer(field_data, washer) #if a recid is not in field_data that is because no value was found for it #so it should be marked for deletion recids_to_delete = list(recids.difference(intbitset(field_data.keys()))) recids_to_insert = [] recids_to_modify = {} for recid in field_data: if recid in data_dict: if data_dict[recid] != field_data[recid]: #we store the old value recids_to_modify[recid] = data_dict[recid] else: # recid is new, and needs to be inserted recids_to_insert.append(recid) #remove the recids that were not previously in bibsort recids_to_delete = [recid for recid in recids_to_delete if recid in data_dict] #dicts to keep the ordered values for the recids - useful bor bucket insertion recids_current_ordered = {} recids_old_ordered = {} if recids_to_insert or recids_to_modify or recids_to_delete: data_dict_ordered = deserialize_via_marshal(res[0][1]) data_list_sorted = deserialize_via_marshal(res[0][2]) if recids_to_modify: write_message("%s records have been modified." \ %len(recids_to_modify), verbose=5) for recid in recids_to_modify: recids_old_ordered[recid] = data_dict_ordered[recid] perform_modify_record(data_dict, data_dict_ordered, \ data_list_sorted, field_data[recid], recid) if recids_to_insert: write_message("%s records have been inserted." \ %len(recids_to_insert), verbose=5) for recid in recids_to_insert: perform_insert_record(data_dict, data_dict_ordered, \ data_list_sorted, field_data[recid], recid) if recids_to_delete: write_message("%s records have been deleted." \ %len(recids_to_delete), verbose=5) for recid in recids_to_delete: perform_delete_record(data_dict, data_dict_ordered, data_list_sorted, recid) for recid in recids_to_modify: recids_current_ordered[recid] = data_dict_ordered[recid] for recid in recids_to_insert: recids_current_ordered[recid] = data_dict_ordered[recid] #write the modifications to db executed = write_to_methoddata_table(method_id, data_dict, \ data_dict_ordered, data_list_sorted, update_timestamp) if not executed: return False #update buckets try: perform_update_buckets(recids_current_ordered, recids_to_insert, recids_old_ordered, method_id, update_timestamp) except Error, err: write_message("[%s] The bucket data for method %s has not been updated" \ %(method, err), sys.stderr) return False return True def perform_update_buckets(recids_current_ordered, recids_to_insert, recids_old_ordered, method_id, update_timestamp = True): """Updates the buckets""" bucket_insert = {} bucket_delete = {} write_message("Updating the buckets for method_id = %s" %method_id, verbose=5) buckets = run_sql("SELECT bucket_no, bucket_last_value \ FROM bsrMETHODDATABUCKET \ WHERE id_bsrMETHOD = %s", (method_id, )) if not buckets: write_message("No bucket data found for method_id %s." 
\ %method_id, sys.stderr) raise Exception #sort the buckets to be sure we are iterating them in order(1 to max): buckets_dict = dict(buckets) for recid in recids_to_insert: for bucket_no in buckets_dict: if recids_current_ordered[recid] <= buckets_dict[bucket_no]: bucket_insert.setdefault(bucket_no, []).append(recid) break for recid in recids_old_ordered: record_inserted = 0 record_deleted = 0 for bucket_no in buckets_dict: bucket_value = int(buckets_dict[bucket_no]) if record_inserted and record_deleted: #both insertion and deletion have been registered break if recids_current_ordered[recid] <= bucket_value and \ recids_old_ordered[recid] <= bucket_value and \ not record_inserted and \ not record_deleted: #both before and after the modif, #recid should be in the same bucket -> nothing to do break if recids_current_ordered[recid] <= bucket_value and not record_inserted: #recid should be, after the modif, here, so insert bucket_insert.setdefault(bucket_no, []).append(recid) record_inserted = 1 if recids_old_ordered[recid] <= bucket_value and not record_deleted: #recid was here before modif, must be removed bucket_delete.setdefault(bucket_no, []).append(recid) record_deleted = 1 for bucket_no in buckets_dict: if (bucket_no in bucket_insert) or (bucket_no in bucket_delete): res = run_sql("SELECT bucket_data FROM bsrMETHODDATABUCKET \ where id_bsrMETHOD = %s AND bucket_no = %s", \ (method_id, bucket_no, )) bucket_data = intbitset(res[0][0]) for recid in bucket_insert.get(bucket_no, []): bucket_data.add(recid) for recid in bucket_delete.get(bucket_no, []): if recid in bucket_data: bucket_data.remove(recid) if update_timestamp: date = strftime("%Y-%m-%d %H:%M:%S", time.localtime()) run_sql("UPDATE bsrMETHODDATABUCKET \ - SET bucket_data = %s, last_updated = %s \ + SET bucket_data = _binary %s, last_updated = %s \ WHERE id_bsrMETHOD = %s AND bucket_no = %s", \ (bucket_data.fastdump(), date, method_id, bucket_no, )) else: run_sql("UPDATE bsrMETHODDATABUCKET \ - SET bucket_data = %s \ + SET bucket_data = _binary %s \ WHERE id_bsrMETHOD = %s AND bucket_no = %s", \ (bucket_data.fastdump(), method_id, bucket_no, )) write_message("Updating bucket %s for method %s." 
%(bucket_no, method_id), verbose=5) def perform_modify_record(data_dict, data_dict_ordered, data_list_sorted, value, recid, spacing=CFG_BIBSORT_WEIGHT_DISTANCE): """Modifies all the data structures with the new information about the record""" #remove the recid from the old position, to make place for the new value data_list_sorted.remove(recid) # from now on, it is the same thing as insert return perform_insert_record(data_dict, data_dict_ordered, data_list_sorted, value, recid, spacing) def perform_insert_record(data_dict, data_dict_ordered, data_list_sorted, value, recid, spacing=CFG_BIBSORT_WEIGHT_DISTANCE): """Inserts a new record into all the data structures""" #data_dict data_dict[recid] = value #data_dict_ordered & data_list_sorted #calculate at which index the rec should be inserted in data_list_sorted index_for_insert = binary_search(data_list_sorted, value, data_dict) #we have to calculate the weight of this record in data_dict_ordered #and it will be the med between its neighbours in the data_list_sorted if index_for_insert == len(data_list_sorted):#insert at the end of the list #append at the end of the list data_list_sorted.append(recid) #weight = highest weight + the distance data_dict_ordered[recid] = data_dict_ordered[data_list_sorted[index_for_insert - 1]] + spacing else: if index_for_insert == 0: #insert at the begining of the list left_neighbor_weight = 0 else: left_neighbor_weight = data_dict_ordered[data_list_sorted[index_for_insert - 1]] right_neighbor_weight = data_dict_ordered[data_list_sorted[index_for_insert]] #the recid's weight will be the med between left and right weight = (right_neighbor_weight - left_neighbor_weight)/2 if weight < 1: #there is no more space to insert, we have to create some space data_list_sorted.insert(index_for_insert, recid) data_dict_ordered[recid] = left_neighbor_weight + spacing create_space_for_new_weight(index_for_insert, data_dict_ordered, data_list_sorted, spacing) else: data_list_sorted.insert(index_for_insert, recid) data_dict_ordered[recid] = left_neighbor_weight + weight write_message("Record %s done." %recid, verbose=5) return index_for_insert def perform_delete_record(data_dict, data_dict_ordered, data_list_sorted, recid): """Delete a record from all the data structures""" #data_dict del data_dict[recid] #data_list_sorted data_list_sorted.remove(recid) #data_dict_ordered del data_dict_ordered[recid] write_message("Record %s done." %recid, verbose=5) return 1 def create_space_for_new_weight(index_for_insert, data_dict_ordered, data_list_sorted, spacing): """In order to keep an order of the records in data_dict_ordered, when a new weight is inserted, there needs to be some place for it (ex: recid3 needs to be inserted between recid1-with weight=10 and recid2-with weight=11) The scope of this function is to increease the distance between recid1 and recid2 (and thus all the weights after recid2) so that recid3 will have an integer weight""" for i in range(index_for_insert+1, len(data_list_sorted)): data_dict_ordered[data_list_sorted[i]] += 2 * spacing def binary_search(sorted_list, value, data_dict): """Binary Search O(log n)""" minimum = -1 maximum = len(sorted_list) while maximum - minimum > 1: med = (maximum+minimum)/2 recid1 = sorted_list[med] value1 = data_dict[recid1] if value1 > value: maximum = med elif value1 < value: minimum = med else: return med return minimum + 1 def run_bibsort_update(recids=None, method_list=None): """Updates bibsort tables for the methods in method_list and for the records in recids. 
If recids is None: recids = all records that have been modified or inserted since last update If method_list is None: method_list = all the methods available in bsrMETHOD table""" write_message('Initial data for run_bibsort_update method: ' \ 'number of recids = %s; method_list=%s' \ %(str(len(recids)), method_list), verbose=5) write_message('Updating sorting data.') bibsort_methods, errors = get_bibsort_methods_details(method_list) if errors: return False method_list = bibsort_methods.keys() if not method_list: write_message('No methods found in bsrMETHOD table.. exiting.') return True #we could have 4 types of methods: #(i) RNK methods -> they should be rebalanced, not updated #(ii) RNK methods to delete -> we should delete their data #(iii) non RNK methods to update #(iv) non RNK methods that are new -> they should be rebalanced(sorted), not updated #check which of the methods are RNK methods (they do not need modified recids) rnk_methods = get_rnk_methods(bibsort_methods) rnk_methods_updated, rnk_methods_deleted = get_modified_rnk_methods(rnk_methods, bibsort_methods) #check which of the methods have no data, so they are actually new, #so they need balancing(sorting) instead of updating non_rnk_methods = [method for method in bibsort_methods.keys() if method not in rnk_methods] non_rnk_methods_updated, non_rnk_methods_inserted = get_modified_non_rnk_methods(non_rnk_methods) #(i) + (iv) methods_to_balance = rnk_methods_updated + non_rnk_methods_inserted if methods_to_balance: # several methods require rebalancing(sorting) and not updating return run_bibsort_rebalance(methods_to_balance) #(ii) #remove the data for the ranking methods that have been deleted for method in rnk_methods_deleted: task_sleep_now_if_required(can_stop_too=True) task_update_progress("Deleting data for method %s" %method) write_message('Starting deleting the data for RNK method %s' %method, verbose=5) executed_ok = delete_bibsort_data_for_method(bibsort_methods[method]['id']) if not executed_ok: write_message('Method %s could not be deleted correctly, aborting..' \ %method, sys.stderr) return False #(iii) #methods to actually update if non_rnk_methods_updated: # we want to update some 'normal'(not RNK) tables, so we need recids update_timestamp = False if not recids: recids = get_modified_or_inserted_recs(non_rnk_methods_updated) if recids == 0: #error signal return False if not recids: write_message("No records inserted or modified in bibrec table " \ "since the last update of bsrMETHODDATA.") return True write_message("These records have been recently modified/inserted: %s" \ %str(recids), verbose=5) update_timestamp = True recids_i = intbitset(recids) for method in non_rnk_methods_updated: task_sleep_now_if_required(can_stop_too=True) task_update_progress("Updating method %s" %method) write_message('Starting updating method %s' %method, verbose=5) executed_ok = update_bibsort_tables(recids_i, method, update_timestamp) if not executed_ok: write_message('Method %s could not be executed correctly, aborting..' \ %method, sys.stderr) return False return True def run_bibsort_rebalance(method_list = None): """Rebalances all buckets for the methods in method_list""" bibsort_methods, errors = get_bibsort_methods_details(method_list) if errors: return False if not bibsort_methods: write_message('No methods found.. 
exiting rebalancing.') return True #check if there are only ranking methods -> no need for recids rnk_methods = get_rnk_methods(bibsort_methods) non_rnk_method = [method for method in bibsort_methods.keys() if method not in rnk_methods] write_message('Running rebalancing for methods: %s' %bibsort_methods.keys()) if non_rnk_method:# we have also 'normal' (no RNK) methods, so we need the recids recids = get_all_recids(including_deleted=False) write_message('Rebalancing will run for %s records.' \ %str(len(recids)), verbose=5) task_sleep_now_if_required(can_stop_too=True) else: recids = intbitset([]) write_message('Rebalancing will run only for RNK methods') for name in bibsort_methods: task_update_progress('Rebalancing %s method.' %name) write_message('Starting sorting the data for %s method ... ' \ %name.upper()) executed_ok = run_sorting_method(recids, name, bibsort_methods[name]['id'], bibsort_methods[name]['definition'], bibsort_methods[name]['washer']) if not executed_ok: write_message('Method %s could not be executed correctly.' \ %name, sys.stderr) return False write_message('Done.') task_sleep_now_if_required(can_stop_too=True) task_update_progress('Rebalancing done.') return True def main(): """tests""" #print "Running bibsort_rebalance...." #run_bibsort_rebalance() #rebalances everything #print "Running bibsort_rebalance for title and author...." #run_bibsort_rebalance(['title', 'author']) #rebalances only these methods #print "Running bibsort_update...." #run_bibsort_update() #update all the methods #print "Running bibsort_update for title and author...." #run_bibsort_update(method_list = ['title', 'author']) #print "Running bibsort_update for records 1,2,3, title author...." #run_bibsort_update(recids = [1, 2, 3], method_list = ['title', 'author']) if __name__ == '__main__': main() diff --git a/modules/bibupload/lib/bibupload.py b/modules/bibupload/lib/bibupload.py index 5a056b147..07e11cf68 100644 --- a/modules/bibupload/lib/bibupload.py +++ b/modules/bibupload/lib/bibupload.py @@ -1,3003 +1,3003 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 CERN. +# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ BibUpload: Receive MARC XML file and update the appropriate database tables according to options. 
""" __revision__ = "$Id$" import os import re import sys import time from datetime import datetime from zlib import compress import socket import marshal import copy import tempfile import urlparse import urllib2 import urllib from invenio.config import CFG_OAI_ID_FIELD, \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG, \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG, \ CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG, \ CFG_BIBUPLOAD_STRONG_TAGS, \ CFG_BIBUPLOAD_CONTROLLED_PROVENANCE_TAGS, \ CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE, \ CFG_BIBUPLOAD_DELETE_FORMATS, \ CFG_SITE_URL, \ CFG_SITE_SECURE_URL, \ CFG_SITE_RECORD, \ CFG_OAI_PROVENANCE_ALTERED_SUBFIELD, \ CFG_BIBUPLOAD_DISABLE_RECORD_REVISIONS, \ CFG_BIBUPLOAD_CONFLICTING_REVISION_TICKET_QUEUE, \ CFG_CERN_SITE, \ CFG_BIBUPLOAD_MATCH_DELETED_RECORDS from invenio.jsonutils import json, CFG_JSON_AVAILABLE from invenio.bibupload_config import CFG_BIBUPLOAD_CONTROLFIELD_TAGS, \ CFG_BIBUPLOAD_SPECIAL_TAGS, \ CFG_BIBUPLOAD_DELETE_CODE, \ CFG_BIBUPLOAD_DELETE_VALUE, \ CFG_BIBUPLOAD_OPT_MODES from invenio.dbquery import run_sql from invenio.bibrecord import create_records, \ record_add_field, \ record_delete_field, \ record_xml_output, \ record_get_field_instances, \ record_get_field_value, \ record_get_field_values, \ field_get_subfield_values, \ field_get_subfield_instances, \ record_modify_subfield, \ record_delete_subfield_from, \ record_delete_fields, \ record_add_subfield_into, \ record_find_field, \ record_extract_oai_id, \ record_extract_dois, \ record_has_field, \ records_identical, \ record_drop_duplicate_fields from invenio.search_engine import get_record, record_exists, search_pattern from invenio.dateutils import convert_datestruct_to_datetext from invenio.errorlib import register_exception from invenio.bibcatalog import BIBCATALOG_SYSTEM from invenio.intbitset import intbitset from invenio.urlutils import make_user_agent_string from invenio.config import CFG_BIBDOCFILE_FILEDIR from invenio.bibtask import task_init, write_message, \ task_set_option, task_get_option, task_get_task_param, \ task_update_progress, task_sleep_now_if_required, fix_argv_paths, \ RecoverableError from invenio.bibdocfile import BibRecDocs, file_strip_ext, normalize_format, \ get_docname_from_url, check_valid_url, download_url, \ KEEP_OLD_VALUE, decompose_bibdocfile_url, InvenioBibDocFileError, \ bibdocfile_url_p, CFG_BIBDOCFILE_AVAILABLE_FLAGS, guess_format_from_url, \ BibRelation, MoreInfo from invenio.search_engine import search_pattern from invenio.bibupload_revisionverifier import RevisionVerifier, \ InvenioBibUploadConflictingRevisionsError, \ InvenioBibUploadInvalidRevisionError, \ InvenioBibUploadMissing005Error, \ InvenioBibUploadUnchangedRecordError #Statistic variables stat = {} stat['nb_records_to_upload'] = 0 stat['nb_records_updated'] = 0 stat['nb_records_inserted'] = 0 stat['nb_errors'] = 0 stat['nb_holdingpen'] = 0 stat['exectime'] = time.localtime() _WRITING_RIGHTS = None CFG_BIBUPLOAD_ALLOWED_SPECIAL_TREATMENTS = ('oracle', ) CFG_HAS_BIBCATALOG = "UNKNOWN" def check_bibcatalog(): """ Return True if bibcatalog is available. 
""" global CFG_HAS_BIBCATALOG # pylint: disable=W0603 if CFG_HAS_BIBCATALOG != "UNKNOWN": return CFG_HAS_BIBCATALOG CFG_HAS_BIBCATALOG = True if BIBCATALOG_SYSTEM is not None: bibcatalog_response = BIBCATALOG_SYSTEM.check_system() else: bibcatalog_response = "No ticket system configured" if bibcatalog_response != "": write_message("BibCatalog error: %s\n" % (bibcatalog_response,)) CFG_HAS_BIBCATALOG = False return CFG_HAS_BIBCATALOG # Let's set a reasonable timeout for URL request (e.g. FFT) socket.setdefaulttimeout(40) def parse_identifier(identifier): """Parse the identifier and determine if it is temporary or fixed""" id_str = str(identifier) if not id_str.startswith("TMP:"): return (False, identifier) else: return (True, id_str[4:]) def resolve_identifier(tmps, identifier): """Resolves an identifier. If the identifier is not temporary, this function is an identity on the second argument. Otherwise, a resolved value is returned or an exception raised""" is_tmp, tmp_id = parse_identifier(identifier) if is_tmp: if not tmp_id in tmps: raise StandardError("Temporary identifier %s not present in the dictionary" % (tmp_id, )) if tmps[tmp_id] == -1: # the identifier has been signalised but never assigned a value - probably error during processing raise StandardError("Temporary identifier %s has been declared, but never assigned a value. Probably an error during processign of an appropriate FFT has happened. Please see the log" % (tmp_id, )) return int(tmps[tmp_id]) else: return int(identifier) _re_find_001 = re.compile('\\s*(\\d*)\\s*', re.S) def bibupload_pending_recids(): """This function embed a bit of A.I. and is more a hack than an elegant algorithm. It should be updated in case bibupload/bibsched are modified in incompatible ways. This function return the intbitset of all the records that are being (or are scheduled to be) touched by other bibuploads. """ options = run_sql("""SELECT arguments FROM schTASK WHERE status<>'DONE' AND proc='bibupload' AND (status='RUNNING' OR status='CONTINUING' OR status='WAITING' OR status='SCHEDULED' OR status='ABOUT TO STOP' OR status='ABOUT TO SLEEP')""") ret = intbitset() xmls = [] if options: for arguments in options: arguments = marshal.loads(arguments[0]) for argument in arguments[1:]: if argument.startswith('/'): # XMLs files are recognizable because they're absolute # files... xmls.append(argument) for xmlfile in xmls: # Let's grep for the 001 try: xml = open(xmlfile).read() ret += [int(group[1]) for group in _re_find_001.findall(xml)] except: continue return ret ### bibupload engine functions: def bibupload(record, opt_mode=None, opt_notimechange=0, oai_rec_id="", pretend=False, tmp_ids=None, tmp_vers=None): """Main function: process a record and fit it in the tables bibfmt, bibrec, bibrec_bibxxx, bibxxx with proper record metadata. Return (error_code, recID) of the processed record. """ if tmp_ids is None: tmp_ids = {} if tmp_vers is None: tmp_vers = {} if opt_mode == 'reference': ## NOTE: reference mode has been deprecated in favour of 'correct' opt_mode = 'correct' assert(opt_mode in CFG_BIBUPLOAD_OPT_MODES) try: record_xml_output(record).decode('utf-8') except UnicodeDecodeError: msg = " Failed: Invalid utf-8 characters." 
write_message(msg, verbose=1, stream=sys.stderr) return (1, -1, msg) error = None affected_tags = {} original_record = {} rec_old = {} record_modification_date = datetime.now() # will hold record creation/modification date record_had_altered_bit = False is_opt_mode_delete = False # Extraction of the Record Id from 001, SYSNO or OAIID or DOI tags: rec_id = retrieve_rec_id(record, opt_mode, pretend=pretend) if rec_id == -1: msg = " Failed: either the record already exists and insert was " \ "requested or the record does not exists and " \ "replace/correct/append has been used" write_message(msg, verbose=1, stream=sys.stderr) return (1, -1, msg) elif rec_id > 0: write_message(" -Retrieve record ID (found %s): DONE." % rec_id, verbose=2) (unique_p, msg) = check_record_doi_is_unique(rec_id, record) if not unique_p: write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) if not record.has_key('001'): # Found record ID by means of SYSNO or OAIID or DOI, and the # input MARCXML buffer does not have this 001 tag, so we # should add it now: error = record_add_field(record, '001', controlfield_value=rec_id) if error is None: msg = " Failed: Error during adding the 001 controlfield " \ "to the record" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) else: error = None write_message(" -Added tag 001: DONE.", verbose=2) write_message(" -Check if the xml marc file is already in the database: DONE" , verbose=2) record_deleted_p = False record_creation_date = None if opt_mode == 'insert' or \ (opt_mode == 'replace_or_insert') and rec_id is None: insert_mode_p = True # Insert the record into the bibrec databases to have a recordId rec_id = create_new_record(pretend=pretend) write_message(" -Creation of a new record id (%d): DONE" % rec_id, verbose=2) # we add the record Id control field to the record error = record_add_field(record, '001', controlfield_value=rec_id) if error is None: msg = " Failed: Error during adding the 001 controlfield " \ "to the record" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) else: error = None if '005' not in record: error = record_add_field(record, '005', controlfield_value=record_modification_date.strftime("%Y%m%d%H%M%S.0")) if error is None: msg = " ERROR: during adding to 005 controlfield to record" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) else: error = None else: write_message(" Note: 005 already existing upon inserting of new record. Keeping it.", verbose=2) record_creation_date = time.strftime("%Y-%m-%d %H:%M:%S", time.strptime(record['005'][0][3].split('.')[0], "%Y%m%d%H%M%S")) elif opt_mode != 'insert': insert_mode_p = False # Update Mode # Retrieve the old record to update rec_old = get_record(rec_id) record_had_altered_bit = record_get_field_values(rec_old, CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4], CFG_OAI_PROVENANCE_ALTERED_SUBFIELD) # Also save a copy to restore previous situation in case of errors original_record = get_record(rec_id) if rec_old is None: msg = " Failed during the creation of the old record!" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) else: write_message(" -Retrieve the old record to update: DONE", verbose=2) # flag to check whether the revisions have been verified and patch generated. 
# If revision verification failed, then we need to manually identify the affected tags # and process them revision_verified = False rev_verifier = RevisionVerifier() #check for revision conflicts before updating record if record_has_field(record, '005') and not CFG_BIBUPLOAD_DISABLE_RECORD_REVISIONS: write_message(" -Upload Record has 005. Verifying Revision", verbose=2) try: rev_res = rev_verifier.verify_revision(record, original_record, opt_mode) if rev_res: opt_mode = rev_res[0] record = rev_res[1] affected_tags = rev_res[2] revision_verified = True write_message(lambda: " -Patch record generated. Changing opt_mode to correct.\nPatch:\n%s " % record_xml_output(record), verbose=2) else: write_message(" -No Patch Record.", verbose=2) except InvenioBibUploadUnchangedRecordError, err: msg = " -ISSUE: %s" % err write_message(msg, verbose=1, stream=sys.stderr) write_message(msg, " Continuing anyway in case there are FFT or other tags") except InvenioBibUploadConflictingRevisionsError, err: msg = " -ERROR: Conflicting Revisions - %s" % err write_message(msg, verbose=1, stream=sys.stderr) submit_ticket_for_holding_pen(rec_id, err, "Conflicting Revisions. Inserting record into holding pen.", pretend=pretend) insert_record_into_holding_pen(record, str(rec_id), pretend=pretend) return (2, int(rec_id), msg) except InvenioBibUploadInvalidRevisionError, err: msg = " -ERROR: Invalid Revision - %s" % err write_message(msg) submit_ticket_for_holding_pen(rec_id, err, "Invalid Revisions. Inserting record into holding pen.", pretend=pretend) insert_record_into_holding_pen(record, str(rec_id), pretend=pretend) return (2, int(rec_id), msg) except InvenioBibUploadMissing005Error, err: msg = " -ERROR: Missing 005 - %s" % err write_message(msg) submit_ticket_for_holding_pen(rec_id, err, "Missing 005. Inserting record into holding pen.", pretend=pretend) insert_record_into_holding_pen(record, str(rec_id), pretend=pretend) return (2, int(rec_id), msg) else: write_message(" - No 005 Tag Present. 
Resuming normal flow.", verbose=2) # dictionaries to temporarily hold original recs tag-fields existing_tags = {} retained_tags = {} # in case of delete operation affected tags should be deleted in delete_bibrec_bibxxx # but should not be updated again in STAGE 4 # utilising the below flag is_opt_mode_delete = False if not revision_verified: # either 005 was not present or opt_mode was not correct/replace # in this case we still need to find out affected tags to process write_message(" - Missing 005 or opt_mode!=Replace/Correct.Revision Verifier not called.", verbose=2) # Identify affected tags if opt_mode == 'correct' or opt_mode == 'replace' or opt_mode == 'replace_or_insert': rec_diff = rev_verifier.compare_records(record, original_record, opt_mode) affected_tags = rev_verifier.retrieve_affected_tags_with_ind(rec_diff) elif opt_mode == 'delete': # populate an intermediate dictionary # used in upcoming step related to 'delete' mode is_opt_mode_delete = True for tag, fields in original_record.iteritems(): existing_tags[tag] = [tag + (field[1] != ' ' and field[1] or '_') + (field[2] != ' ' and field[2] or '_') for field in fields] elif opt_mode == 'append': for tag, fields in record.iteritems(): if tag not in CFG_BIBUPLOAD_CONTROLFIELD_TAGS: affected_tags[tag] = [(field[1], field[2]) for field in fields] # In Replace mode, take over old strong tags if applicable: if opt_mode == 'replace' or \ opt_mode == 'replace_or_insert': copy_strong_tags_from_old_record(record, rec_old) # Delete tags to correct in the record if opt_mode == 'correct': delete_tags_to_correct(record, rec_old) write_message(" -Delete the old tags to correct in the old record: DONE", verbose=2) # Delete tags specified if in delete mode if opt_mode == 'delete': record = delete_tags(record, rec_old) for tag, fields in record.iteritems(): retained_tags[tag] = [tag + (field[1] != ' ' and field[1] or '_') + (field[2] != ' ' and field[2] or '_') for field in fields] #identify the tags that have been deleted for tag in existing_tags.keys(): if tag not in retained_tags: for item in existing_tags[tag]: tag_to_add = item[0:3] ind1, ind2 = item[3], item[4] if tag_to_add in affected_tags and (ind1, ind2) not in affected_tags[tag_to_add]: affected_tags[tag_to_add].append((ind1, ind2)) else: affected_tags[tag_to_add] = [(ind1, ind2)] else: deleted = list(set(existing_tags[tag]) - set(retained_tags[tag])) for item in deleted: tag_to_add = item[0:3] ind1, ind2 = item[3], item[4] if tag_to_add in affected_tags and (ind1, ind2) not in affected_tags[tag_to_add]: affected_tags[tag_to_add].append((ind1, ind2)) else: affected_tags[tag_to_add] = [(ind1, ind2)] write_message(" -Delete specified tags in the old record: DONE", verbose=2) # Append new tag to the old record and update the new record with the old_record modified if opt_mode == 'append' or opt_mode == 'correct': record = append_new_tag_to_old_record(record, rec_old) write_message(" -Append new tags to the old record: DONE", verbose=2) write_message(" -Affected Tags found after comparing upload and original records: %s"%(str(affected_tags)), verbose=2) # 005 tag should be added everytime the record is modified # If an exiting record is modified, its 005 tag should be overwritten with a new revision value if record.has_key('005'): record_delete_field(record, '005') write_message(" Deleted the existing 005 tag.", verbose=2) last_revision = run_sql("SELECT MAX(job_date) FROM hstRECORD WHERE id_bibrec=%s", (rec_id, ))[0][0] if last_revision and last_revision.strftime("%Y%m%d%H%M%S.0") == 
record_modification_date.strftime("%Y%m%d%H%M%S.0"): ## We are updating the same record within the same seconds! It's less than ## the minimal granularity. Let's pause for 1 more second to take a breath :-) time.sleep(1) record_modification_date = datetime.now() error = record_add_field(record, '005', controlfield_value=record_modification_date.strftime("%Y%m%d%H%M%S.0")) if error is None: write_message(" Failed: Error during adding to 005 controlfield to record", verbose=1, stream=sys.stderr) return (1, int(rec_id)) else: error=None write_message(lambda: " -Added tag 005: DONE. " + str(record_get_field_value(record, '005', '', '')), verbose=2) # adding 005 to affected tags will delete the existing 005 entry # and update with the latest timestamp. if '005' not in affected_tags: affected_tags['005'] = [(' ', ' ')] write_message(" -Stage COMPLETED", verbose=2) record_deleted_p = False try: if not record_is_valid(record): msg = "ERROR: record is not valid" write_message(msg, verbose=1, stream=sys.stderr) return (1, -1, msg) # Have a look if we have FFT tags write_message("Stage 2: Start (Process FFT tags if exist).", verbose=2) record_had_FFT = False bibrecdocs = None if extract_tag_from_record(record, 'FFT') is not None: record_had_FFT = True if not writing_rights_p(): msg = "ERROR: no rights to write fulltext files" write_message(" Stage 2 failed: %s" % msg, verbose=1, stream=sys.stderr) raise StandardError(msg) try: bibrecdocs = BibRecDocs(rec_id) record = elaborate_fft_tags(record, rec_id, opt_mode, pretend=pretend, tmp_ids=tmp_ids, tmp_vers=tmp_vers, bibrecdocs=bibrecdocs) except Exception, e: register_exception() msg = " Stage 2 failed: ERROR: while elaborating FFT tags: %s" % e write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) if record is None: msg = " Stage 2 failed: ERROR: while elaborating FFT tags" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) write_message(" -Stage COMPLETED", verbose=2) else: write_message(" -Stage NOT NEEDED", verbose=2) # Have a look if we have FFT tags write_message("Stage 2B: Start (Synchronize 8564 tags).", verbose=2) if record_had_FFT or extract_tag_from_record(record, '856') is not None: try: if bibrecdocs is None: bibrecdocs = BibRecDocs(rec_id) record = synchronize_8564(rec_id, record, record_had_FFT, bibrecdocs, pretend=pretend) # in case if FFT is in affected list make appropriate changes if not insert_mode_p: # because for insert, all tags are affected if ('4', ' ') not in affected_tags.get('856', []): if '856' not in affected_tags: affected_tags['856'] = [('4', ' ')] elif ('4', ' ') not in affected_tags['856']: affected_tags['856'].append(('4', ' ')) write_message(" -Modified field list updated with FFT details: %s" % str(affected_tags), verbose=2) except Exception, e: register_exception(alert_admin=True) msg = " Stage 2B failed: ERROR: while synchronizing 8564 tags: %s" % e write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) if record is None: msg = " Stage 2B failed: ERROR: while synchronizing 8564 tags" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) write_message(" -Stage COMPLETED", verbose=2) else: write_message(" -Stage NOT NEEDED", verbose=2) write_message("Stage 3: Start (Apply fields deletion requests).", verbose=2) write_message(lambda: " Record before deletion:\n%s" % record_xml_output(record), verbose=9) # remove fields with __DELETE_FIELDS__ # NOTE:creating a temporary deep copy of record for iteration to avoid 
RunTimeError # RuntimeError due to change in dictionary size during iteration tmp_rec = copy.deepcopy(record) for tag in tmp_rec: for data_tuple in record[tag]: if (CFG_BIBUPLOAD_DELETE_CODE, CFG_BIBUPLOAD_DELETE_VALUE) in data_tuple[0]: # delete the tag with particular indicator pairs from original record record_delete_field(record, tag, data_tuple[1], data_tuple[2]) write_message(lambda: " Record after cleaning up fields to be deleted:\n%s" % record_xml_output(record), verbose=9) if opt_mode == 'append': write_message("Stage 3b: Drop duplicate fields in append mode.", verbose=2) record = record_drop_duplicate_fields(record) write_message(lambda: " Record after dropping duplicate fields:\n%s" % record_xml_output(record), verbose=9) # Update of the BibFmt write_message("Stage 4: Start (Update bibfmt).", verbose=2) updates_exist = not records_identical(record, original_record) if updates_exist: # if record_had_altered_bit, this must be set to true, since the # record has been altered. if record_had_altered_bit: oai_provenance_fields = record_get_field_instances(record, CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4]) for oai_provenance_field in oai_provenance_fields: for i, (code, dummy_value) in enumerate(oai_provenance_field[0]): if code == CFG_OAI_PROVENANCE_ALTERED_SUBFIELD: oai_provenance_field[0][i] = (code, 'true') tmp_indicators = (CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3] == '_' and ' ' or CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4] == '_' and ' ' or CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4]) if tmp_indicators not in affected_tags.get(CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3], []): if CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3] not in affected_tags: affected_tags[CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3]] = [tmp_indicators] else: affected_tags[CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3]].append(tmp_indicators) write_message(lambda: " Updates exists:\n%s\n!=\n%s" % (record, original_record), verbose=9) # format the single record as xml rec_xml_new = record_xml_output(record) # Update bibfmt with the format xm of this record modification_date = time.strftime('%Y-%m-%d %H:%M:%S', time.strptime(record_get_field_value(record, '005'), '%Y%m%d%H%M%S.0')) error = update_bibfmt_format(rec_id, rec_xml_new, 'xm', modification_date, pretend=pretend) if error == 1: msg = " Failed: ERROR: during update_bibfmt_format 'xm'" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) if CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE: error = update_bibfmt_format(rec_id, marshal.dumps(record), 'recstruct', modification_date, pretend=pretend) if error == 1: msg = " Failed: ERROR: during update_bibfmt_format 'recstruct'" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) if not CFG_BIBUPLOAD_DISABLE_RECORD_REVISIONS: # archive MARCXML format of this record for version history purposes: if insert_mode_p: error = archive_marcxml_for_history(rec_id, affected_fields={}, pretend=pretend) else: error = archive_marcxml_for_history(rec_id, affected_fields=affected_tags, pretend=pretend) if error == 1: msg = " ERROR: Failed to archive MARCXML for history" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) else: write_message(" -Archived MARCXML for history: DONE", verbose=2) # delete some formats like HB upon record change: if updates_exist or record_had_FFT: for format_to_delete in CFG_BIBUPLOAD_DELETE_FORMATS: try: delete_bibfmt_format(rec_id, format_to_delete, pretend=pretend) except: # 
OK, some formats like HB could not have been deleted, no big deal pass write_message(" -Stage COMPLETED", verbose=2) ## Let's assert that one and only one 005 tag is existing at this stage. assert len(record['005']) == 1 # Update the database MetaData write_message("Stage 5: Start (Update the database with the metadata).", verbose=2) if insert_mode_p: update_database_with_metadata(record, rec_id, oai_rec_id, pretend=pretend) write_message(" -Stage COMPLETED", verbose=2) elif opt_mode in ('replace', 'replace_or_insert', 'append', 'correct', 'delete') and updates_exist: # now we clear all the rows from bibrec_bibxxx from the old record_deleted_p = True delete_bibrec_bibxxx(rec_old, rec_id, affected_tags, pretend=pretend) # metadata update will insert tags that are available in affected_tags. # but for delete, once the tags have been deleted from bibrec_bibxxx, they dont have to be inserted # except for 005. if is_opt_mode_delete: tmp_affected_tags = copy.deepcopy(affected_tags) for tag in tmp_affected_tags: if tag != '005': affected_tags.pop(tag) write_message(" -Clean bibrec_bibxxx: DONE", verbose=2) update_database_with_metadata(record, rec_id, oai_rec_id, affected_tags, pretend=pretend) write_message(" -Stage COMPLETED", verbose=2) else: write_message(" -Stage NOT NEEDED in mode %s" % opt_mode, verbose=2) record_deleted_p = False # Finally we update the bibrec table with the current date write_message("Stage 6: Start (Update bibrec table with current date).", verbose=2) if opt_notimechange == 0 and (updates_exist or record_had_FFT): record_modification_date = convert_datestruct_to_datetext(time.localtime()) write_message(" -Retrieved current localtime: DONE", verbose=2) update_bibrec_date(record_modification_date, rec_id, insert_mode_p, record_creation_date, pretend=pretend) write_message(" -Stage COMPLETED", verbose=2) else: write_message(" -Stage NOT NEEDED", verbose=2) # Increase statistics if insert_mode_p: stat['nb_records_inserted'] += 1 else: stat['nb_records_updated'] += 1 # Upload of this record finish write_message("Record "+str(rec_id)+" DONE", verbose=1) return (0, int(rec_id), "") finally: if record_deleted_p: ## BibUpload has failed living the record deleted. We should ## back the original record then. update_database_with_metadata(original_record, rec_id, oai_rec_id, pretend=pretend) write_message(" Restored original record", verbose=1, stream=sys.stderr) def record_is_valid(record): """ Check if the record is valid. Currently this simply checks if the record has exactly one rec_id. @param record: the record @type record: recstruct @return: True if the record is valid @rtype: bool """ rec_ids = record_get_field_values(record, tag="001") if len(rec_ids) != 1: write_message(" The record is not valid: it has not a single rec_id: %s" % (rec_ids), stream=sys.stderr) return False return True def find_record_ids_by_oai_id(oaiId): """ A method finding the records identifier provided the oai identifier returns a list of identifiers matching a given oai identifier """ # Is this record already in invenio (matching by oaiid) if oaiId: recids = search_pattern(p=oaiId, f=CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG, m='e') # Is this record already in invenio (matching by reportnumber i.e. # particularly 037. Idea: to avoid double insertions) repnumber = oaiId.split(":")[-1] if repnumber: recids |= search_pattern(p = repnumber, f = "reportnumber", m = 'e' ) # Is this record already in invenio (matching by reportnumber i.e. # particularly 037. 
Idea: to avoid double insertions) repnumber = "arXiv:" + oaiId.split(":")[-1] recids |= search_pattern(p = repnumber, f = "reportnumber", m = 'e' ) if CFG_BIBUPLOAD_MATCH_DELETED_RECORDS: return recids else: if CFG_CERN_SITE: return recids - (search_pattern(p='DELETED', f='980__%', m='e') | search_pattern(p='DUMMY', f='980__%', m='e')) else: return recids - search_pattern(p='DELETED', f='980__%', m='e') else: return intbitset() def bibupload_post_phase(record, mode=None, rec_id="", pretend=False, tmp_ids=None, tmp_vers=None): def _elaborate_tag(record, tag, fun): if extract_tag_from_record(record, tag) is not None: try: record = fun() except Exception, e: register_exception() write_message(" Stage failed: ERROR: while elaborating %s tags: %s" % (tag, e), verbose=1, stream=sys.stderr) return (1, int(rec_id)) # TODO: ? if record is None: write_message(" Stage failed: ERROR: while elaborating %s tags" % (tag, ), verbose=1, stream=sys.stderr) return (1, int(rec_id)) write_message(" -Stage COMPLETED", verbose=2) else: write_message(" -Stage NOT NEEDED", verbose=2) if tmp_ids is None: tmp_ids = {} if tmp_vers is None: tmp_vers = {} _elaborate_tag(record, "BDR", lambda: elaborate_brt_tags(record, rec_id = rec_id, mode = mode, pretend = pretend, tmp_ids = tmp_ids, tmp_vers = tmp_vers)) _elaborate_tag(record, "BDM", lambda: elaborate_mit_tags(record, rec_id = rec_id, mode = mode, pretend = pretend, tmp_ids = tmp_ids, tmp_vers = tmp_vers)) def submit_ticket_for_holding_pen(rec_id, err, msg, pretend=False): """ Submit a ticket via BibCatalog to report about a record that has been put into the Holding Pen. @rec_id: the affected record @err: the corresponding Exception msg: verbose message """ from invenio import bibtask from invenio.webuser import get_email_from_username, get_uid_from_email user = task_get_task_param("user") uid = None if user: try: uid = get_uid_from_email(get_email_from_username(user)) except Exception, err: write_message("WARNING: can't reliably retrieve uid for user %s: %s" % (user, err), stream=sys.stderr) if check_bibcatalog(): text = """ %(msg)s found for record %(rec_id)s: %(err)s See: <%(siteurl)s/record/edit/#state=edit&recid=%(rec_id)s> BibUpload task information: task_id: %(task_id)s task_specific_name: %(task_specific_name)s user: %(user)s task_params: %(task_params)s task_options: %(task_options)s""" % { "msg": msg, "rec_id": rec_id, "err": err, "siteurl": CFG_SITE_SECURE_URL, "task_id": task_get_task_param("task_id"), "task_specific_name": task_get_task_param("task_specific_name"), "user": user, "task_params": bibtask._TASK_PARAMS, "task_options": bibtask._OPTIONS} if not pretend: BIBCATALOG_SYSTEM.ticket_submit(subject="%s: %s by %s" % (msg, rec_id, user), recordid=rec_id, text=text, queue=CFG_BIBUPLOAD_CONFLICTING_REVISION_TICKET_QUEUE, owner=uid) def insert_record_into_holding_pen(record, oai_id, pretend=False): query = "INSERT INTO bibHOLDINGPEN (oai_id, changeset_date, changeset_xml, id_bibrec) VALUES (%s, NOW(), %s, %s)" xml_record = record_xml_output(record) bibrec_ids = find_record_ids_by_oai_id(oai_id) # here determining the identifier of the record if len(bibrec_ids) > 0: bibrec_id = bibrec_ids.pop() else: # id not found by using the oai_id, let's use a wider search based # on any information we might have. bibrec_id = retrieve_rec_id(record, 'holdingpen', pretend=pretend) if bibrec_id is None: bibrec_id = 0 if not pretend: run_sql(query, (oai_id, compress(xml_record), bibrec_id)) # record_id is logged as 0! 
( We are not inserting into the main database) log_record_uploading(oai_id, task_get_task_param('task_id', 0), 0, 'H', pretend=pretend) stat['nb_holdingpen'] += 1 def print_out_bibupload_statistics(): """Print the statistics of the process""" out = "Task stats: %(nb_input)d input records, %(nb_updated)d updated, " \ "%(nb_inserted)d inserted, %(nb_errors)d errors, %(nb_holdingpen)d inserted to holding pen. " \ "Time %(nb_sec).2f sec." % { \ 'nb_input': stat['nb_records_to_upload'], 'nb_updated': stat['nb_records_updated'], 'nb_inserted': stat['nb_records_inserted'], 'nb_errors': stat['nb_errors'], 'nb_holdingpen': stat['nb_holdingpen'], 'nb_sec': time.time() - time.mktime(stat['exectime']) } write_message(out) def open_marc_file(path): """Open a file and return the data""" try: # open the file containing the marc document marc_file = open(path, 'r') marc = marc_file.read() marc_file.close() except IOError, erro: write_message("ERROR: %s" % erro, verbose=1, stream=sys.stderr) if erro.errno == 2: # No such file or directory # Not scary e = RecoverableError('File does not exist: %s' % path) else: e = StandardError('File not accessible: %s' % path) raise e return marc def xml_marc_to_records(xml_marc): """create the records""" # Creation of the records from the xml Marc in argument recs = create_records(xml_marc, 1, 1) if recs == []: msg = "ERROR: Cannot parse MARCXML file." write_message(msg, verbose=1, stream=sys.stderr) raise StandardError(msg) elif recs[0][0] is None: msg = "ERROR: MARCXML file has wrong format: %s" % recs write_message(msg, verbose=1, stream=sys.stderr) raise RecoverableError(msg) else: recs = map((lambda x:x[0]), recs) return recs def find_record_format(rec_id, bibformat): """Look whether record REC_ID is formatted in FORMAT, i.e. whether FORMAT exists in the bibfmt table for this record. Return the number of times it is formatted: 0 if not, 1 if yes, 2 if found more than once (should never occur). """ out = 0 query = """SELECT COUNT(*) FROM bibfmt WHERE id_bibrec=%s AND format=%s""" params = (rec_id, bibformat) res = [] res = run_sql(query, params) out = res[0][0] return out def find_record_from_recid(rec_id): """ Try to find record in the database from the REC_ID number. Return record ID if found, None otherwise. """ res = run_sql("SELECT id FROM bibrec WHERE id=%s", (rec_id,)) if res: return res[0][0] else: return None def find_record_from_sysno(sysno): """ Try to find record in the database from the external SYSNO number. Return record ID if found, None otherwise. """ bibxxx = 'bib'+CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[0:2]+'x' bibrec_bibxxx = 'bibrec_' + bibxxx res = run_sql("""SELECT bb.id_bibrec FROM %(bibrec_bibxxx)s AS bb, %(bibxxx)s AS b WHERE b.tag=%%s AND b.value=%%s AND bb.id_bibxxx=b.id""" % {'bibxxx': bibxxx, 'bibrec_bibxxx': bibrec_bibxxx}, (CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG, sysno,)) for recid in res: if CFG_BIBUPLOAD_MATCH_DELETED_RECORDS: return recid[0] else: if record_exists(recid[0]) > 0: ## Only non deleted records return recid[0] return None def find_records_from_extoaiid(extoaiid, extoaisrc=None): """ Try to find records in the database from the external EXTOAIID number. Return list of record ID if found, None otherwise. 
""" assert(CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:5] == CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG[:5]) bibxxx = 'bib'+CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[0:2]+'x' bibrec_bibxxx = 'bibrec_' + bibxxx write_message(' Looking for extoaiid="%s" with extoaisrc="%s"' % (extoaiid, extoaisrc), verbose=9) id_bibrecs = intbitset(run_sql("""SELECT bb.id_bibrec FROM %(bibrec_bibxxx)s AS bb, %(bibxxx)s AS b WHERE b.tag=%%s AND b.value=%%s AND bb.id_bibxxx=b.id""" % {'bibxxx': bibxxx, 'bibrec_bibxxx': bibrec_bibxxx}, (CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG, extoaiid,))) write_message(' Partially found %s for extoaiid="%s"' % (id_bibrecs, extoaiid), verbose=9) ret = intbitset() for id_bibrec in id_bibrecs: if not CFG_BIBUPLOAD_MATCH_DELETED_RECORDS: if record_exists(id_bibrec) < 1: ## We don't match not existing records continue record = get_record(id_bibrec) instances = record_get_field_instances(record, CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[0:3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4]) write_message(' recid %s -> instances "%s"' % (id_bibrec, instances), verbose=9) for instance in instances: this_extoaisrc = field_get_subfield_values(instance, CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG[5]) this_extoaisrc = this_extoaisrc and this_extoaisrc[0] or None this_extoaiid = field_get_subfield_values(instance, CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5]) this_extoaiid = this_extoaiid and this_extoaiid[0] or None write_message(" this_extoaisrc -> %s, this_extoaiid -> %s" % (this_extoaisrc, this_extoaiid), verbose=9) if this_extoaiid == extoaiid: write_message(' recid %s -> provenance "%s"' % (id_bibrec, this_extoaisrc), verbose=9) if this_extoaisrc == extoaisrc: write_message('Found recid %s for extoaiid="%s" with provenance="%s"' % (id_bibrec, extoaiid, extoaisrc), verbose=9) ret.add(id_bibrec) break if this_extoaisrc is None: write_message('WARNING: Found recid %s for extoaiid="%s" that doesn\'t specify any provenance, while input record does.' % (id_bibrec, extoaiid), stream=sys.stderr) if extoaisrc is None: write_message('WARNING: Found recid %s for extoaiid="%s" that specify a provenance (%s), while input record does not have a provenance.' % (id_bibrec, extoaiid, this_extoaisrc), stream=sys.stderr) return ret def find_record_from_oaiid(oaiid): """ Try to find record in the database from the OAI ID number and OAI SRC. Return record ID if found, None otherwise. """ bibxxx = 'bib'+CFG_OAI_ID_FIELD[0:2]+'x' bibrec_bibxxx = 'bibrec_' + bibxxx res = run_sql("""SELECT bb.id_bibrec FROM %(bibrec_bibxxx)s AS bb, %(bibxxx)s AS b WHERE b.tag=%%s AND b.value=%%s AND bb.id_bibxxx=b.id""" % {'bibxxx': bibxxx, 'bibrec_bibxxx': bibrec_bibxxx}, (CFG_OAI_ID_FIELD, oaiid,)) for recid in res: if CFG_BIBUPLOAD_MATCH_DELETED_RECORDS: return recid[0] else: if record_exists(recid[0]) > 0: ## Only non deleted records return recid[0] return None def find_record_from_doi(doi): """ Try to find record in the database from the given DOI. Return record ID if found, None otherwise. 
""" bibxxx = 'bib02x' bibrec_bibxxx = 'bibrec_' + bibxxx res = run_sql("""SELECT bb.id_bibrec, bb.field_number FROM %(bibrec_bibxxx)s AS bb, %(bibxxx)s AS b WHERE b.tag=%%s AND b.value=%%s AND bb.id_bibxxx=b.id""" % {'bibxxx': bibxxx, 'bibrec_bibxxx': bibrec_bibxxx}, ('0247_a', doi,)) # For each of the result, make sure that it is really tagged as doi for (id_bibrec, field_number) in res: if not CFG_BIBUPLOAD_MATCH_DELETED_RECORDS: if record_exists(id_bibrec) < 1: ## We don't match not existing records continue res = run_sql("""SELECT bb.id_bibrec FROM %(bibrec_bibxxx)s AS bb, %(bibxxx)s AS b WHERE b.tag=%%s AND b.value=%%s AND bb.id_bibxxx=b.id and bb.field_number=%%s and bb.id_bibrec=%%s""" % {'bibxxx': bibxxx, 'bibrec_bibxxx': bibrec_bibxxx}, ('0247_2', "doi", field_number, id_bibrec)) if res and res[0][0] == id_bibrec: return res[0][0] return None def extract_tag_from_record(record, tag_number): """ Extract the tag_number for record.""" # first step verify if the record is not already in the database if record: return record.get(tag_number, None) return None def retrieve_rec_id(record, opt_mode, pretend=False, post_phase = False): """Retrieve the record Id from a record by using tag 001 or SYSNO or OAI ID or DOI tag. opt_mod is the desired mode. @param post_phase Tells if we are calling this method in the postprocessing phase. If true, we accept presence of 001 fields even in the insert mode @type post_phase boolean """ rec_id = None # 1st step: we look for the tag 001 tag_001 = extract_tag_from_record(record, '001') if tag_001 is not None: # We extract the record ID from the tag rec_id = tag_001[0][3] # if we are in insert mode => error if opt_mode == 'insert' and not post_phase: write_message(" Failed: tag 001 found in the xml" \ " submitted, you should use the option replace," \ " correct or append to replace an existing" \ " record. (-h for help)", verbose=1, stream=sys.stderr) return -1 else: # we found the rec id and we are not in insert mode => continue # we try to match rec_id against the database: if find_record_from_recid(rec_id) is not None: # okay, 001 corresponds to some known record return int(rec_id) elif opt_mode in ('replace', 'replace_or_insert'): if task_get_option('force'): # we found the rec_id but it's not in the system and we are # requested to replace records. Therefore we create on the fly # a empty record allocating the recid. write_message(" WARNING: tag 001 found in the xml with" " value %(rec_id)s, but rec_id %(rec_id)s does" " not exist. Since the mode replace was" " requested the rec_id %(rec_id)s is allocated" " on-the-fly." % {"rec_id": rec_id}, stream=sys.stderr) return create_new_record(rec_id=rec_id, pretend=pretend) else: # Since --force was not used we are going to raise an error write_message(" Failed: tag 001 found in the xml" " submitted with value %(rec_id)s. The" " corresponding record however does not" " exists. If you want to really create" " such record, please use the --force" " parameter when calling bibupload." % { "rec_id": rec_id}, stream=sys.stderr) return -1 else: # The record doesn't exist yet. We shall have try to check # the SYSNO or OAI or DOI id later. 
write_message(" -Tag 001 value not found in database.", verbose=9) rec_id = None else: write_message(" -Tag 001 not found in the xml marc file.", verbose=9) if rec_id is None: # 2nd step we look for the SYSNO sysnos = record_get_field_values(record, CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[0:3], CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4] or "", CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5] or "", CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[5:6]) if sysnos: sysno = sysnos[0] # there should be only one external SYSNO write_message(" -Checking if SYSNO " + sysno + \ " exists in the database", verbose=9) # try to find the corresponding rec id from the database rec_id = find_record_from_sysno(sysno) if rec_id is not None: # rec_id found pass else: # The record doesn't exist yet. We will try to check # external and internal OAI ids later. write_message(" -Tag SYSNO value not found in database.", verbose=9) rec_id = None else: write_message(" -Tag SYSNO not found in the xml marc file.", verbose=9) if rec_id is None: # 2nd step we look for the external OAIID extoai_fields = record_get_field_instances(record, CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[0:3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4] or "", CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5] or "") if extoai_fields: for field in extoai_fields: extoaiid = field_get_subfield_values(field, CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5:6]) extoaisrc = field_get_subfield_values(field, CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG[5:6]) if extoaiid: extoaiid = extoaiid[0] if extoaisrc: extoaisrc = extoaisrc[0] else: extoaisrc = None write_message(" -Checking if EXTOAIID %s (%s) exists in the database" % (extoaiid, extoaisrc), verbose=9) # try to find the corresponding rec id from the database rec_ids = find_records_from_extoaiid(extoaiid, extoaisrc) if rec_ids: # rec_id found rec_id = rec_ids.pop() break else: # The record doesn't exist yet. We will try to check # OAI id later. write_message(" -Tag EXTOAIID value not found in database.", verbose=9) rec_id = None else: write_message(" -Tag EXTOAIID not found in the xml marc file.", verbose=9) if rec_id is None: # 4th step we look for the OAI ID oaiidvalues = record_get_field_values(record, CFG_OAI_ID_FIELD[0:3], CFG_OAI_ID_FIELD[3:4] != "_" and \ CFG_OAI_ID_FIELD[3:4] or "", CFG_OAI_ID_FIELD[4:5] != "_" and \ CFG_OAI_ID_FIELD[4:5] or "", CFG_OAI_ID_FIELD[5:6]) if oaiidvalues: oaiid = oaiidvalues[0] # there should be only one OAI ID write_message(" -Check if local OAI ID " + oaiid + \ " exist in the database", verbose=9) # try to find the corresponding rec id from the database rec_id = find_record_from_oaiid(oaiid) if rec_id is not None: # rec_id found pass else: write_message(" -Tag OAI ID value not found in database.", verbose=9) rec_id = None else: write_message(" -Tag SYSNO not found in the xml marc file.", verbose=9) if rec_id is None: # 5th step we look for the DOI. record_dois = record_extract_dois(record) matching_recids = set() if record_dois: # try to find the corresponding rec id from the database for record_doi in record_dois: possible_recid = find_record_from_doi(record_doi) if possible_recid: matching_recids.add(possible_recid) if len(matching_recids) > 1: # Oops, this record refers to DOI existing in multiple records. # Dunno which one to choose. 
write_message(" Failed: Multiple records found in the" \ " database %s that match the DOI(s) in the input" \ " MARCXML %s" % (repr(matching_recids), repr(record_dois)), verbose=1, stream=sys.stderr) return -1 elif len(matching_recids) == 1: rec_id = matching_recids.pop() if opt_mode == 'insert': write_message(" Failed: DOI tag matching record #%s found in the xml" \ " submitted, you should use the option replace," \ " correct or append to replace an existing" \ " record. (-h for help)" % rec_id, verbose=1, stream=sys.stderr) return -1 else: write_message(" - Tag DOI value not found in database.", verbose=9) rec_id = None else: write_message(" -Tag DOI not found in the xml marc file.", verbose=9) # Now we should have detected rec_id from SYSNO or OAIID # tags. (None otherwise.) if rec_id: if opt_mode == 'insert': write_message(" Failed: Record found in the database," \ " you should use the option replace," \ " correct or append to replace an existing" \ " record. (-h for help)", verbose=1, stream=sys.stderr) return -1 else: if opt_mode != 'insert' and \ opt_mode != 'replace_or_insert': write_message(" Failed: Record not found in the database."\ " Please insert the file before updating it."\ " (-h for help)", verbose=1, stream=sys.stderr) return -1 return rec_id and int(rec_id) or None def check_record_doi_is_unique(rec_id, record): """ Check that DOI found in 'record' does not exist in any other record than 'recid'. Return (boolean, msg) where 'boolean' would be True if the DOI is unique. """ record_dois = record_extract_dois(record) if record_dois: matching_recids = set() for record_doi in record_dois: possible_recid = find_record_from_doi(record_doi) if possible_recid: matching_recids.add(possible_recid) if len(matching_recids) > 1: # Oops, this record refers to DOI existing in multiple records. msg = " Failed: Multiple records found in the" \ " database %s that match the DOI(s) in the input" \ " MARCXML %s" % (repr(matching_recids), repr(record_dois)) return (False, msg) elif len(matching_recids) == 1: matching_recid = matching_recids.pop() if str(matching_recid) != str(rec_id): # Oops, this record refers to DOI existing in a different record. msg = " Failed: DOI(s) %s found in this record (#%s)" \ " already exist(s) in another other record (#%s)" % \ (repr(record_dois), rec_id, matching_recid) return (False, msg) return (True, "") ### Insert functions def create_new_record(rec_id=None, pretend=False): """ Create new record in the database @param rec_id: if specified the new record will have this rec_id. @type rec_id: int @return: the allocated rec_id @rtype: int @note: in case of errors will be returned None """ if rec_id is not None: try: rec_id = int(rec_id) except (ValueError, TypeError), error: write_message(" ERROR: during the creation_new_record function: %s " % error, verbose=1, stream=sys.stderr) return None if run_sql("SELECT id FROM bibrec WHERE id=%s", (rec_id, )): write_message(" ERROR: during the creation_new_record function: the requested rec_id %s already exists." 
% rec_id) return None if pretend: if rec_id: return rec_id else: return run_sql("SELECT max(id)+1 FROM bibrec")[0][0] if rec_id is not None: return run_sql("INSERT INTO bibrec (id, creation_date, modification_date) VALUES (%s, NOW(), NOW())", (rec_id, )) else: return run_sql("INSERT INTO bibrec (creation_date, modification_date) VALUES (NOW(), NOW())") def insert_bibfmt(id_bibrec, marc, bibformat, modification_date='1970-01-01 00:00:00', pretend=False): """Insert the format in the table bibfmt""" # compress the marc value pickled_marc = compress(marc) try: time.strptime(modification_date, "%Y-%m-%d %H:%M:%S") except ValueError: modification_date = '1970-01-01 00:00:00' query = """INSERT LOW_PRIORITY INTO bibfmt (id_bibrec, format, last_updated, value) - VALUES (%s, %s, %s, %s)""" + VALUES (%s, %s, %s, _binary %s)""" if not pretend: row_id = run_sql(query, (id_bibrec, bibformat, modification_date, pickled_marc)) return row_id else: return 1 def insert_record_bibxxx(tag, value, pretend=False): """Insert the record into bibxxx""" # determine into which table one should insert the record table_name = 'bib'+tag[0:2]+'x' # check if the tag, value combination exists in the table query = """SELECT id,value FROM %s """ % table_name query += """ WHERE tag=%s AND value=%s""" params = (tag, value) res = None res = run_sql(query, params) # Note: compare now the found values one by one and look for # string binary equality (e.g. to respect lowercase/uppercase # match), regardless of the charset etc settings. Ideally we # could use a BINARY operator in the above SELECT statement, but # we would have to check compatibility on various MySQLdb versions # etc; this approach checks all matched values in Python, not in # MySQL, which is less cool, but more conservative, so it should # work better on most setups. if res: for row in res: row_id = row[0] row_value = row[1] if row_value == value: return (table_name, row_id) # We got here only when the tag, value combination was not found, # so it is now necessary to insert the tag, value combination into # bibxxx table as new. query = """INSERT INTO %s """ % table_name query += """ (tag, value) values (%s , %s)""" params = (tag, value) if not pretend: row_id = run_sql(query, params) else: return (table_name, 1) return (table_name, row_id) def insert_record_bibrec_bibxxx(table_name, id_bibxxx, field_number, id_bibrec, pretend=False): """Insert the record into bibrec_bibxxx""" # determine into which table one should insert the record full_table_name = 'bibrec_'+ table_name # insert the proper row into the table query = """INSERT INTO %s """ % full_table_name query += """(id_bibrec,id_bibxxx, field_number) values (%s , %s, %s)""" params = (id_bibrec, id_bibxxx, field_number) if not pretend: res = run_sql(query, params) else: return 1 return res def synchronize_8564(rec_id, record, record_had_FFT, bibrecdocs, pretend=False): """ Synchronize 8564_ tags and BibDocFile tables. This function directly manipulate the record parameter. @type rec_id: positive integer @param rec_id: the record identifier. @param record: the record structure as created by bibrecord.create_record @type record_had_FFT: boolean @param record_had_FFT: True if the incoming bibuploaded-record used FFT @return: the manipulated record (which is also modified as a side effect) """ def merge_marc_into_bibdocfile(field, pretend=False): """ Internal function that reads a single field and stores its content in BibDocFile tables. @param field: the 8564_ field containing a BibDocFile URL. 
""" write_message('Merging field: %s' % (field, ), verbose=9) url = field_get_subfield_values(field, 'u')[:1] or field_get_subfield_values(field, 'q')[:1] description = field_get_subfield_values(field, 'y')[:1] comment = field_get_subfield_values(field, 'z')[:1] if url: recid, docname, docformat = decompose_bibdocfile_url(url[0]) if recid != rec_id: write_message("INFO: URL %s is not pointing to a fulltext owned by this record (%s)" % (url, recid), stream=sys.stderr) else: try: bibdoc = bibrecdocs.get_bibdoc(docname) if description and not pretend: bibdoc.set_description(description[0], docformat) if comment and not pretend: bibdoc.set_comment(comment[0], docformat) except InvenioBibDocFileError: ## Apparently the referenced docname doesn't exist anymore. ## Too bad. Let's skip it. write_message("WARNING: docname %s does not seem to exist for record %s. Has it been renamed outside FFT?" % (docname, recid), stream=sys.stderr) def merge_bibdocfile_into_marc(field, subfields): """ Internal function that reads BibDocFile table entries referenced by the URL in the given 8564_ field and integrate the given information directly with the provided subfields. @param field: the 8564_ field containing a BibDocFile URL. @param subfields: the subfields corresponding to the BibDocFile URL generated after BibDocFile tables. """ write_message('Merging subfields %s into field %s' % (subfields, field), verbose=9) subfields = dict(subfields) ## We make a copy not to have side-effects subfield_to_delete = [] for subfield_position, (code, value) in enumerate(field_get_subfield_instances(field)): ## For each subfield instance already existing... if code in subfields: ## ...We substitute it with what is in BibDocFile tables record_modify_subfield(record, '856', code, subfields[code], subfield_position, field_position_global=field[4]) del subfields[code] else: ## ...We delete it otherwise subfield_to_delete.append(subfield_position) subfield_to_delete.sort() for counter, position in enumerate(subfield_to_delete): ## FIXME: Very hackish algorithm. Since deleting a subfield ## will alterate the position of following subfields, we ## are taking note of this and adjusting further position ## by using a counter. record_delete_subfield_from(record, '856', position - counter, field_position_global=field[4]) subfields = subfields.items() subfields.sort() for code, value in subfields: ## Let's add non-previously existing subfields record_add_subfield_into(record, '856', code, value, field_position_global=field[4]) def get_bibdocfile_managed_info(): """ Internal function, returns a dictionary of BibDocFile URL -> wanna-be subfields. 
This information is retrieved from internal BibDoc structures rather than from input MARC XML files @rtype: mapping @return: BibDocFile URL -> wanna-be subfields dictionary """ ret = {} latest_files = bibrecdocs.list_latest_files(list_hidden=False) for afile in latest_files: url = afile.get_url() ret[url] = {'u': url} description = afile.get_description() comment = afile.get_comment() subformat = afile.get_subformat() size = afile.get_size() if description: ret[url]['y'] = description if comment: ret[url]['z'] = comment if subformat: ret[url]['x'] = subformat ret[url]['s'] = str(size) return ret write_message("Synchronizing MARC of recid '%s' with:\n%s" % (rec_id, record), verbose=9) tags856s = record_get_field_instances(record, '856', '%', '%') write_message("Original 856%% instances: %s" % tags856s, verbose=9) tags8564s_to_add = get_bibdocfile_managed_info() write_message("BibDocFile instances: %s" % tags8564s_to_add, verbose=9) positions_tags8564s_to_remove = [] for local_position, field in enumerate(tags856s): if field[1] == '4' and field[2] == ' ': write_message('Analysing %s' % (field, ), verbose=9) for url in field_get_subfield_values(field, 'u') + field_get_subfield_values(field, 'q'): if url in tags8564s_to_add: # there exists a link in the MARC of the record and the connection exists in BibDoc tables if record_had_FFT: merge_bibdocfile_into_marc(field, tags8564s_to_add[url]) else: merge_marc_into_bibdocfile(field, pretend=pretend) del tags8564s_to_add[url] break elif bibdocfile_url_p(url) and decompose_bibdocfile_url(url)[0] == rec_id: # The link exists and is potentially correct-looking link to a document # moreover, it refers to current record id ... but it does not exist in # internal BibDoc structures. This could have happen in the case of renaming a document # or its removal. In both cases we have to remove link... a new one will be created positions_tags8564s_to_remove.append(local_position) write_message("%s to be deleted and re-synchronized" % (field, ), verbose=9) break record_delete_fields(record, '856', positions_tags8564s_to_remove) tags8564s_to_add = tags8564s_to_add.values() tags8564s_to_add.sort() ## FIXME: we are not yet able to preserve the sorting ## of 8564 tags WRT FFT in BibUpload. ## See ticket #1606. 
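    # Whatever is left in tags8564s_to_add at this point are documents known to
    # BibDocFile but not referenced by any 8564_ URL in the incoming MARC: a fresh
    # 8564_ field is appended for each of them, built from the BibDoc-side subfields.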
for subfields in tags8564s_to_add: subfields = subfields.items() subfields.sort() record_add_field(record, '856', '4', ' ', subfields=subfields) write_message('Final record: %s' % record, verbose=9) return record def _get_subfield_value(field, subfield_code, default=None): res = field_get_subfield_values(field, subfield_code) if res != [] and res != None: return res[0] else: return default def elaborate_mit_tags(record, rec_id, mode, pretend = False, tmp_ids = {}, tmp_vers = {}): """ Uploading MoreInfo -> BDM tags """ tuple_list = extract_tag_from_record(record, 'BDM') # Now gathering information from BDR tags - to be processed later write_message("Processing BDM entries of the record ") recordDocs = BibRecDocs(rec_id) if tuple_list: for mit in record_get_field_instances(record, 'BDM', ' ', ' '): relation_id = _get_subfield_value(mit, "r") bibdoc_id = _get_subfield_value(mit, "i") # checking for a possibly temporary ID if not (bibdoc_id is None): bibdoc_id = resolve_identifier(tmp_ids, bibdoc_id) bibdoc_ver = _get_subfield_value(mit, "v") if not (bibdoc_ver is None): bibdoc_ver = resolve_identifier(tmp_vers, bibdoc_ver) bibdoc_name = _get_subfield_value(mit, "n") bibdoc_fmt = _get_subfield_value(mit, "f") moreinfo_str = _get_subfield_value(mit, "m") if bibdoc_id == None: if bibdoc_name == None: raise StandardError("Incorrect relation. Neither name nor identifier of the first obejct has been specified") else: # retrieving the ID based on the document name (inside current record) # The document is attached to current record. try: bibdoc_id = recordDocs.get_docid(bibdoc_name) except: raise StandardError("BibDoc of a name %s does not exist within a record" % (bibdoc_name, )) else: if bibdoc_name != None: write_message("WARNING: both name and id of the first document of a relation have been specified. Ignoring the name", stream=sys.stderr) if (moreinfo_str is None or mode in ("replace", "correct")) and (not pretend): MoreInfo(docid=bibdoc_id , version = bibdoc_ver, docformat = bibdoc_fmt, relation = relation_id).delete() if (not moreinfo_str is None) and (not pretend): MoreInfo.create_from_serialised(moreinfo_str, docid=bibdoc_id, version = bibdoc_ver, docformat = bibdoc_fmt, relation = relation_id) return record def elaborate_brt_tags(record, rec_id, mode, pretend=False, tmp_ids = {}, tmp_vers = {}): """ Process BDR tags describing relations between existing objects """ tuple_list = extract_tag_from_record(record, 'BDR') # Now gathering information from BDR tags - to be processed later relations_to_create = [] write_message("Processing BDR entries of the record ") recordDocs = BibRecDocs(rec_id) #TODO: check what happens if there is no record yet ! Will the class represent an empty set? if tuple_list: for brt in record_get_field_instances(record, 'BDR', ' ', ' '): relation_id = _get_subfield_value(brt, "r") bibdoc1_id = None bibdoc1_name = None bibdoc1_ver = None bibdoc1_fmt = None bibdoc2_id = None bibdoc2_name = None bibdoc2_ver = None bibdoc2_fmt = None if not relation_id: bibdoc1_id = _get_subfield_value(brt, "i") bibdoc1_name = _get_subfield_value(brt, "n") if bibdoc1_id == None: if bibdoc1_name == None: raise StandardError("Incorrect relation. Neither name nor identifier of the first obejct has been specified") else: # retrieving the ID based on the document name (inside current record) # The document is attached to current record. 
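                    # The same name/id resolution is applied to both ends of the
                    # relation: $i/$n identify the first document and $j/$o the
                    # second one; a document name is only looked up (via BibRecDocs)
                    # when no explicit identifier was given.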
try: bibdoc1_id = recordDocs.get_docid(bibdoc1_name) except: raise StandardError("BibDoc of a name %s does not exist within a record" % \ (bibdoc1_name, )) else: # resolving temporary identifier bibdoc1_id = resolve_identifier(tmp_ids, bibdoc1_id) if bibdoc1_name != None: write_message("WARNING: both name and id of the first document of a relation have been specified. Ignoring the name", stream=sys.stderr) bibdoc1_ver = _get_subfield_value(brt, "v") if not (bibdoc1_ver is None): bibdoc1_ver = resolve_identifier(tmp_vers, bibdoc1_ver) bibdoc1_fmt = _get_subfield_value(brt, "f") bibdoc2_id = _get_subfield_value(brt, "j") bibdoc2_name = _get_subfield_value(brt, "o") if bibdoc2_id == None: if bibdoc2_name == None: raise StandardError("Incorrect relation. Neither name nor identifier of the second obejct has been specified") else: # retrieving the ID based on the document name (inside current record) # The document is attached to current record. try: bibdoc2_id = recordDocs.get_docid(bibdoc2_name) except: raise StandardError("BibDoc of a name %s does not exist within a record" % (bibdoc2_name, )) else: bibdoc2_id = resolve_identifier(tmp_ids, bibdoc2_id) if bibdoc2_name != None: write_message("WARNING: both name and id of the first document of a relation have been specified. Ignoring the name", stream=sys.stderr) bibdoc2_ver = _get_subfield_value(brt, "w") if not (bibdoc2_ver is None): bibdoc2_ver = resolve_identifier(tmp_vers, bibdoc2_ver) bibdoc2_fmt = _get_subfield_value(brt, "g") control_command = _get_subfield_value(brt, "d") relation_type = _get_subfield_value(brt, "t") if not relation_type and not relation_id: raise StandardError("The relation type must be specified") more_info = _get_subfield_value(brt, "m") # the relation id might be specified in the case of updating # MoreInfo table instead of other fields rel_obj = None if not relation_id: rels = BibRelation.get_relations(rel_type = relation_type, bibdoc1_id = bibdoc1_id, bibdoc2_id = bibdoc2_id, bibdoc1_ver = bibdoc1_ver, bibdoc2_ver = bibdoc2_ver, bibdoc1_fmt = bibdoc1_fmt, bibdoc2_fmt = bibdoc2_fmt) if len(rels) > 0: rel_obj = rels[0] relation_id = rel_obj.id else: rel_obj = BibRelation(rel_id=relation_id) relations_to_create.append((relation_id, bibdoc1_id, bibdoc1_ver, bibdoc1_fmt, bibdoc2_id, bibdoc2_ver, bibdoc2_fmt, relation_type, more_info, rel_obj, control_command)) record_delete_field(record, 'BDR', ' ', ' ') if mode in ("insert", "replace_or_insert", "append", "correct", "replace"): # now creating relations between objects based on the data if not pretend: for (relation_id, bibdoc1_id, bibdoc1_ver, bibdoc1_fmt, bibdoc2_id, bibdoc2_ver, bibdoc2_fmt, rel_type, more_info, rel_obj, control_command) in relations_to_create: if rel_obj == None: rel_obj = BibRelation.create(bibdoc1_id = bibdoc1_id, bibdoc1_ver = bibdoc1_ver, bibdoc1_fmt = bibdoc1_fmt, bibdoc2_id = bibdoc2_id, bibdoc2_ver = bibdoc2_ver, bibdoc2_fmt = bibdoc2_fmt, rel_type = rel_type) relation_id = rel_obj.id if mode in ("replace"): # Clearing existing MoreInfo content rel_obj.get_more_info().delete() if more_info: MoreInfo.create_from_serialised(more_info, relation = relation_id) if control_command == "DELETE": rel_obj.delete() else: write_message("BDR tag is not processed in the %s mode" % (mode, )) return record def elaborate_fft_tags(record, rec_id, mode, pretend=False, tmp_ids = {}, tmp_vers = {}, bibrecdocs=None): """ Process FFT tags that should contain $a with file pathes or URLs to get the fulltext from. 
This function enriches record with proper 8564 URL tags, downloads fulltext files and stores them into var/data structure where appropriate. CFG_BIBUPLOAD_WGET_SLEEP_TIME defines time to sleep in seconds in between URL downloads. Note: if an FFT tag contains multiple $a subfields, we upload them into different 856 URL tags in the metadata. See regression test case test_multiple_fft_insert_via_http(). """ # Let's define some handy sub procedure. def _add_new_format(bibdoc, url, docformat, docname, doctype, newname, description, comment, flags, modification_date, pretend=False): """Adds a new format for a given bibdoc. Returns True when everything's fine.""" write_message('Add new format to %s url: %s, format: %s, docname: %s, doctype: %s, newname: %s, description: %s, comment: %s, flags: %s, modification_date: %s' % (repr(bibdoc), url, docformat, docname, doctype, newname, description, comment, flags, modification_date), verbose=9) try: if not url: # Not requesting a new url. Just updating comment & description return _update_description_and_comment(bibdoc, docname, docformat, description, comment, flags, pretend=pretend) try: if not pretend: bibdoc.add_file_new_format(url, description=description, comment=comment, flags=flags, modification_date=modification_date) except StandardError, e: write_message("('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s') not inserted because format already exists (%s)." % (url, docformat, docname, doctype, newname, description, comment, flags, modification_date, e), stream=sys.stderr) raise except Exception, e: write_message("ERROR: in adding '%s' as a new format because of: %s" % (url, e), stream=sys.stderr) raise return True def _add_new_version(bibdoc, url, docformat, docname, doctype, newname, description, comment, flags, modification_date, pretend=False): """Adds a new version for a given bibdoc. Returns True when everything's fine.""" write_message('Add new version to %s url: %s, format: %s, docname: %s, doctype: %s, newname: %s, description: %s, comment: %s, flags: %s' % (repr(bibdoc), url, docformat, docname, doctype, newname, description, comment, flags), verbose=9) try: if not url: return _update_description_and_comment(bibdoc, docname, docformat, description, comment, flags, pretend=pretend) try: if not pretend: bibdoc.add_file_new_version(url, description=description, comment=comment, flags=flags, modification_date=modification_date) except StandardError, e: write_message("('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s') not inserted because '%s'." % (url, docformat, docname, doctype, newname, description, comment, flags, modification_date, e), stream=sys.stderr) raise except Exception, e: write_message("ERROR: in adding '%s' as a new version because of: %s" % (url, e), stream=sys.stderr) raise return True def _update_description_and_comment(bibdoc, docname, docformat, description, comment, flags, pretend=False): """Directly update comments and descriptions.""" write_message('Just updating description and comment for %s with format %s with description %s, comment %s and flags %s' % (docname, docformat, description, comment, flags), verbose=9) try: if not pretend: bibdoc.set_description(description, docformat) bibdoc.set_comment(comment, docformat) for flag in CFG_BIBDOCFILE_AVAILABLE_FLAGS: if flag in flags: bibdoc.set_flag(flag, docformat) else: bibdoc.unset_flag(flag, docformat) except StandardError, e: write_message("('%s', '%s', '%s', '%s', '%s') description and comment not updated because '%s'." 
% (docname, docformat, description, comment, flags, e)) raise return True def _process_document_moreinfos(more_infos, docname, version, docformat, mode): if not mode in ('correct', 'append', 'replace_or_insert', 'replace', 'insert'): #print "exited because the mode is incorrect" return docid = None try: docid = bibrecdocs.get_docid(docname) except: raise StandardError("MoreInfo: No document of a given name associated with the record") if not version: # We have to retrieve the most recent version ... version = bibrecdocs.get_bibdoc(docname).get_latest_version() doc_moreinfo_s, version_moreinfo_s, version_format_moreinfo_s, format_moreinfo_s = more_infos if mode in ("replace", "replace_or_insert"): if doc_moreinfo_s: #only if specified, otherwise do not touch MoreInfo(docid = docid).delete() if format_moreinfo_s: #only if specified... otherwise do not touch MoreInfo(docid = docid, docformat = docformat).delete() if not doc_moreinfo_s is None: MoreInfo.create_from_serialised(ser_str = doc_moreinfo_s, docid = docid) if not version_moreinfo_s is None: MoreInfo.create_from_serialised(ser_str = version_moreinfo_s, docid = docid, version = version) if not version_format_moreinfo_s is None: MoreInfo.create_from_serialised(ser_str = version_format_moreinfo_s, docid = docid, version = version, docformat = docformat) if not format_moreinfo_s is None: MoreInfo.create_from_serialised(ser_str = format_moreinfo_s, docid = docid, docformat = docformat) if mode == 'delete': raise StandardError('FFT tag specified but bibupload executed in --delete mode') tuple_list = extract_tag_from_record(record, 'FFT') if tuple_list: # FFT Tags analysis write_message("FFTs: "+str(tuple_list), verbose=9) docs = {} # docnames and their data for fft in record_get_field_instances(record, 'FFT', ' ', ' '): # Very first, we retrieve the potentially temporary odentifiers... #even if the rest fails, we should include them in teh dictionary version = _get_subfield_value(fft, 'v', '') # checking if version is temporary... if so, filling a different varaible is_tmp_ver, bibdoc_tmpver = parse_identifier(version) if is_tmp_ver: version = None else: bibdoc_tmpver = None if not version: #treating cases of empty string etc... version = None bibdoc_tmpid = field_get_subfield_values(fft, 'i') if bibdoc_tmpid: bibdoc_tmpid = bibdoc_tmpid[0] else: bibdoc_tmpid is_tmp_id, bibdoc_tmpid = parse_identifier(bibdoc_tmpid) if not is_tmp_id: bibdoc_tmpid = None # In the case of having temporary id's, we dont resolve them yet but signaklise that they have been used # value -1 means that identifier has been declared but not assigned a value yet if bibdoc_tmpid: if bibdoc_tmpid in tmp_ids: write_message("WARNING: the temporary identifier %s has been declared more than once. Ignoring the second occurance" % (bibdoc_tmpid, ), stream=sys.stderr) else: tmp_ids[bibdoc_tmpid] = -1 if bibdoc_tmpver: if bibdoc_tmpver in tmp_vers: write_message("WARNING: the temporary version identifier %s has been declared more than once. Ignoring the second occurance" % (bibdoc_tmpver, ), stream=sys.stderr) else: tmp_vers[bibdoc_tmpver] = -1 # Let's discover the type of the document # This is a legacy field and will not be enforced any particular # check on it. doctype = _get_subfield_value(fft, 't', 'Main') #Default is Main # Let's discover the url. 
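# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): the bookkeeping used
# above for temporary identifiers carried in FFT $i / $v.  parse_identifier()
# is defined elsewhere in this module; the "TMP:" prefix below is only an
# assumption made for the example.  What matters is the convention that a
# declared but not yet resolved identifier is stored in tmp_ids with value -1.
def _sketch_parse_identifier(value, prefix='TMP:'):
    """Return (is_temporary, identifier) for a subfield value (hypothetical)."""
    if isinstance(value, str) and value.startswith(prefix):
        return True, value[len(prefix):]
    return False, value

_sketch_tmp_ids = {}
_is_tmp, _ident = _sketch_parse_identifier('TMP:doc_a')
if _is_tmp and _ident not in _sketch_tmp_ids:
    _sketch_tmp_ids[_ident] = -1   # declared; the real docid is assigned later
assert _sketch_tmp_ids == {'doc_a': -1}
# ---------------------------------------------------------------------------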
url = field_get_subfield_values(fft, 'a') if url: url = url[0] try: check_valid_url(url) except StandardError, e: raise StandardError, "fft '%s' specifies in $a a location ('%s') with problems: %s" % (fft, url, e) else: url = '' #TODO: a lot of code can be compactified using similar syntax ... should be more readable on the longer scale # maybe right side expressions look a bit cryptic, but the elaborate_fft function would be much clearer if mode == 'correct' and doctype != 'FIX-MARC': arg2 = "" else: arg2 = KEEP_OLD_VALUE description = _get_subfield_value(fft, 'd', arg2) # Let's discover the description # description = field_get_subfield_values(fft, 'd') # if description != []: # description = description[0] # else: # if mode == 'correct' and doctype != 'FIX-MARC': ## If the user require to correct, and do not specify ## a description this means she really want to ## modify the description. # description = '' # else: # description = KEEP_OLD_VALUE # Let's discover the desired docname to be created/altered name = field_get_subfield_values(fft, 'n') if name: ## Let's remove undesired extensions name = file_strip_ext(name[0] + '.pdf') else: if url: name = get_docname_from_url(url) elif mode != 'correct' and doctype != 'FIX-MARC': raise StandardError, "WARNING: fft '%s' doesn't specifies either a location in $a or a docname in $n" % str(fft) else: continue # Let's discover the desired new docname in case we want to change it newname = field_get_subfield_values(fft, 'm') if newname: newname = file_strip_ext(newname[0] + '.pdf') else: newname = name # Let's discover the desired format docformat = field_get_subfield_values(fft, 'f') if docformat: docformat = normalize_format(docformat[0]) else: if url: docformat = guess_format_from_url(url) else: docformat = "" # Let's discover the icon icon = field_get_subfield_values(fft, 'x') if icon != []: icon = icon[0] if icon != KEEP_OLD_VALUE: try: check_valid_url(icon) except StandardError, e: raise StandardError, "fft '%s' specifies in $x an icon ('%s') with problems: %s" % (fft, icon, e) else: icon = '' # Let's discover the comment comment = field_get_subfield_values(fft, 'z') if comment != []: comment = comment[0] else: if mode == 'correct' and doctype != 'FIX-MARC': ## See comment on description comment = '' else: comment = KEEP_OLD_VALUE # Let's discover the restriction restriction = field_get_subfield_values(fft, 'r') if restriction != []: restriction = restriction[0] else: if mode == 'correct' and doctype != 'FIX-MARC': ## See comment on description restriction = '' else: restriction = KEEP_OLD_VALUE document_moreinfo = _get_subfield_value(fft, 'w') version_moreinfo = _get_subfield_value(fft, 'p') version_format_moreinfo = _get_subfield_value(fft, 'b') format_moreinfo = _get_subfield_value(fft, 'u') # Let's discover the timestamp of the file (if any) timestamp = field_get_subfield_values(fft, 's') if timestamp: try: timestamp = datetime(*(time.strptime(timestamp[0], "%Y-%m-%d %H:%M:%S")[:6])) except ValueError: write_message('WARNING: The timestamp is not in a good format, thus will be ignored. 
The format should be YYYY-MM-DD HH:MM:SS', stream=sys.stderr) timestamp = '' else: timestamp = '' flags = field_get_subfield_values(fft, 'o') for flag in flags: if flag not in CFG_BIBDOCFILE_AVAILABLE_FLAGS: raise StandardError, "fft '%s' specifies a non available flag: %s" % (fft, flag) if docs.has_key(name): # new format considered (doctype2, newname2, restriction2, version2, urls, dummybibdoc_moreinfos2, dummybibdoc_tmpid2, dummybibdoc_tmpver2 ) = docs[name] if doctype2 != doctype: raise StandardError, "fft '%s' specifies a different doctype from previous fft with docname '%s'" % (str(fft), name) if newname2 != newname: raise StandardError, "fft '%s' specifies a different newname from previous fft with docname '%s'" % (str(fft), name) if restriction2 != restriction: raise StandardError, "fft '%s' specifies a different restriction from previous fft with docname '%s'" % (str(fft), name) if version2 != version: raise StandardError, "fft '%s' specifies a different version than the previous fft with docname '%s'" % (str(fft), name) for (dummyurl2, format2, dummydescription2, dummycomment2, dummyflags2, dummytimestamp2) in urls: if docformat == format2: raise StandardError, "fft '%s' specifies a second file '%s' with the same format '%s' from previous fft with docname '%s'" % (str(fft), url, docformat, name) if url or docformat: urls.append((url, docformat, description, comment, flags, timestamp)) if icon: urls.append((icon, icon[len(file_strip_ext(icon)):] + ';icon', description, comment, flags, timestamp)) else: if url or docformat: docs[name] = (doctype, newname, restriction, version, [(url, docformat, description, comment, flags, timestamp)], [document_moreinfo, version_moreinfo, version_format_moreinfo, format_moreinfo], bibdoc_tmpid, bibdoc_tmpver) if icon: docs[name][4].append((icon, icon[len(file_strip_ext(icon)):] + ';icon', description, comment, flags, timestamp)) elif icon: docs[name] = (doctype, newname, restriction, version, [(icon, icon[len(file_strip_ext(icon)):] + ';icon', description, comment, flags, timestamp)], [document_moreinfo, version_moreinfo, version_format_moreinfo, format_moreinfo], bibdoc_tmpid, bibdoc_tmpver) else: docs[name] = (doctype, newname, restriction, version, [], [document_moreinfo, version_moreinfo, version_format_moreinfo, format_moreinfo], bibdoc_tmpid, bibdoc_tmpver) write_message('Result of FFT analysis:\n\tDocs: %s' % (docs,), verbose=9) # Let's remove all FFT tags record_delete_field(record, 'FFT', ' ', ' ') ## Let's pre-download all the URLs to see if, in case of mode 'correct' or 'append' ## we can avoid creating a new revision. for docname, (doctype, newname, restriction, version, urls, more_infos, bibdoc_tmpid, bibdoc_tmpver ) in docs.items(): downloaded_urls = [] try: bibdoc = bibrecdocs.get_bibdoc(docname) except InvenioBibDocFileError: ## A bibdoc with the given docname does not exists. 
## So there is no chance we are going to revise an existing ## format with an identical file :-) bibdoc = None new_revision_needed = False for url, docformat, description, comment, flags, timestamp in urls: if url: try: downloaded_url = download_url(url, docformat) write_message("%s saved into %s" % (url, downloaded_url), verbose=9) except Exception, err: write_message("ERROR: in downloading '%s' because of: %s" % (url, err), stream=sys.stderr) raise if mode == 'correct' and bibdoc is not None and not new_revision_needed: downloaded_urls.append((downloaded_url, docformat, description, comment, flags, timestamp)) if not bibrecdocs.check_file_exists(downloaded_url, docformat): new_revision_needed = True else: write_message("WARNING: %s is already attached to bibdoc %s for recid %s" % (url, docname, rec_id), stream=sys.stderr) elif mode == 'append' and bibdoc is not None: if not bibrecdocs.check_file_exists(downloaded_url, docformat): downloaded_urls.append((downloaded_url, docformat, description, comment, flags, timestamp)) else: write_message("WARNING: %s is already attached to bibdoc %s for recid %s" % (url, docname, rec_id), stream=sys.stderr) else: downloaded_urls.append((downloaded_url, docformat, description, comment, flags, timestamp)) else: downloaded_urls.append(('', docformat, description, comment, flags, timestamp)) if mode == 'correct' and bibdoc is not None and not new_revision_needed: ## Since we don't need a new revision (because all the files ## that are being uploaded are different) ## we can simply remove the urls but keep the other information write_message("No need to add a new revision for docname %s for recid %s" % (docname, rec_id), verbose=2) docs[docname] = (doctype, newname, restriction, version, [('', docformat, description, comment, flags, timestamp) for (dummy, docformat, description, comment, flags, timestamp) in downloaded_urls], more_infos, bibdoc_tmpid, bibdoc_tmpver) for downloaded_url, dummy, dummy, dummy, dummy, dummy in downloaded_urls: ## Let's free up some space :-) if downloaded_url and os.path.exists(downloaded_url): os.remove(downloaded_url) else: if downloaded_urls or mode != 'append': docs[docname] = (doctype, newname, restriction, version, downloaded_urls, more_infos, bibdoc_tmpid, bibdoc_tmpver) else: ## In case we are in append mode and there are no urls to append ## we discard the whole FFT del docs[docname] if mode == 'replace': # First we erase previous bibdocs if not pretend: for bibdoc in bibrecdocs.list_bibdocs(): bibdoc.delete() bibrecdocs.dirty = True for docname, (doctype, newname, restriction, version, urls, more_infos, bibdoc_tmpid, bibdoc_tmpver) in docs.iteritems(): write_message("Elaborating olddocname: '%s', newdocname: '%s', doctype: '%s', restriction: '%s', urls: '%s', mode: '%s'" % (docname, newname, doctype, restriction, urls, mode), verbose=9) if mode in ('insert', 'replace'): # new bibdocs, new docnames, new marc if newname in bibrecdocs.get_bibdoc_names(): write_message("('%s', '%s') not inserted because docname already exists." % (newname, urls), stream=sys.stderr) raise StandardError("('%s', '%s') not inserted because docname already exists." % (newname, urls), stream=sys.stderr) try: if not pretend: bibdoc = bibrecdocs.add_bibdoc(doctype, newname) bibdoc.set_status(restriction) else: bibdoc = None except Exception, e: write_message("('%s', '%s', '%s') not inserted because: '%s'." 
% (doctype, newname, urls, e), stream=sys.stderr) raise e for (url, docformat, description, comment, flags, timestamp) in urls: assert(_add_new_format(bibdoc, url, docformat, docname, doctype, newname, description, comment, flags, timestamp, pretend=pretend)) elif mode == 'replace_or_insert': # to be thought as correct_or_insert try: bibdoc = bibrecdocs.get_bibdoc(docname) found_bibdoc = True except InvenioBibDocFileError: found_bibdoc = False else: if doctype not in ('PURGE', 'DELETE', 'EXPUNGE', 'REVERT', 'FIX-ALL', 'FIX-MARC', 'DELETE-FILE'): if newname != docname: try: if not pretend: bibrecdocs.change_name(newname=newname, docid=bibdoc.id) write_message(lambda: "After renaming: %s" % bibrecdocs, verbose=9) except StandardError, e: write_message('ERROR: in renaming %s to %s: %s' % (docname, newname, e), stream=sys.stderr) raise try: bibdoc = bibrecdocs.get_bibdoc(newname) found_bibdoc = True except InvenioBibDocFileError: found_bibdoc = False else: if doctype == 'PURGE': if not pretend: bibdoc.purge() bibrecdocs.dirty = True elif doctype == 'DELETE': if not pretend: bibdoc.delete() bibrecdocs.dirty = True elif doctype == 'EXPUNGE': if not pretend: bibdoc.expunge() bibrecdocs.dirty = True elif doctype == 'FIX-ALL': if not pretend: bibrecdocs.fix(docname) elif doctype == 'FIX-MARC': pass elif doctype == 'DELETE-FILE': if urls: for (url, docformat, description, comment, flags, timestamp) in urls: if not pretend: bibdoc.delete_file(docformat, version) elif doctype == 'REVERT': try: if not pretend: bibdoc.revert(version) except Exception, e: write_message('(%s, %s) not correctly reverted: %s' % (newname, version, e), stream=sys.stderr) raise else: if restriction != KEEP_OLD_VALUE: if not pretend: bibdoc.set_status(restriction) # Since the docname already existed we have to first # bump the version by pushing the first new file # then pushing the other files. if urls: (first_url, first_format, first_description, first_comment, first_flags, first_timestamp) = urls[0] other_urls = urls[1:] assert(_add_new_version(bibdoc, first_url, first_format, docname, doctype, newname, first_description, first_comment, first_flags, first_timestamp, pretend=pretend)) for (url, docformat, description, comment, flags, timestamp) in other_urls: assert(_add_new_format(bibdoc, url, docformat, docname, doctype, newname, description, comment, flags, timestamp, pretend=pretend)) ## Let's refresh the list of bibdocs. 
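# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): a reading aid for the
# special FFT $t values dispatched in the if/elif chain above when the bibdoc
# already exists.  The one-line summaries are approximate; the authoritative
# behaviour is the code itself.
_SKETCH_SPECIAL_FFT_DOCTYPES = {
    'PURGE':       'bibdoc.purge()        - drop older versions of the bibdoc',
    'DELETE':      'bibdoc.delete()       - soft-delete the whole bibdoc',
    'EXPUNGE':     'bibdoc.expunge()      - remove the bibdoc irreversibly',
    'REVERT':      'bibdoc.revert(v)      - go back to the version in FFT $v',
    'FIX-ALL':     'bibrecdocs.fix(name)  - repair the document structure',
    'FIX-MARC':    'no file action        - only re-synchronize the 856 tags',
    'DELETE-FILE': 'bibdoc.delete_file()  - drop the single format in FFT $f',
}
assert set(_SKETCH_SPECIAL_FFT_DOCTYPES) == set(
    ['PURGE', 'DELETE', 'EXPUNGE', 'REVERT', 'FIX-ALL', 'FIX-MARC', 'DELETE-FILE'])
# ---------------------------------------------------------------------------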
if not found_bibdoc: if not pretend: bibdoc = bibrecdocs.add_bibdoc(doctype, newname) bibdoc.set_status(restriction) for (url, docformat, description, comment, flags, timestamp) in urls: assert(_add_new_format(bibdoc, url, docformat, docname, doctype, newname, description, comment, flags, timestamp)) elif mode == 'correct': try: bibdoc = bibrecdocs.get_bibdoc(docname) found_bibdoc = True except InvenioBibDocFileError: found_bibdoc = False else: if doctype not in ('PURGE', 'DELETE', 'EXPUNGE', 'REVERT', 'FIX-ALL', 'FIX-MARC', 'DELETE-FILE'): if newname != docname: try: if not pretend: bibrecdocs.change_name(newname=newname, docid=bibdoc.id) write_message(lambda: "After renaming: %s" % bibrecdocs, verbose=9) except StandardError, e: write_message('ERROR: in renaming %s to %s: %s' % (docname, newname, e), stream=sys.stderr) raise try: bibdoc = bibrecdocs.get_bibdoc(newname) found_bibdoc = True except InvenioBibDocFileError: found_bibdoc = False else: if doctype == 'PURGE': if not pretend: bibdoc.purge() bibrecdocs.dirty = True elif doctype == 'DELETE': if not pretend: bibdoc.delete() bibrecdocs.dirty = True elif doctype == 'EXPUNGE': if not pretend: bibdoc.expunge() bibrecdocs.dirty = True elif doctype == 'FIX-ALL': if not pretend: bibrecdocs.fix(newname) elif doctype == 'FIX-MARC': pass elif doctype == 'DELETE-FILE': if urls: for (url, docformat, description, comment, flags, timestamp) in urls: if not pretend: bibdoc.delete_file(docformat, version) elif doctype == 'REVERT': try: if not pretend: bibdoc.revert(version) except Exception, e: write_message('(%s, %s) not correctly reverted: %s' % (newname, version, e), stream=sys.stderr) raise else: if restriction != KEEP_OLD_VALUE: if not pretend: bibdoc.set_status(restriction) if doctype and doctype != KEEP_OLD_VALUE: if not pretend: bibdoc.change_doctype(doctype) if urls: (first_url, first_format, first_description, first_comment, first_flags, first_timestamp) = urls[0] other_urls = urls[1:] assert(_add_new_version(bibdoc, first_url, first_format, docname, doctype, newname, first_description, first_comment, first_flags, first_timestamp, pretend=pretend)) for (url, docformat, description, comment, flags, timestamp) in other_urls: assert(_add_new_format(bibdoc, url, docformat, docname, doctype, newname, description, comment, flags, timestamp, pretend=pretend)) if not found_bibdoc: if doctype in ('PURGE', 'DELETE', 'EXPUNGE', 'FIX-ALL', 'FIX-MARC', 'DELETE-FILE', 'REVERT'): write_message("('%s', '%s', '%s') not performed because '%s' docname didn't existed." 
% (doctype, newname, urls, docname), stream=sys.stderr) raise StandardError else: if not pretend: bibdoc = bibrecdocs.add_bibdoc(doctype, newname) bibdoc.set_status(restriction) for (url, docformat, description, comment, flags, timestamp) in urls: assert(_add_new_format(bibdoc, url, docformat, docname, doctype, newname, description, comment, flags, timestamp)) elif mode == 'append': found_bibdoc = False try: bibdoc = bibrecdocs.get_bibdoc(docname) found_bibdoc = True except InvenioBibDocFileError: found_bibdoc = False else: for (url, docformat, description, comment, flags, timestamp) in urls: assert(_add_new_format(bibdoc, url, docformat, docname, doctype, newname, description, comment, flags, timestamp, pretend=pretend)) if not found_bibdoc: try: if not pretend: bibdoc = bibrecdocs.add_bibdoc(doctype, docname) bibdoc.set_status(restriction) for (url, docformat, description, comment, flags, timestamp) in urls: assert(_add_new_format(bibdoc, url, docformat, docname, doctype, newname, description, comment, flags, timestamp)) except Exception, e: register_exception() write_message("('%s', '%s', '%s') not appended because: '%s'." % (doctype, newname, urls, e), stream=sys.stderr) raise if not pretend and doctype not in ('PURGE', 'DELETE', 'EXPUNGE'): _process_document_moreinfos(more_infos, newname, version, urls and urls[0][1], mode) # resolving temporary version and identifier if bibdoc_tmpid: if bibdoc_tmpid in tmp_ids and tmp_ids[bibdoc_tmpid] != -1: write_message("WARNING: the temporary identifier %s has been declared more than once. Ignoring the second occurance" % (bibdoc_tmpid, ), stream=sys.stderr) else: tmp_ids[bibdoc_tmpid] = bibrecdocs.get_docid(docname) if bibdoc_tmpver: if bibdoc_tmpver in tmp_vers and tmp_vers[bibdoc_tmpver] != -1: write_message("WARNING: the temporary version identifier %s has been declared more than once. Ignoring the second occurance" % (bibdoc_tmpver, ), stream=sys.stderr) else: if version == None: if version: tmp_vers[bibdoc_tmpver] = version else: tmp_vers[bibdoc_tmpver] = bibrecdocs.get_bibdoc(docname).get_latest_version() else: tmp_vers[bibdoc_tmpver] = version return record ### Update functions def update_bibrec_date(record_modification_date, bibrec_id, insert_mode_p, record_creation_date=None, pretend=False): """ Update the date of the record in bibrec table. Note: record_creation_date is mandatory if insert_mode_p=True. 
""" if insert_mode_p: query = """UPDATE bibrec SET creation_date=%s, modification_date=%s WHERE id=%s""" params = (record_creation_date, record_modification_date, bibrec_id) else: query = """UPDATE bibrec SET modification_date=%s WHERE id=%s""" params = (record_modification_date, bibrec_id) if not pretend: run_sql(query, params) write_message(" -Update record creation/modification date: DONE" , verbose=2) def update_bibfmt_format(id_bibrec, format_value, format_name, modification_date=None, pretend=False): """Update the format in the table bibfmt""" if modification_date is None: modification_date = time.strftime('%Y-%m-%d %H:%M:%S') else: try: time.strptime(modification_date, "%Y-%m-%d %H:%M:%S") except ValueError: modification_date = '1970-01-01 00:00:00' # We check if the format is already in bibFmt nb_found = find_record_format(id_bibrec, format_name) if nb_found == 1: # we are going to update the format # compress the format_value value pickled_format_value = compress(format_value) # update the format: - query = """UPDATE LOW_PRIORITY bibfmt SET last_updated=%s, value=%s WHERE id_bibrec=%s AND format=%s""" + query = """UPDATE LOW_PRIORITY bibfmt SET last_updated=%s, value=_binary %s WHERE id_bibrec=%s AND format=%s""" params = (modification_date, pickled_format_value, id_bibrec, format_name) if not pretend: row_id = run_sql(query, params) if not pretend and row_id is None: write_message(" ERROR: during update_bibfmt_format function", verbose=1, stream=sys.stderr) return 1 else: write_message(" -Update the format %s in bibfmt: DONE" % format_name , verbose=2) return 0 elif nb_found > 1: write_message(" Failed: Same format %s found several time in bibfmt for the same record." % format_name, verbose=1, stream=sys.stderr) return 1 else: # Insert the format information in BibFMT res = insert_bibfmt(id_bibrec, format_value, format_name, modification_date, pretend=pretend) if res is None: write_message(" ERROR: during insert_bibfmt", verbose=1, stream=sys.stderr) return 1 else: write_message(" -Insert the format %s in bibfmt: DONE" % format_name , verbose=2) return 0 def delete_bibfmt_format(id_bibrec, format_name, pretend=False): """ Delete format FORMAT_NAME from bibfmt table fo record ID_BIBREC. """ if not pretend: run_sql("DELETE LOW_PRIORITY FROM bibfmt WHERE id_bibrec=%s and format=%s", (id_bibrec, format_name)) return 0 def archive_marcxml_for_history(recID, affected_fields, pretend=False): """ Archive current MARCXML format of record RECID from BIBFMT table into hstRECORD table. Useful to keep MARCXML history of records. Return 0 if everything went fine. Return 1 otherwise. 
""" res = run_sql("SELECT id_bibrec, value, last_updated FROM bibfmt WHERE format='xm' AND id_bibrec=%s", (recID,)) db_affected_fields = "" if affected_fields: tmp_affected_fields = {} for field in affected_fields: if field.isdigit(): #hack for tags from RevisionVerifier for ind in affected_fields[field]: tmp_affected_fields[(field + ind[0] + ind[1] + "%").replace(" ", "_")] = 1 else: pass #future implementation for fields tmp_affected_fields = tmp_affected_fields.keys() tmp_affected_fields.sort() db_affected_fields = ",".join(tmp_affected_fields) if res and not pretend: run_sql("""INSERT INTO hstRECORD (id_bibrec, marcxml, job_id, job_name, job_person, job_date, job_details, affected_fields) - VALUES (%s,%s,%s,%s,%s,%s,%s,%s)""", + VALUES (%s,_binary %s,%s,%s,%s,%s,_binary %s,%s)""", (res[0][0], res[0][1], task_get_task_param('task_id', 0), 'bibupload', task_get_task_param('user', 'UNKNOWN'), res[0][2], 'mode: ' + task_get_option('mode', 'UNKNOWN') + '; file: ' + task_get_option('file_path', 'UNKNOWN') + '.', db_affected_fields)) return 0 def update_database_with_metadata(record, rec_id, oai_rec_id="oai", affected_tags=None, pretend=False): """Update the database tables with the record and the record id given in parameter""" # extract only those tags that have been affected. # check happens at subfield level. This is to prevent overhead # associated with inserting already existing field with given ind pair write_message("update_database_with_metadata: record=%s, rec_id=%s, oai_rec_id=%s, affected_tags=%s" % (record, rec_id, oai_rec_id, affected_tags), verbose=9) tmp_record = {} if affected_tags: for tag in record.keys(): if tag in affected_tags.keys(): write_message(" -Tag %s found to be modified.Setting up for update" % tag, verbose=9) # initialize new list to hold affected field new_data_tuple_list = [] for data_tuple in record[tag]: ind1 = data_tuple[1] ind2 = data_tuple[2] if (ind1, ind2) in affected_tags[tag]: write_message(" -Indicator pair (%s, %s) added to update list" % (ind1, ind2), verbose=9) new_data_tuple_list.append(data_tuple) tmp_record[tag] = new_data_tuple_list write_message(lambda: " -Modified fields: \n%s" % record_xml_output(tmp_record), verbose=2) else: tmp_record = record for tag in tmp_record.keys(): # check if tag is not a special one: if tag not in CFG_BIBUPLOAD_SPECIAL_TAGS: # for each tag there is a list of tuples representing datafields tuple_list = tmp_record[tag] # this list should contain the elements of a full tag [tag, ind1, ind2, subfield_code] tag_list = [] tag_list.append(tag) for single_tuple in tuple_list: # these are the contents of a single tuple subfield_list = single_tuple[0] ind1 = single_tuple[1] ind2 = single_tuple[2] # append the ind's to the full tag if ind1 == '' or ind1 == ' ': tag_list.append('_') else: tag_list.append(ind1) if ind2 == '' or ind2 == ' ': tag_list.append('_') else: tag_list.append(ind2) datafield_number = single_tuple[4] if tag in CFG_BIBUPLOAD_SPECIAL_TAGS: # nothing to do for special tags (FFT, BDR, BDM) pass elif tag in CFG_BIBUPLOAD_CONTROLFIELD_TAGS and tag != "001": value = single_tuple[3] # get the full tag full_tag = ''.join(tag_list) # update the tables write_message(" insertion of the tag "+full_tag+" with the value "+value, verbose=9) # insert the tag and value into into bibxxx (table_name, bibxxx_row_id) = insert_record_bibxxx(full_tag, value, pretend=pretend) #print 'tname, bibrow', table_name, bibxxx_row_id; if table_name is None or bibxxx_row_id is None: write_message(" Failed: during insert_record_bibxxx", 
verbose=1, stream=sys.stderr) # connect bibxxx and bibrec with the table bibrec_bibxxx res = insert_record_bibrec_bibxxx(table_name, bibxxx_row_id, datafield_number, rec_id, pretend=pretend) if res is None: write_message(" Failed: during insert_record_bibrec_bibxxx", verbose=1, stream=sys.stderr) else: # get the tag and value from the content of each subfield for subfield in subfield_list: subtag = subfield[0] value = subfield[1] tag_list.append(subtag) # get the full tag full_tag = ''.join(tag_list) # update the tables write_message(" insertion of the tag "+full_tag+" with the value "+value, verbose=9) # insert the tag and value into into bibxxx (table_name, bibxxx_row_id) = insert_record_bibxxx(full_tag, value, pretend=pretend) if table_name is None or bibxxx_row_id is None: write_message(" Failed: during insert_record_bibxxx", verbose=1, stream=sys.stderr) # connect bibxxx and bibrec with the table bibrec_bibxxx res = insert_record_bibrec_bibxxx(table_name, bibxxx_row_id, datafield_number, rec_id, pretend=pretend) if res is None: write_message(" Failed: during insert_record_bibrec_bibxxx", verbose=1, stream=sys.stderr) # remove the subtag from the list tag_list.pop() tag_list.pop() tag_list.pop() tag_list.pop() write_message(" -Update the database with metadata: DONE", verbose=2) log_record_uploading(oai_rec_id, task_get_task_param('task_id', 0), rec_id, 'P', pretend=pretend) def append_new_tag_to_old_record(record, rec_old): """Append new tags to a old record""" def _append_tag(tag): if tag in CFG_BIBUPLOAD_CONTROLFIELD_TAGS: if tag == '001': pass else: # if it is a controlfield, just access the value for single_tuple in record[tag]: controlfield_value = single_tuple[3] # add the field to the old record newfield_number = record_add_field(rec_old, tag, controlfield_value=controlfield_value) if newfield_number is None: write_message(" ERROR: when adding the field"+tag, verbose=1, stream=sys.stderr) else: # For each tag there is a list of tuples representing datafields for single_tuple in record[tag]: # We retrieve the information of the tag subfield_list = single_tuple[0] ind1 = single_tuple[1] ind2 = single_tuple[2] if '%s%s%s' % (tag, ind1 == ' ' and '_' or ind1, ind2 == ' ' and '_' or ind2) in (CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:5], CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[:5]): ## We don't want to append the external identifier ## if it is already existing. if record_find_field(rec_old, tag, single_tuple)[0] is not None: write_message(" Not adding tag: %s ind1=%s ind2=%s subfields=%s: it's already there" % (tag, ind1, ind2, subfield_list), verbose=9) continue # We add the datafield to the old record write_message(" Adding tag: %s ind1=%s ind2=%s subfields=%s" % (tag, ind1, ind2, subfield_list), verbose=9) newfield_number = record_add_field(rec_old, tag, ind1, ind2, subfields=subfield_list) if newfield_number is None: write_message(" ERROR: when adding the field"+tag, verbose=1, stream=sys.stderr) # Go through each tag in the appended record for tag in record: _append_tag(tag) return rec_old def copy_strong_tags_from_old_record(record, rec_old): """ Look for strong tags in RECORD and REC_OLD. If no strong tags are found in RECORD, then copy them over from REC_OLD. This function modifies RECORD structure on the spot. 
""" for strong_tag in CFG_BIBUPLOAD_STRONG_TAGS: if not record_get_field_instances(record, strong_tag, strong_tag[3:4] or '%', strong_tag[4:5] or '%'): strong_tag_old_field_instances = record_get_field_instances(rec_old, strong_tag) if strong_tag_old_field_instances: for strong_tag_old_field_instance in strong_tag_old_field_instances: sf_vals, fi_ind1, fi_ind2, controlfield, dummy = strong_tag_old_field_instance record_add_field(record, strong_tag, fi_ind1, fi_ind2, controlfield, sf_vals) return ### Delete functions def delete_tags(record, rec_old): """ Returns a record structure with all the fields in rec_old minus the fields in record. @param record: The record containing tags to delete. @type record: record structure @param rec_old: The original record. @type rec_old: record structure @return: The modified record. @rtype: record structure """ returned_record = copy.deepcopy(rec_old) for tag, fields in record.iteritems(): if tag in ('001', ): continue for field in fields: local_position = record_find_field(returned_record, tag, field)[1] if local_position is not None: record_delete_field(returned_record, tag, field_position_local=local_position) return returned_record def delete_tags_to_correct(record, rec_old): """ Delete tags from REC_OLD which are also existing in RECORD. When deleting, pay attention not only to tags, but also to indicators, so that fields with the same tags but different indicators are not deleted. """ ## Some fields are controlled via provenance information. ## We should re-add saved fields at the end. fields_to_readd = {} for tag in CFG_BIBUPLOAD_CONTROLLED_PROVENANCE_TAGS: if tag[:3] in record: tmp_field_instances = record_get_field_instances(record, tag[:3], tag[3], tag[4]) ## Let's discover the provenance that will be updated provenances_to_update = [] for instance in tmp_field_instances: for code, value in instance[0]: if code == tag[5]: if value not in provenances_to_update: provenances_to_update.append(value) break else: ## The provenance is not specified. ## let's add the special empty provenance. if '' not in provenances_to_update: provenances_to_update.append('') potential_fields_to_readd = record_get_field_instances(rec_old, tag[:3], tag[3], tag[4]) ## Let's take all the field corresponding to tag ## Let's save apart all the fields that should be updated, but ## since they have a different provenance not mentioned in record ## they should be preserved. fields = [] for sf_vals, ind1, ind2, dummy_cf, dummy_line in potential_fields_to_readd: for code, value in sf_vals: if code == tag[5]: if value not in provenances_to_update: fields.append(sf_vals) break else: if '' not in provenances_to_update: ## Empty provenance, let's protect in any case fields.append(sf_vals) fields_to_readd[tag] = fields # browse through all the tags from the MARCXML file: for tag in record: # check if the tag exists in the old record too: if tag in rec_old and tag != '001': # the tag does exist, so delete all record's tag+ind1+ind2 combinations from rec_old for dummy_sf_vals, ind1, ind2, dummy_cf, dummyfield_number in record[tag]: write_message(" Delete tag: " + tag + " ind1=" + ind1 + " ind2=" + ind2, verbose=9) record_delete_field(rec_old, tag, ind1, ind2) ## Ok, we readd necessary fields! 
for tag, fields in fields_to_readd.iteritems(): for sf_vals in fields: write_message(" Adding tag: " + tag[:3] + " ind1=" + tag[3] + " ind2=" + tag[4] + " code=" + str(sf_vals), verbose=9) record_add_field(rec_old, tag[:3], tag[3], tag[4], subfields=sf_vals) def delete_bibrec_bibxxx(record, id_bibrec, affected_tags={}, pretend=False): """Delete the database record from the table bibxxx given in parameters""" # we clear all the rows from bibrec_bibxxx from the old record # clearing only those tags that have been modified. write_message(lambda: "delete_bibrec_bibxxx(record=%s, id_bibrec=%s, affected_tags=%s)" % (record, id_bibrec, affected_tags), verbose=9) for tag in affected_tags: # sanity check with record keys just to make sure its fine. if tag not in CFG_BIBUPLOAD_SPECIAL_TAGS: write_message("%s found in record"%tag, verbose=2) # for each name construct the bibrec_bibxxx table name table_name = 'bib'+tag[0:2]+'x' bibrec_table = 'bibrec_'+table_name # delete all the records with proper id_bibrec. Indicators matter for individual affected tags tmp_ind_1 = '' tmp_ind_2 = '' # construct exact tag value using indicators for ind_pair in affected_tags[tag]: if ind_pair[0] == ' ': tmp_ind_1 = '_' else: tmp_ind_1 = ind_pair[0] if ind_pair[1] == ' ': tmp_ind_2 = '_' else: tmp_ind_2 = ind_pair[1] # need to escape incase of underscore so that mysql treats it as a char tag_val = tag+"\\"+tmp_ind_1+"\\"+tmp_ind_2 + '%' query = """DELETE br.* FROM `%s` br,`%s` b where br.id_bibrec=%%s and br.id_bibxxx=b.id and b.tag like %%s""" % (bibrec_table, table_name) params = (id_bibrec, tag_val) write_message(query % params, verbose=9) if not pretend: run_sql(query, params) else: write_message("%s not found"%tag, verbose=2) def main(): """Main that construct all the bibtask.""" task_init(authorization_action='runbibupload', authorization_msg="BibUpload Task Submission", description="""Receive MARC XML file and update appropriate database tables according to options. Examples: $ bibupload -i input.xml """, help_specific_usage=""" -a, --append\t\tnew fields are appended to the existing record -c, --correct\t\tfields are replaced by the new ones in the existing record, except \t\t\twhen overridden by CFG_BIBUPLOAD_CONTROLLED_PROVENANCE_TAGS -i, --insert\t\tinsert the new record in the database -r, --replace\t\tthe existing record is entirely replaced by the new one, \t\t\texcept for fields in CFG_BIBUPLOAD_STRONG_TAGS -d, --delete\t\tspecified fields are deleted in existing record -n, --notimechange\tdo not change record last modification date when updating -o, --holdingpen\tInsert record into holding pen instead of the normal database --pretend\t\tdo not really insert/append/correct/replace the input file --force\t\twhen --replace, use provided 001 tag values, even if the matching \t\t\trecord does not exist (thus allocating it on-the-fly) --callback-url\tSend via a POST request a JSON-serialized answer (see admin guide), in \t\t\torder to provide a feedback to an external service about the outcome of the operation. --nonce\t\twhen used together with --callback add the nonce value in the JSON message. --special-treatment=MODE\tif "oracle" is specified, when used together with --callback_url, \t\t\tPOST an application/x-www-form-urlencoded request where the JSON message is encoded \t\t\tinside a form field called "results". 
""", version=__revision__, specific_params=("ircazdnoS:", [ "insert", "replace", "correct", "append", "reference", "delete", "notimechange", "holdingpen", "pretend", "force", "callback-url=", "nonce=", "special-treatment=", "stage=", ]), task_submit_elaborate_specific_parameter_fnc=task_submit_elaborate_specific_parameter, task_run_fnc=task_run_core, task_submit_check_options_fnc=task_submit_check_options) def task_submit_elaborate_specific_parameter(key, value, opts, args): # pylint: disable=W0613 """ Given the string key it checks it's meaning, eventually using the value. Usually it fills some key in the options dict. It must return True if it has elaborated the key, False, if it doesn't know that key. eg: if key in ['-n', '--number']: task_get_option(\1) = value return True return False """ # No time change option if key in ("-n", "--notimechange"): task_set_option('notimechange', 1) # Insert mode option elif key in ("-i", "--insert"): if task_get_option('mode') == 'replace': # if also replace found, then set to replace_or_insert task_set_option('mode', 'replace_or_insert') else: task_set_option('mode', 'insert') fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) # Replace mode option elif key in ("-r", "--replace"): if task_get_option('mode') == 'insert': # if also insert found, then set to replace_or_insert task_set_option('mode', 'replace_or_insert') else: task_set_option('mode', 'replace') fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) # Holding pen mode option elif key in ("-o", "--holdingpen"): write_message("Holding pen mode", verbose=3) task_set_option('mode', 'holdingpen') fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) # Correct mode option elif key in ("-c", "--correct"): task_set_option('mode', 'correct') fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) # Append mode option elif key in ("-a", "--append"): task_set_option('mode', 'append') fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) # Deprecated reference mode option (now correct) elif key in ("-z", "--reference"): task_set_option('mode', 'correct') fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) elif key in ("-d", "--delete"): task_set_option('mode', 'delete') fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) elif key in ("--pretend",): task_set_option('pretend', True) fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) elif key in ("--force",): task_set_option('force', True) fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) elif key in ("--callback-url", ): task_set_option('callback_url', value) elif key in ("--nonce", ): task_set_option('nonce', value) elif key in ("--special-treatment", ): if value.lower() in CFG_BIBUPLOAD_ALLOWED_SPECIAL_TREATMENTS: if value.lower() == 'oracle': task_set_option('oracle_friendly', True) else: print >> sys.stderr, """The specified value is not in the list of allowed special treatments codes: %s""" % CFG_BIBUPLOAD_ALLOWED_SPECIAL_TREATMENTS return False elif key in ("-S", "--stage"): print >> sys.stderr, """WARNING: the --stage parameter is deprecated and ignored.""" else: return False return True def task_submit_check_options(): """ Reimplement this method for having the possibility to check options before submitting the task, in order for example to provide default values. 
It must return False if there are errors in the options. """ if task_get_option('mode') is None: write_message("Please specify at least one update/insert mode!", stream=sys.stderr) return False file_path = task_get_option('file_path') if file_path is None: write_message("Missing filename! -h for help.", stream=sys.stderr) return False try: open(file_path).read().decode('utf-8') except IOError: write_message("""File is not accessible: %s""" % file_path, stream=sys.stderr) return False except UnicodeDecodeError: write_message("""File encoding is not valid utf-8: %s""" % file_path, stream=sys.stderr) return False return True def writing_rights_p(): """Return True in case bibupload has the proper rights to write in the fulltext file folder.""" if _WRITING_RIGHTS is not None: return _WRITING_RIGHTS try: if not os.path.exists(CFG_BIBDOCFILE_FILEDIR): os.makedirs(CFG_BIBDOCFILE_FILEDIR) fd, filename = tempfile.mkstemp(suffix='.txt', prefix='test', dir=CFG_BIBDOCFILE_FILEDIR) test = os.fdopen(fd, 'w') test.write('TEST') test.close() if open(filename).read() != 'TEST': raise IOError("Can not successfully write and readback %s" % filename) os.remove(filename) except: register_exception(alert_admin=True) return False return True def post_results_to_callback_url(results, callback_url): write_message("Sending feedback to %s" % callback_url) if not CFG_JSON_AVAILABLE: from warnings import warn warn("--callback-url used but simplejson/json not available") return json_results = json.dumps(results) write_message("Message to send: %s" % json_results, verbose=9) ## :///?# scheme, dummynetloc, dummypath, dummyquery, dummyfragment = urlparse.urlsplit(callback_url) ## See: http://stackoverflow.com/questions/111945/is-there-any-way-to-do-http-put-in-python if scheme == 'http': opener = urllib2.build_opener(urllib2.HTTPHandler) elif scheme == 'https': opener = urllib2.build_opener(urllib2.HTTPSHandler) else: raise ValueError("Scheme not handled %s for callback_url %s" % (scheme, callback_url)) if task_get_option('oracle_friendly'): write_message("Oracle friendly mode requested", verbose=9) request = urllib2.Request(callback_url, data=urllib.urlencode({'results': json_results})) request.add_header('Content-Type', 'application/x-www-form-urlencoded') else: request = urllib2.Request(callback_url, data=json_results) request.add_header('Content-Type', 'application/json') request.add_header('User-Agent', make_user_agent_string('BibUpload')) write_message("Headers about to be sent: %s" % request.headers, verbose=9) write_message("Data about to be sent: %s" % request.data, verbose=9) res = opener.open(request) msg = res.read() write_message("Result of posting the feedback: %s %s" % (res.code, res.msg), verbose=9) write_message("Returned message is: %s" % msg, verbose=9) return res def bibupload_records(records, opt_mode=None, opt_notimechange=0, pretend=False, callback_url=None, results_for_callback=None): """perform the task of uploading a set of records returns list of (error_code, recid) tuples for separate records """ #Dictionaries maintaining temporary identifiers # Structure: identifier -> number tmp_ids = {} tmp_vers = {} results = [] # The first phase -> assigning meaning to temporary identifiers if opt_mode == 'reference': ## NOTE: reference mode has been deprecated in favour of 'correct' opt_mode = 'correct' record = None for record in records: record_id = record_extract_oai_id(record) task_sleep_now_if_required(can_stop_too=True) if opt_mode == "holdingpen": #inserting into the holding pen 
write_message("Inserting into holding pen", verbose=3) insert_record_into_holding_pen(record, record_id, pretend=pretend) else: write_message("Inserting into main database", verbose=3) error = bibupload( record, opt_mode = opt_mode, opt_notimechange = opt_notimechange, oai_rec_id = record_id, pretend = pretend, tmp_ids = tmp_ids, tmp_vers = tmp_vers) results.append(error) if error[0] == 1: if record: write_message(lambda: record_xml_output(record), stream=sys.stderr) else: write_message("Record could not have been parsed", stream=sys.stderr) stat['nb_errors'] += 1 if callback_url: results_for_callback['results'].append({'recid': error[1], 'success': False, 'error_message': error[2]}) elif error[0] == 2: if record: write_message(lambda: record_xml_output(record), stream=sys.stderr) else: write_message("Record could not have been parsed", stream=sys.stderr) if callback_url: results_for_callback['results'].append({'recid': error[1], 'success': False, 'error_message': error[2]}) elif error[0] == 0: if callback_url: from invenio.search_engine import print_record results_for_callback['results'].append({'recid': error[1], 'success': True, "marcxml": print_record(error[1], 'xm'), 'url': "%s/%s/%s" % (CFG_SITE_URL, CFG_SITE_RECORD, error[1])}) else: if callback_url: results_for_callback['results'].append({'recid': error[1], 'success': False, 'error_message': error[2]}) # stat us a global variable task_update_progress("Done %d out of %d." % \ (stat['nb_records_inserted'] + \ stat['nb_records_updated'], stat['nb_records_to_upload'])) # Second phase -> Now we can process all entries where temporary identifiers might appear (BDR, BDM) write_message("Identifiers table after processing: %s versions: %s" % (str(tmp_ids), str(tmp_vers)), verbose=2) write_message("Uploading BDR and BDM fields") if opt_mode != "holdingpen": for record in records: record_id = retrieve_rec_id(record, opt_mode, pretend=pretend, post_phase = True) bibupload_post_phase(record, rec_id = record_id, mode = opt_mode, pretend = pretend, tmp_ids = tmp_ids, tmp_vers = tmp_vers) return results def task_run_core(): """ Reimplement to add the body of the task.""" write_message("Input file '%s', input mode '%s'." 
% (task_get_option('file_path'), task_get_option('mode'))) write_message("STAGE 0:", verbose=2) if task_get_option('file_path') is not None: write_message("start preocessing", verbose=3) task_update_progress("Reading XML input") recs = xml_marc_to_records(open_marc_file(task_get_option('file_path'))) stat['nb_records_to_upload'] = len(recs) write_message(" -Open XML marc: DONE", verbose=2) task_sleep_now_if_required(can_stop_too=True) write_message("Entering records loop", verbose=3) callback_url = task_get_option('callback_url') results_for_callback = {'results': []} if recs is not None: # We proceed each record by record bibupload_records(records=recs, opt_mode=task_get_option('mode'), opt_notimechange=task_get_option('notimechange'), pretend=task_get_option('pretend'), callback_url=callback_url, results_for_callback=results_for_callback) else: write_message(" ERROR: bibupload failed: No record found", verbose=1, stream=sys.stderr) callback_url = task_get_option("callback_url") if callback_url: nonce = task_get_option("nonce") if nonce: results_for_callback["nonce"] = nonce post_results_to_callback_url(results_for_callback, callback_url) if task_get_task_param('verbose') >= 1: # Print out the statistics print_out_bibupload_statistics() # Check if they were errors return not stat['nb_errors'] >= 1 def log_record_uploading(oai_rec_id, task_id, bibrec_id, insertion_db, pretend=False): if oai_rec_id != "" and oai_rec_id != None: query = """UPDATE oaiHARVESTLOG SET date_inserted=NOW(), inserted_to_db=%s, id_bibrec=%s WHERE oai_id = %s AND bibupload_task_id = %s ORDER BY date_harvested LIMIT 1""" if not pretend: run_sql(query, (str(insertion_db), str(bibrec_id), str(oai_rec_id), str(task_id), )) if __name__ == "__main__": main() diff --git a/modules/bibupload/lib/bibupload_regression_tests.py b/modules/bibupload/lib/bibupload_regression_tests.py index c8af4285b..fba667b68 100644 --- a/modules/bibupload/lib/bibupload_regression_tests.py +++ b/modules/bibupload/lib/bibupload_regression_tests.py @@ -1,5744 +1,5744 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 CERN. +# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
# pylint: disable=C0301 """Regression tests for the BibUpload.""" __revision__ = "$Id$" import re from invenio.testutils import InvenioTestCase import os import time import sys import zlib from marshal import loads from zlib import decompress from urllib import urlencode from urllib2 import urlopen import pprint if sys.hexversion < 0x2060000: from md5 import md5 else: from hashlib import md5 # pylint: disable=E0611 from invenio.config import CFG_OAI_ID_FIELD, CFG_PREFIX, CFG_SITE_URL, CFG_TMPDIR, \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG, \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG, \ CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG, \ CFG_BINDIR, \ CFG_SITE_RECORD, \ CFG_DEVEL_SITE, \ CFG_BIBUPLOAD_REFERENCE_TAG, \ CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE from invenio.access_control_config import CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS from invenio import bibupload from invenio.search_engine import print_record, get_record from invenio.jsonutils import json from invenio.dbquery import run_sql, get_table_status_info from invenio.dateutils import convert_datestruct_to_datetext from invenio.testutils import make_test_suite, run_test_suite, test_web_page_content from invenio.textutils import encode_for_xml from invenio.bibtask import task_set_task_param, setup_loggers, task_set_option, task_low_level_submission from invenio.bibrecord import record_has_field,record_get_field_value, records_identical, create_record from invenio.shellutils import run_shell_command from invenio.bibdocfile import BibRecDocs, BibRelation, MoreInfo import base64 import cPickle # helper functions: RE_005 = re.compile(re.escape('tag="005"')) def get_record_from_bibxxx(recid): """Return a recstruct built from bibxxx tables""" record = "" record += """ %s\n""" % recid # controlfields query = "SELECT b.tag,b.value,bb.field_number FROM bib00x AS b, bibrec_bib00x AS bb "\ "WHERE bb.id_bibrec=%s AND b.id=bb.id_bibxxx AND b.tag LIKE '00%%' "\ "ORDER BY bb.field_number, b.tag ASC" res = run_sql(query, (recid, )) for row in res: field, value = row[0], row[1] value = encode_for_xml(value) record += """ %s\n""" % \ (encode_for_xml(field[0:3]), value) # datafields i = 1 # Do not process bib00x and bibrec_bib00x, as # they are controlfields. 
So start at bib01x and # bibrec_bib00x (and set i = 0 at the end of # first loop) for digit1 in range(0, 10): for digit2 in range(i, 10): bx = "bib%d%dx" % (digit1, digit2) bibx = "bibrec_bib%d%dx" % (digit1, digit2) query = "SELECT b.tag,b.value,bb.field_number FROM %s AS b, %s AS bb "\ "WHERE bb.id_bibrec=%%s AND b.id=bb.id_bibxxx AND b.tag LIKE %%s"\ "ORDER BY bb.field_number, b.tag ASC" % (bx, bibx) res = run_sql(query, (recid, str(digit1)+str(digit2)+'%')) field_number_old = -999 field_old = "" for row in res: field, value, field_number = row[0], row[1], row[2] ind1, ind2 = field[3], field[4] if ind1 == "_" or ind1 == "": ind1 = " " if ind2 == "_" or ind2 == "": ind2 = " " if field_number != field_number_old or field[:-1] != field_old[:-1]: if field_number_old != -999: record += """ \n""" record += """ \n""" % \ (encode_for_xml(field[0:3]), encode_for_xml(ind1), encode_for_xml(ind2)) field_number_old = field_number field_old = field # print subfield value value = encode_for_xml(value) record += """ %s\n""" % \ (encode_for_xml(field[-1:]), value) # all fields/subfields printed in this run, so close the tag: if field_number_old != -999: record += """ \n""" i = 0 # Next loop should start looking at bib%0 and bibrec_bib00x # we are at the end of printing the record: record += " \n" return record def remove_tag_001_from_xmbuffer(xmbuffer): """Remove tag 001 from MARCXML buffer. Useful for testing two MARCXML buffers without paying attention to recIDs attributed during the bibupload. """ return re.sub(r'.*', '', xmbuffer) def compare_xmbuffers(xmbuffer1, xmbuffer2): """Compare two XM (XML MARC) buffers by removing whitespaces and version numbers in tags 005 before testing. """ def remove_blanks_from_xmbuffer(xmbuffer): """Remove \n and blanks from XMBUFFER.""" out = xmbuffer.replace("\n", "") out = out.replace(" ", "") return out # remove 005 revision numbers: xmbuffer1 = re.sub(r'.*?', '', xmbuffer1) xmbuffer2 = re.sub(r'.*?', '', xmbuffer2) # remove whitespace: xmbuffer1 = remove_blanks_from_xmbuffer(xmbuffer1) xmbuffer2 = remove_blanks_from_xmbuffer(xmbuffer2) if len(RE_005.findall(xmbuffer1)) > 1: return "More than 1 005 tag has been found in the first XM: %s" % xmbuffer1 if len(RE_005.findall(xmbuffer2)) > 1: return "More than 1 005 tag has been found in the second XM: %s" % xmbuffer2 if xmbuffer1 != xmbuffer2: return "\n=" + xmbuffer1 + "=\n" + '!=' + "\n=" + xmbuffer2 + "=\n" return '' def remove_tag_001_from_hmbuffer(hmbuffer): """Remove tag 001 from HTML MARC buffer. Useful for testing two HTML MARC buffers without paying attention to recIDs attributed during the bibupload. """ return re.sub(r'(^|\n)(
<pre>)?[0-9]{9}\s001__\s\d+($|\n)', '', hmbuffer)
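# The two helpers above (together with remove_tag_001_from_xmbuffer further
# up) let the tests compare an uploaded record against a fixture that was
# written without knowing which recid bibupload would assign.  A minimal
# self-contained sketch of the same idea (illustrative only; the name
# _example_strip_controlfield is not part of the test suite):
def _example_strip_controlfield(xmbuffer, tag='001'):
    """Drop one MARCXML controlfield (e.g. 001 or 005) before comparing."""
    return re.sub(r'<controlfield tag="%s">.*?</controlfield>' % tag, '', xmbuffer)
#
# >>> _example_strip_controlfield('<record><controlfield tag="001">42</controlfield></record>')
# '<record></record>'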
 
 def compare_hmbuffers(hmbuffer1, hmbuffer2):
     """Compare two HM (HTML MARC) buffers by removing whitespaces
        before testing.
     """
 
     hmbuffer1 = hmbuffer1.strip()
     hmbuffer2 = hmbuffer2.strip()
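    # What follows normalises both buffers before comparison: strip any
    # leading/trailing pre-formatting wrapper, drop the 005 (modification
    # timestamp) lines, drop the leading nine-digit record id so that only
    # field values remain, and trim leading whitespace.  The function returns
    # '' when the normalised buffers match, and a "=<buffer1>= != =<buffer2>="
    # style string otherwise.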
 
    # remove eventual <pre>...</pre> formatting:
    hmbuffer1 = re.sub(r'^<pre>', '', hmbuffer1)
    hmbuffer2 = re.sub(r'^<pre>', '', hmbuffer2)
    hmbuffer1 = re.sub(r'</pre>$', '', hmbuffer1)
    hmbuffer2 = re.sub(r'</pre>
$', '', hmbuffer2) # remove 005 revision numbers: hmbuffer1 = re.sub(r'(^|\n)[0-9]{9}\s005.*($|\n)', '\n', hmbuffer1) hmbuffer2 = re.sub(r'(^|\n)[0-9]{9}\s005.*($|\n)', '\n', hmbuffer2) hmbuffer1 = hmbuffer1.strip() hmbuffer2 = hmbuffer2.strip() # remove leading recid, leaving only field values: hmbuffer1 = re.sub(r'(^|\n)[0-9]{9}\s', '', hmbuffer1) hmbuffer2 = re.sub(r'(^|\n)[0-9]{9}\s', '', hmbuffer2) # remove leading whitespace: hmbuffer1 = re.sub(r'(^|\n)\s+', '', hmbuffer1) hmbuffer2 = re.sub(r'(^|\n)\s+', '', hmbuffer2) compared_hmbuffers = hmbuffer1 == hmbuffer2 if not compared_hmbuffers: return "\n=" + hmbuffer1 + "=\n" + '!=' + "\n=" + hmbuffer2 + "=\n" return '' def wipe_out_record_from_all_tables(recid): """ Wipe out completely the record and all its traces of RECID from the database (bibrec, bibrec_bibxxx, bibxxx, bibfmt). Useful for the time being for test cases. """ # delete all the linked bibdocs try: for bibdoc in BibRecDocs(recid).list_bibdocs(): bibdoc.expunge() # delete from bibrec: run_sql("DELETE FROM bibrec WHERE id=%s", (recid,)) # delete from bibrec_bibxxx: for i in range(0, 10): for j in range(0, 10): run_sql("DELETE FROM %(bibrec_bibxxx)s WHERE id_bibrec=%%s" % # kwalitee: disable=sql {'bibrec_bibxxx': "bibrec_bib%i%ix" % (i, j)}, (recid,)) # delete all unused bibxxx values: for i in range(0, 10): for j in range(0, 10): run_sql("DELETE %(bibxxx)s FROM %(bibxxx)s " \ " LEFT JOIN %(bibrec_bibxxx)s " \ " ON %(bibxxx)s.id=%(bibrec_bibxxx)s.id_bibxxx " \ " WHERE %(bibrec_bibxxx)s.id_bibrec IS NULL" % \ {'bibxxx': "bib%i%ix" % (i, j), 'bibrec_bibxxx': "bibrec_bib%i%ix" % (i, j)}) # delete from bibfmt: run_sql("DELETE FROM bibfmt WHERE id_bibrec=%s", (recid,)) # delete from bibrec_bibdoc: run_sql("DELETE FROM bibrec_bibdoc WHERE id_bibrec=%s", (recid,)) # delete from holdingpen run_sql("DELETE FROM bibHOLDINGPEN WHERE id_bibrec=%s", (recid,)) # delete from hstRECORD run_sql("DELETE FROM hstRECORD WHERE id_bibrec=%s", (recid,)) except Exception, err: print >> sys.stderr, "Exception captured while wiping records: %s" % err def try_url_download(url): """Try to download a given URL""" try: open_url = urlopen(url) open_url.read() except Exception, e: raise StandardError("Downloading %s is impossible because of %s" % (url, str(e))) return True class GenericBibUploadTest(InvenioTestCase): """Generic BibUpload testing class with predefined setUp and tearDown methods. 
""" def setUp(self): self.verbose = 0 setup_loggers() task_set_task_param('verbose', self.verbose) self.last_recid = run_sql("SELECT MAX(id) FROM bibrec")[0][0] self.tear_down = True ## For debugging, whether to call tearDown self.webcolled_recids = [] ## List of record webcolled to be re-webcolled upon tearDown def tearDown(self): if self.tear_down: for recid in run_sql("SELECT id FROM bibrec WHERE id>%s", (self.last_recid,)): wipe_out_record_from_all_tables(recid[0]) for recid in list(self.webcolled_recids): self.force_webcoll(recid) def force_webcoll(self, recid): self.webcolled_recids.append(recid) from invenio.bibindex_engine_config import CFG_BIBINDEX_INDEX_TABLE_TYPE from invenio import bibindex_engine from invenio import websearch_webcoll ## Reset the collection global cache websearch_webcoll.COLLECTION_HOUSE = {} bibindex_engine.WordTable("collection", table_type=CFG_BIBINDEX_INDEX_TABLE_TYPE["Words"] ).add_recIDs([[recid, recid]], 1) #sleep 1s to make sure all tables are ready time.sleep(1) c = websearch_webcoll.Collection() c.calculate_reclist() c.update_reclist() def check_record_consistency(self, recid): rec_in_history = create_record(decompress(run_sql("SELECT marcxml FROM hstRECORD WHERE id_bibrec=%s ORDER BY job_date DESC LIMIT 1", (recid, ))[0][0]))[0] rec_in_xm = create_record(decompress(run_sql("SELECT value FROM bibfmt WHERE id_bibrec=%s AND format='xm'", (recid, ))[0][0]))[0] rec_in_bibxxx = create_record(get_record_from_bibxxx(recid))[0] self.failUnless(records_identical(rec_in_xm, rec_in_history, skip_005=False), "\n%s\n!=\n%s\n" % (rec_in_xm, rec_in_history)) self.failUnless(records_identical(rec_in_xm, rec_in_bibxxx, skip_005=False, ignore_duplicate_subfields=True, ignore_duplicate_controlfields=True), "\n%s\n!=\n%s\n" % (rec_in_xm, rec_in_bibxxx)) if CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE: rec_in_recstruct = loads(decompress(run_sql("SELECT value FROM bibfmt WHERE id_bibrec=%s AND format='recstruct'", (recid, ))[0][0])) self.failUnless(records_identical(rec_in_xm, rec_in_recstruct, skip_005=False, ignore_subfield_order=True), "\n%s\n!=\n%s\n" % (rec_in_xm, rec_in_recstruct)) class BibUploadRealCaseRemovalDOIViaBibEdit(GenericBibUploadTest): def test_removal_of_doi_via_bibedit(self): test = """ HEP Fiore, Gaetano On quantum mechanics with a magnetic field on R**n and on a torus T**n, and their relation Int.J.Theor.Phys. 52 877-896 2013 INSPIRE General Physics Published 20 2013 author Bloch theory with magnetic field author Fiber bundles author Gauge symmetry author Quantization on manifolds Springer We show in elementary terms the equivalence in a general gauge of a U(1)-gauge theory of a scalar charged particle on a torus to the analogous theory on ℝ( )n( ) constrained by quasiperiodicity under translations in the lattice Λ. The latter theory provides a global description of the former: the quasiperiodic wavefunctions ψ defined on ℝ( )n( ) play the role of sections of the associated hermitean line bundle E on , since also E admits a global description as a quotient. The components of the covariant derivatives corresponding to a constant (necessarily integral) magnetic field B=dA generate a Lie algebra g ( )Q( ) and together with the periodic functions the algebra of observables . The non-abelian part of g ( )Q( ) is a Heisenberg Lie algebra with the electric charge operator Q as the central generator, the corresponding Lie group G ( )Q( ) acts on the Hilbert space as the translation group up to phase factors. 
Also the space of sections of E is mapped into itself by g∈G ( )Q( ). We identify the socalled magnetic translation group as a subgroup of the observables’ group Y ( )Q( ). We determine the unitary irreducible representations of corresponding to integer charges and for each of them an associated orthonormal basis explicitly in configuration space. We also clarify how in the n=2m case a holomorphic structure and Theta functions arise on the associated complex torus. DOI 10.1007/s10773-012-1396-z Fiore:2013nua INSPIRETeX Published Citeable """ recs = create_record(test) _, recid, _ = bibupload.bibupload(recs[0], opt_mode='insert') self.check_record_consistency(recid) new_rec = get_record(recid) del new_rec['024'] ## let's delete DOI _, recid2, _ = bibupload.bibupload(new_rec, opt_mode='replace') self.assertEqual(recid, recid2) self.check_record_consistency(recid2) class BibUploadTypicalBibEditSessionTest(GenericBibUploadTest): """Testing a typical BibEdit session""" def setUp(self): GenericBibUploadTest.setUp(self) self.test = """ SzGeCERN Test, Jane Test Institute Test, John Test University Cool Test, Jim Test Laboratory """ recs = bibupload.xml_marc_to_records(self.test) # We call the main function with the record as a parameter _, self.recid, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(self.recid) # We retrieve the inserted xml inserted_xm = print_record(self.recid, 'xm') # Compare if the two MARCXML are the same self.assertEqual(compare_xmbuffers(remove_tag_001_from_xmbuffer(inserted_xm), self.test), '') self.history = run_sql("SELECT * FROM hstRECORD WHERE id_bibrec=%s", (self.recid, )) # kwalitee: disable=sql self.timestamp = run_sql("SELECT modification_date FROM bibrec WHERE id=%s", (self.recid,)) self.tag005 = get_record(self.recid)['005'][0][3] def test_simple_replace(self): """BibUpload - test a simple replace as in BibEdit""" marc_to_replace1 = """ %(recid)s %(tag005)s SzGeCERN Test, Foo Test Institute Test, John Test University Cool Test, Jim Test Laboratory bla bla bla """ % {'recid': self.recid, 'tag005': self.tag005} recs = bibupload.xml_marc_to_records(marc_to_replace1) # We call the main function with the record as a parameter _, self.recid, _ = bibupload.bibupload_records(recs, opt_mode='replace')[0] self.check_record_consistency(self.recid) ## The change should have been applied! 
self.failUnless(records_identical(recs[0], get_record(self.recid)), "\n%s\n!=\n%s\n" % (recs[0], get_record(self.recid))) marc_to_replace2 = """ %(recid)s %(tag005)s SzGeCERN Test, Jane Test Institute Test, John Test University Cool Test, Jim Test Laboratory Queen Elisabeth Great Britain """ % {'recid': self.recid, 'tag005': self.tag005} expected_marc = """ %(recid)s %(tag005)s SzGeCERN Test, Foo Test Institute Test, John Test University Cool Test, Jim Test Laboratory bla bla bla Queen Elisabeth Great Britain """ % {'recid': self.recid, 'tag005': self.tag005} recs = bibupload.xml_marc_to_records(marc_to_replace2) # We call the main function with the record as a parameter _, self.recid, _ = bibupload.bibupload_records(recs, opt_mode='replace')[0] self.check_record_consistency(self.recid) ## The change should have been merged with the previous without conflict self.failUnless(records_identical(bibupload.xml_marc_to_records(expected_marc)[0], get_record(self.recid))) def test_replace_with_conflict(self): """BibUpload - test a replace as in BibEdit that leads to conflicts""" marc_to_replace1 = """ %(recid)s %(tag005)s SzGeCERN Test, Foo Test Institute2 Test, John Test University Cool Test, Jim Test Laboratory bla bla bla """ % {'recid': self.recid, 'tag005': self.tag005} recs = bibupload.xml_marc_to_records(marc_to_replace1) # We call the main function with the record as a parameter _, self.recid, _ = bibupload.bibupload_records(recs, opt_mode='replace')[0] self.check_record_consistency(self.recid) ## The change should have been applied! self.failUnless(records_identical(recs[0], get_record(self.recid)), "\n%s\n!=\n%s" % (recs[0], get_record(self.recid))) marc_to_replace2 = """ %(recid)s %(tag005)s SzGeCERN Queen Elisabeth Great Britain Test, John Test University No more Cool Test, Jim Test Laboratory bla bla bla """ % {'recid': self.recid, 'tag005': self.tag005} recs = bibupload.xml_marc_to_records(marc_to_replace2) # We call the main function with the record as a parameter _, self.recid, _ = bibupload.bibupload_records(recs, opt_mode='replace')[0] self.check_record_consistency(self.recid) ## The change should have been merged with the previous without conflict self.failUnless(records_identical(bibupload.xml_marc_to_records(marc_to_replace1)[0], get_record(self.recid)), "%s != %s" % (bibupload.xml_marc_to_records(marc_to_replace1)[0], get_record(self.recid))) self.failUnless(records_identical(bibupload.xml_marc_to_records(marc_to_replace2)[0], bibupload.xml_marc_to_records(zlib.decompress(run_sql("SELECT changeset_xml FROM bibHOLDINGPEN WHERE id_bibrec=%s", (self.recid,))[0][0]))[0])) class BibUploadNoUselessHistoryTest(GenericBibUploadTest): """Testing generation of history only when necessary""" def setUp(self): GenericBibUploadTest.setUp(self) self.test = """ SzGeCERN Test, Jane Test Institute Test, John Test University Cool Test, Jim Test Laboratory """ recs = bibupload.xml_marc_to_records(self.test) # We call the main function with the record as a parameter _, self.recid, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(self.recid) # We retrieve the inserted xml inserted_xm = print_record(self.recid, 'xm') # Compare if the two MARCXML are the same self.assertEqual(compare_xmbuffers(remove_tag_001_from_xmbuffer(inserted_xm), self.test), '') self.history = run_sql("SELECT * FROM hstRECORD WHERE id_bibrec=%s", (self.recid, )) # kwalitee: disable=sql self.timestamp = run_sql("SELECT modification_date FROM bibrec WHERE id=%s", (self.recid,)) def 
test_replace_identical_record(self): """bibupload - replace with identical record does not touch history""" xml_to_upload = """ %s SzGeCERN Test, Jane Test Institute Test, John Test University Cool Test, Jim Test Laboratory """ % self.recid recs = bibupload.xml_marc_to_records(xml_to_upload) # We call the main function with the record as a parameter _, recid, _ = bibupload.bibupload_records(recs, opt_mode='replace')[0] self.check_record_consistency(recid) self.assertEqual(self.recid, recid) self.assertEqual(self.history, run_sql("SELECT * FROM hstRECORD WHERE id_bibrec=%s", (self.recid, ))) # kwalitee: disable=sql self.assertEqual(self.timestamp, run_sql("SELECT modification_date FROM bibrec WHERE id=%s", (self.recid,))) def test_correct_identical_correction(self): """bibupload - correct with identical correction does not touch history""" xml_to_upload = """ %s SzGeCERN """ % self.recid recs = bibupload.xml_marc_to_records(xml_to_upload) # We call the main function with the record as a parameter _, recid, _ = bibupload.bibupload_records(recs, opt_mode='correct')[0] self.check_record_consistency(recid) self.assertEqual(self.recid, recid) self.maxDiff = None self.assertEqual(self.history, run_sql("SELECT * FROM hstRECORD WHERE id_bibrec=%s", (self.recid, ))) # kwalitee: disable=sql self.assertEqual(self.timestamp, run_sql("SELECT modification_date FROM bibrec WHERE id=%s", (self.recid,))) def test_replace_different_record(self): """bibupload - replace with different records does indeed touch history""" xml_to_upload = """ %s SzGeCERN Test, Jane Test Institute Test, John Test University Test, Jim Test Laboratory """ % self.recid recs = bibupload.xml_marc_to_records(xml_to_upload) # We call the main function with the record as a parameter _, recid, _ = bibupload.bibupload_records(recs, opt_mode='replace')[0] self.check_record_consistency(recid) self.assertEqual(self.recid, recid) self.assertNotEqual(self.history, run_sql("SELECT * FROM hstRECORD WHERE id_bibrec=%s", (self.recid, ))) # kwalitee: disable=sql self.failUnless(len(self.history) == 1 and len(run_sql("SELECT * FROM hstRECORD WHERE id_bibrec=%s", (self.recid, ))) == 2) # kwalitee: disable=sql self.assertNotEqual(self.timestamp, run_sql("SELECT modification_date FROM bibrec WHERE id=%s", (self.recid,))) def test_correct_different_correction(self): """bibupload - correct with different correction does indeed touch history""" xml_to_upload = """ %s FooBar """ % self.recid recs = bibupload.xml_marc_to_records(xml_to_upload) # We call the main function with the record as a parameter _, recid, _ = bibupload.bibupload_records(recs, opt_mode='correct')[0] self.check_record_consistency(recid) self.assertEqual(self.recid, recid) self.assertNotEqual(self.history, run_sql("SELECT * FROM hstRECORD WHERE id_bibrec=%s", (self.recid, ))) # kwalitee: disable=sql self.failUnless(len(self.history) == 1 and len(run_sql("SELECT * FROM hstRECORD WHERE id_bibrec=%s", (self.recid, ))) == 2) # kwalitee: disable=sql self.assertNotEqual(self.timestamp, run_sql("SELECT modification_date FROM bibrec WHERE id=%s", (self.recid,))) class BibUploadCallbackURLTest(GenericBibUploadTest): """Testing usage of CLI callback_url""" def setUp(self): GenericBibUploadTest.setUp(self) self.test = """ something Tester, J Y MIT Tester, K J CERN2 Tester, G CERN3 test11 test31 test12 test32 test13 test33 test21 test41 test22 test42 test14 test51 test52 Tester, T CERN """ self.testfile_path = os.path.join(CFG_TMPDIR, 'bibupload_regression_test_input.xml') open(self.testfile_path, 
"w").write(self.test) self.resultfile_path = os.path.join(CFG_TMPDIR, 'bibupload_regression_test_result.json') if CFG_DEVEL_SITE: def test_simple_insert_callback_url(self): """bibupload - --callback-url with simple insert""" # taskid = task_low_level_submission('bibupload', 'test', '-i', self.testfile_path, '--callback-url', CFG_SITE_URL + '/httptest/post2?%s' % urlencode({"save": self.resultfile_path}), '-v0') run_shell_command(CFG_BINDIR + '/bibupload %s', [str(taskid)]) results = json.loads(open(self.resultfile_path).read()) self.failUnless('results' in results) self.assertEqual(len(results['results']), 1) self.failUnless(results['results'][0]['success']) self.failUnless(results['results'][0]['recid'] > 0) self.failUnless("""Tester, J Y""" in results['results'][0]['marcxml'], results['results'][0]['marcxml']) class BibUploadBibRelationsTest(GenericBibUploadTest): def setUp(self): GenericBibUploadTest.setUp(self) self.upload_xml = """ A very wise author %(url_site)s/img/user-icon-1-20x20.gif Main docname TMP:id_identifier1 TMP:ver_identifier1 %(url_site)s/record/8/files/9812226.pdf?version=1 Main docname2 TMP:id_identifier2 TMP:ver_identifier2 TMP:id_identifier1 TMP:ver_identifier1 TMP:id_identifier2 TMP:ver_identifier2 is_extracted_from """ % {'url_site' : CFG_SITE_URL} def test_upload_with_tmpids(self): """bibupload - Trying to upload a relation between two new documents ... and then to delete""" recs = bibupload.xml_marc_to_records(self.upload_xml) _, recid, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] # ertrive document numbers and check if there exists a relation between them brd = BibRecDocs(recid) docs = brd.list_bibdocs() self.assertEqual(2, len(docs), "Incorrect number of documents attached to a record") rels = docs[0].get_incoming_relations("is_extracted_from") + docs[0].get_outgoing_relations("is_extracted_from") self.assertEqual(1, len(rels), "Incorrect number of relations retrieved from the first document") rels = docs[1].get_incoming_relations("is_extracted_from") + docs[1].get_outgoing_relations("is_extracted_from") self.assertEqual(1, len(rels), "Incorrect number of relations retrieved from the second document") created_relation_id = rels[0].id rels = docs[0].get_incoming_relations("different_type_of_relation") + docs[0].get_outgoing_relations("different_type_of_relation") self.assertEqual(0, len(rels), "Incorrect number of relations retrieved from the first document") upload_xml_2 = """ %(rec_id)s %(rel_id)s DELETE """ % {'rel_id' : created_relation_id, 'rec_id' : recid} recs = bibupload.xml_marc_to_records(upload_xml_2) bibupload.bibupload_records(recs, opt_mode='correct')[0] brd = BibRecDocs(recid) docs = brd.list_bibdocs() self.assertEqual(2, len(docs), "Incorrect number of documents attached to a record") rels = docs[0].get_incoming_relations("is_extracted_from") + docs[0].get_outgoing_relations("is_extracted_from") self.assertEqual(0, len(rels), "Incorrect number of relations retrieved from the first document") rels = docs[1].get_incoming_relations("is_extracted_from") + docs[1].get_outgoing_relations("is_extracted_from") self.assertEqual(0, len(rels), "Incorrect number of relations retrieved from the second document") rels = docs[0].get_incoming_relations("different_type_of_relation") + docs[0].get_outgoing_relations("different_type_of_relation") self.assertEqual(0, len(rels), "Incorrect number of relations retrieved from the first document") def test_delete_by_docids(self): """bibupload - delete relation entry by the docid inside the currently 
modified record Uploading a sample relation and trying to modify it by refering to other parameters than the relation number""" recs = bibupload.xml_marc_to_records(self.upload_xml) dummyerr, recid, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] brd = BibRecDocs(recid) docs = brd.list_bibdocs() self.assertEqual(2, len(docs), "Incorrect number of attached documents") rel = (docs[0].get_incoming_relations("is_extracted_from") + docs[0].get_outgoing_relations("is_extracted_from"))[0] upload_xml_2 = """ %(rec_id)s %(first_docid)s %(first_docver)s %(second_docid)s %(second_docver)s is_extracted_from DELETE """ % { 'rec_id' : recid, 'first_docid': rel.bibdoc1_id, 'first_docver' : rel.bibdoc1_ver, 'second_docid': rel.bibdoc2_id, 'second_docver' : rel.bibdoc2_ver} recs = bibupload.xml_marc_to_records(upload_xml_2) bibupload.bibupload_records(recs, opt_mode='correct')[0] brd = BibRecDocs(recid) docs = brd.list_bibdocs() self.assertEqual(2, len(docs), "Incorrect number of documents attached to a record") rels = docs[0].get_incoming_relations("is_extracted_from") + docs[0].get_outgoing_relations("is_extracted_from") self.assertEqual(0, len(rels), "Incorrect number of relations retrieved from the first document") rels = docs[1].get_incoming_relations("is_extracted_from") + docs[1].get_outgoing_relations("is_extracted_from") self.assertEqual(0, len(rels), "Incorrect number of relations retrieved from the second document") rels = docs[0].get_incoming_relations("different_type_of_relation") + docs[0].get_outgoing_relations("different_type_of_relation") self.assertEqual(0, len(rels), "Incorrect number of relations retrieved from the first document") def test_remove_by_name(self): """bibupload - trying removing relation by providing bibdoc names rather than relation numbers""" recs = bibupload.xml_marc_to_records(self.upload_xml) _, recid, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] brd = BibRecDocs(recid) docs = brd.list_bibdocs() self.assertEqual(2, len(docs), "Incorrect number of attached documents") rel = (docs[0].get_incoming_relations("is_extracted_from") + docs[0].get_outgoing_relations("is_extracted_from"))[0] upload_xml_2 = """ %(rec_id)s docname %(first_docver)s docname2 %(second_docver)s is_extracted_from DELETE """ % {'rec_id' : recid, 'first_docver' : rel.bibdoc1_ver, 'second_docver' : rel.bibdoc2_ver} # the above is incorrect ! 
we assert that nothing has been removed recs = bibupload.xml_marc_to_records(upload_xml_2) _ = bibupload.bibupload_records(recs, opt_mode='correct')[0] brd = BibRecDocs(recid) docs = brd.list_bibdocs() self.assertEqual(2, len(docs), "Incorrect number of documents attached to a record") rels = docs[0].get_incoming_relations("is_extracted_from") + docs[0].get_outgoing_relations("is_extracted_from") self.assertEqual(0, len(rels), "Incorrect number of relations retrieved from the first document") rels = docs[1].get_incoming_relations("is_extracted_from") + docs[1].get_outgoing_relations("is_extracted_from") self.assertEqual(0, len(rels), "Incorrect number of relations retrieved from the second document") rels = docs[0].get_incoming_relations("different_type_of_relation") + docs[0].get_outgoing_relations("different_type_of_relation") self.assertEqual(0, len(rels), "Incorrect number of relations retrieved from the first document") def test_remove_by_name_incorrect(self): """bibupload - trying removing relation by providing bibdoc names rather than relation numbers, but providing incorrect name""" recs = bibupload.xml_marc_to_records(self.upload_xml) _, recid, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] brd = BibRecDocs(recid) docs = brd.list_bibdocs() self.assertEqual(2, len(docs), "Incorrect number of attached documents") rel = (docs[0].get_incoming_relations("is_extracted_from") + docs[0].get_outgoing_relations("is_extracted_from"))[0] upload_xml_2 = """ %(rec_id)s docname1 %(first_docver)s docname2 %(second_docver)s is_extracted_from DELETE """ % { 'rec_id' : recid, 'first_docver' : rel.bibdoc1_ver, 'second_docver' : rel.bibdoc2_ver} # the above is incorrect ! we assert that nothing has been removed recs = bibupload.xml_marc_to_records(upload_xml_2) _ = bibupload.bibupload_records(recs, opt_mode='correct')[0] brd = BibRecDocs(recid) docs = brd.list_bibdocs() self.assertEqual(2, len(docs), "Incorrect number of documents attached to a record") rels = docs[0].get_incoming_relations("is_extracted_from") + docs[0].get_outgoing_relations("is_extracted_from") self.assertEqual(1, len(rels), "Incorrect number of relations retrieved from the first document") rels = docs[1].get_incoming_relations("is_extracted_from") + docs[1].get_outgoing_relations("is_extracted_from") self.assertEqual(1, len(rels), "Incorrect number of relations retrieved from the second document") rels = docs[0].get_incoming_relations("different_type_of_relation") + docs[0].get_outgoing_relations("different_type_of_relation") self.assertEqual(0, len(rels), "Incorrect number of relations retrieved from the first document") def _upload_initial_moreinfo_key(self): """Prepare MoreInfo with sample keys and check it has been correctly uploaded uploaded dic: {"ns1" : {"k1":"val1", "k2":[1,2,3,"something"], "k3" : (1,3,2,"something else"), "k4" : {"a":"b", 1:2}}} ... 
after encoding gives KGRwMQpTJ25zMScKcDIKKGRwMwpTJ2szJwpwNAooSTEKSTMKSTIKUydzb21ldGhpbmcgZWxzZScKdHA1CnNTJ2syJwpwNgoobHA3CkkxCmFJMgphSTMKYVMnc29tZXRoaW5nJwpwOAphc1MnazEnCnA5ClMndmFsMScKcDEwCnNTJ2s0JwpwMTEKKGRwMTIKUydhJwpTJ2InCnNJMQpJMgpzc3Mu """ moreinfo_str = "KGRwMQpTJ25zMScKcDIKKGRwMwpTJ2szJwpwNAooSTEKSTMKSTIKUydzb21ldGhpbmcgZWxzZScKdHA1CnNTJ2syJwpwNgoobHA3CkkxCmFJMgphSTMKYVMnc29tZXRoaW5nJwpwOAphc1MnazEnCnA5ClMndmFsMScKcDEwCnNTJ2s0JwpwMTEKKGRwMTIKUydhJwpTJ2InCnNJMQpJMgpzc3Mu" xml_to_upload = """ A very wise author %(url_site)s/img/user-icon-1-20x20.gif Main docname TMP:id_identifier1 TMP:ver_identifier1 %(url_site)s/record/8/files/9812226.pdf?version=1 Main docname2 TMP:id_identifier2 TMP:ver_identifier2 TMP:id_identifier1 TMP:ver_identifier1 TMP:id_identifier2 TMP:ver_identifier2 is_extracted_from %(moreinfo_str)s """ % {'url_site' : CFG_SITE_URL, 'moreinfo_str' : moreinfo_str} recs = bibupload.xml_marc_to_records(xml_to_upload) dummyerr, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] brd = BibRecDocs(recid) docs = brd.list_bibdocs() self.assertEqual(2, len(docs), "Incorrect number of attached documents") return ((docs[0].get_incoming_relations("is_extracted_from") + docs[0].get_outgoing_relations("is_extracted_from"))[0], recid) def test_add_relation_moreinfo_key(self): """bibupload - upload new MoreInfo key into the dictionary related to a relation""" rel, _ = self._upload_initial_moreinfo_key() # asserting correctness of data self.assertEqual(rel.more_info.get_data("ns1", "k1"), "val1", "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k1)") self.assertEqual(rel.more_info.get_data("ns1", "k2")[0], 1, "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k2)") self.assertEqual(rel.more_info.get_data("ns1", "k2")[1], 2, "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k2)") self.assertEqual(rel.more_info.get_data("ns1", "k2")[2], 3, "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k2)") self.assertEqual(rel.more_info.get_data("ns1", "k2")[3], "something", "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k2)") self.assertEqual(rel.more_info.get_data("ns1", "k3"), (1,3,2,"something else") , "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k3)") self.assertEqual(rel.more_info.get_data("ns1", "k4")[1], 2, "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k4)") self.assertEqual(rel.more_info.get_data("ns1", "k4")["a"], "b", "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k4)") def test_modify_relation_moreinfo_key(self): """bibupload - modify existing MoreInfo key """ #the update : {"ns1":{"k1": "different value"}} rel, recid = self._upload_initial_moreinfo_key() moreinfo_str = "KGRwMQpTJ25zMScKcDIKKGRwMwpTJ2sxJwpwNApTJ2RpZmZlcmVudCB2YWx1ZScKcDUKc3Mu" upload_xml = """ %(rec_id)s docname docname2 1 1 is_extracted_from %(moreinfo_str)s """ % {"rec_id" : recid, "moreinfo_str": moreinfo_str} recs = bibupload.xml_marc_to_records(upload_xml) bibupload.bibupload_records(recs, opt_mode='correct')[0] rel = BibRelation(rel_id = rel.id) self.assertEqual(rel.more_info.get_data("ns1", "k1"), "different value", "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k1)") self.assertEqual(rel.more_info.get_data("ns1", "k2")[0], 1, "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k2)") 
self.assertEqual(rel.more_info.get_data("ns1", "k2")[1], 2, "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k2)") self.assertEqual(rel.more_info.get_data("ns1", "k2")[2], 3, "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k2)") self.assertEqual(rel.more_info.get_data("ns1", "k2")[3], "something", "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k2)") self.assertEqual(rel.more_info.get_data("ns1", "k3"), (1,3,2,"something else") , "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k3)") self.assertEqual(rel.more_info.get_data("ns1", "k4")[1], 2, "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k4)") self.assertEqual(rel.more_info.get_data("ns1", "k4")["a"], "b", "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k4)") self.assertEqual(rel.more_info.get_data("ns2", "k4"), None, "Retrieved not none value for nonexisting namespace !") def test_remove_relation_moreinfo_key(self): """bibupload - remove existing MoreInfo key """ #the update : {"ns1":{"k3": None}} rel, recid = self._upload_initial_moreinfo_key() moreinfo_str = "KGRwMQpTJ25zMScKcDIKKGRwMwpTJ2szJwpwNApOc3Mu" upload_xml = """ %(rec_id)s docname docname2 1 1 is_extracted_from %(moreinfo_str)s """ % {"rec_id" : recid, "moreinfo_str": moreinfo_str} recs = bibupload.xml_marc_to_records(upload_xml) bibupload.bibupload_records(recs, opt_mode='correct') rel = BibRelation(rel_id = rel.id) self.assertEqual(rel.more_info.get_data("ns1", "k1"), "val1", "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k1)") self.assertEqual(rel.more_info.get_data("ns1", "k2")[0], 1, "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k2)") self.assertEqual(rel.more_info.get_data("ns1", "k2")[1], 2, "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k2)") self.assertEqual(rel.more_info.get_data("ns1", "k2")[2], 3, "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k2)") self.assertEqual(rel.more_info.get_data("ns1", "k2")[3], "something", "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k2)") self.assertEqual(rel.more_info.get_data("ns1", "k3"), None , "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k3)") self.assertEqual(rel.more_info.get_data("ns1", "k4")[1], 2, "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k4)") self.assertEqual(rel.more_info.get_data("ns1", "k4")["a"], "b", "Retrieved incorrect data from the MoreInfo Dictionary (namespace : ns1 key: k4)") class BibUploadMoreInfoTest(GenericBibUploadTest): """bibupload - Testing upload of different types of MoreInfo """ def _dict_checker(self, dic, more_info, equal = True): """ Check the more_info for being conform with the dictionary @param equal - The mode of conformity. True means that the dictionary has to be equal with the MoreInfo. 
False means that dictionary has to be contained in the MoreInfo """ for namespace in dic: for key in dic[namespace]: self.assertEqual(cPickle.dumps(dic[namespace][key]), cPickle.dumps(more_info.get_data(namespace, key)), "Different values for the value of key %s in the namespace %s inside of the MoreInfo object" % \ (namespace, key)) if equal: for namespace in more_info.get_namespaces(): for key in more_info.get_keys(namespace): self.assertTrue(namespace in dic, "namespace %s present in the MoreInfo, but not present in the dictionary" % \ (namespace, )) self.assertTrue(key in dic[namespace], "key %s present in the namespace %s of the MoreInfo but not present in the dictionary" % \ (namespace, key)) self.assertEqual(cPickle.dumps(more_info.get_data(namespace, key)), cPickle.dumps(dic[namespace][key]), "Value for namespace '%s' and key '%s' varies between MoreInfo and the dictionary. moreinfo value: '%s' dictionary value: '%s'" % \ (namespace, key, repr(more_info.get_data(namespace, key)), repr(dic[namespace][key]))) def test_relation_moreinfo_insert(self): """bibupload - Testing the upload of BibRelation and corresponding MoreInfo field""" # Cleaning existing data rels = BibRelation.get_relations(bibdoc1_id = 70, bibdoc2_id = 71, rel_type = "is_extracted_from") for rel in rels: rel.delete() # Uploading relation_upload_template = """ 70 71 is_extracted_from %s Some author """ data_to_insert = {"first namespace": {"k1" : "val1", "k2" : "val2"}, "second" : {"k1" : "#@$#$@###!!!", "k123": {1:2, 9: (6,2,7)}}} serialised = base64.b64encode(cPickle.dumps(data_to_insert)) recs = bibupload.xml_marc_to_records(relation_upload_template % (serialised, )) bibupload.bibupload_records(recs, opt_mode='insert')[0] # Verifying the correctness of the uploaded data rels = BibRelation.get_relations(bibdoc1_id = 70, bibdoc2_id = 71, rel_type = "is_extracted_from") self.assertEqual(len(rels), 1) rel = rels[0] self.assertEqual(rel.bibdoc1_id, 70) self.assertEqual(rel.bibdoc2_id, 71) self.assertEqual(rel.get_data("first namespace", "k1"), "val1") self.assertEqual(rel.get_data("first namespace", "k2"), "val2") self.assertEqual(rel.get_data("second", "k1"), "#@$#$@###!!!") self.assertEqual(rel.get_data("second", "k123")[1], 2) self.assertEqual(rel.get_data("second", "k123")[9], (6,2,7)) self._dict_checker(data_to_insert, rel.more_info) # Cleaning after the upload ... 
just in case we have selected more for rel in rels: rel.delete() def _serialise_data(self, data): return base64.b64encode(cPickle.dumps(data)) # Subfield tags used to upload particular types of MoreInfo _mi_bibdoc = "w" _mi_bibdoc_version = "p" _mi_bibdoc_version_format = "b" _mi_bibdoc_format = "u" def _generate_moreinfo_tag(self, mi_type, data): """ """ serialised = self._serialise_data(data) return """%s""" % (mi_type, serialised) def test_document_moreinfo_insert(self): """bibupload - Inserting new MoreInfo to the document 1) Inserting new MoreInfo to new document 2) Inserting new MoreInfo keys existing document version 3) Removing keys from MoreInfo 4) Removing document and asserting, MoreInfo gets removed as well 5) Overriding MoreInfo keys """ moreinfo_upload_template = """ %(siteurl)s/img/site_logo.gif 0106015_01 .jpg restricted_picture %%(additional_content)s Some author """ % {"siteurl": CFG_SITE_URL} sfs = [] sfs.append(self._generate_moreinfo_tag(BibUploadMoreInfoTest._mi_bibdoc, {"first namespace" : {"type": "document moreinfo"}})) sfs.append(self._generate_moreinfo_tag(BibUploadMoreInfoTest._mi_bibdoc_version, {"first namespace" : {"type": "Bibdoc - version moreinfo"}})) sfs.append(self._generate_moreinfo_tag(BibUploadMoreInfoTest._mi_bibdoc_version_format, {"first namespace" : {"type": "Bibdoc - version, format moreinfo"}})) sfs.append(self._generate_moreinfo_tag(BibUploadMoreInfoTest._mi_bibdoc_format, {"first namespace" : {"type": "Bibdoc - format moreinfo"}})) marcxml_1 = moreinfo_upload_template % {"additional_content" : "\n".join(sfs)} recs = bibupload.xml_marc_to_records(marcxml_1) _, recid, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] # now checking if all the data has been uploaded correctly bdr = BibRecDocs(recid) doc = bdr.list_bibdocs()[0] docid = doc.get_id() mi_doc = MoreInfo(docid = docid) mi_doc_ver = MoreInfo(docid = docid, version = 1) mi_doc_ver_fmt = MoreInfo(docid = docid, version = 1, docformat=".jpg") mi_doc_fmt = MoreInfo(docid = docid, docformat=".jpg") self._dict_checker({"first namespace" : {"type": "document moreinfo"}}, mi_doc, equal=False) # in case of the document only inclusive check self._dict_checker({"first namespace" : {"type": "Bibdoc - version moreinfo"}}, mi_doc_ver) self._dict_checker({"first namespace" : { "type": "Bibdoc - version, format moreinfo"}}, mi_doc_ver_fmt) self._dict_checker({"first namespace" : {"type": "Bibdoc - format moreinfo"}}, mi_doc_fmt) #now appending to a particular version of MoreInfo # uplad new key to an existing dictionary of a version def _get_mit_template(recid, bibdocid=None, bibdocname=None, version=None, docformat=None, relation=None, data=None): if data is None: ser = None else: ser = base64.b64encode(cPickle.dumps(data)) subfields = [] for s_code, val in (("r", relation), ("i", bibdocid), ("n", bibdocname), ("v", version), ("f", docformat) , ("m", ser)): if not val is None: subfields.append("""%s""" % \ (s_code, val)) return """ %s %s """ % (str(recid), ("\n".join(subfields))) marcxml_2 = _get_mit_template(recid, version = 1, bibdocid = docid, data= {"first namespace" : {"new key": {1:2, 987:678}}}) recs = bibupload.xml_marc_to_records(marcxml_2) bibupload.bibupload_records(recs, opt_mode='append') mi = MoreInfo(docid = docid, version = 1) self._dict_checker({ "first namespace" : {"type": "Bibdoc - version moreinfo", "new key": {1:2, 987:678} } }, mi) #removing the entire old content of the MoreInfo and uploading new data = {"ns1" : {"nk1": 12, "mk1": "this is new content"}, "namespace 
two" : {"ddd" : "bbb"}} marcxml_3 = _get_mit_template(recid, version = 1, bibdocid = docid, data= data) recs = bibupload.xml_marc_to_records(marcxml_3) bibupload.bibupload_records(recs, opt_mode='correct') mi = MoreInfo(docid = docid, version = 1) self._dict_checker(data, mi) # removing a particular key marcxml_4 = _get_mit_template(recid, version = 1, bibdocid = docid, data= {"ns1": {"nk1" : None}}) recs = bibupload.xml_marc_to_records(marcxml_4) bibupload.bibupload_records(recs, opt_mode='append') mi = MoreInfo(docid = docid, version = 1) self._dict_checker( {"ns1" : { "mk1": "this is new content"}, "namespace two" : {"ddd" : "bbb"}}, mi) # adding new key marcxml_5 = _get_mit_template(recid, version = 1, bibdocid = docid, data= {"ns1": {"newkey" : "newvalue"}}) recs = bibupload.xml_marc_to_records(marcxml_5) bibupload.bibupload_records(recs, opt_mode='append') mi = MoreInfo(docid = docid, version = 1) self._dict_checker( {"ns1" : { "mk1": "this is new content", "newkey" : "newvalue"}, "namespace two" : {"ddd" : "bbb"}}, mi) class BibUploadInsertModeTest(GenericBibUploadTest): """Testing insert mode.""" def setUp(self): # pylint: disable=C0103 """Initialise the MARCXML variable""" GenericBibUploadTest.setUp(self) self.test = """ something Tester, J Y MIT Tester, K J CERN2 Tester, G CERN3 test11 test31 test12 test32 test13 test33 test21 test41 test22 test42 test14 test51 test52 Tester, T CERN """ self.test_hm = """ 100__ $$aTester, T$$uCERN 111__ $$atest11$$ctest31 111__ $$atest12$$ctest32 111__ $$atest13$$ctest33 111__ $$btest21$$dtest41 111__ $$btest22$$dtest42 111__ $$atest14 111__ $$etest51 111__ $$etest52 245__ $$asomething 700__ $$aTester, J Y$$uMIT 700__ $$aTester, K J$$uCERN2 700__ $$aTester, G$$uCERN3 """ def test_create_record_id(self): """bibupload - insert mode, trying to create a new record ID in the database""" rec_id = bibupload.create_new_record() self.assertNotEqual(None, rec_id) def test_create_specific_record_id(self): """bibupload - insert mode, trying to create a new specifc record ID in the database""" expected_rec_id = run_sql("SELECT MAX(id) FROM bibrec")[0][0] + 1 rec_id = bibupload.create_new_record(expected_rec_id) self.assertEqual(rec_id, expected_rec_id) def test_no_retrieve_record_id(self): """bibupload - insert mode, detection of record ID in the input file""" # We create create the record out of the xml marc recs = bibupload.xml_marc_to_records(self.test) # We call the function which should retrieve the record id rec_id = bibupload.retrieve_rec_id(recs[0], 'insert') # We compare the value found with None self.assertEqual(None, rec_id) def test_insert_complete_xmlmarc(self): """bibupload - insert mode, trying to insert complete MARCXML file""" # Initialize the global variable # We create create the record out of the xml marc recs = bibupload.xml_marc_to_records(self.test) # We call the main function with the record as a parameter _, recid, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # We retrieve the inserted xml inserted_xm = print_record(recid, 'xm') inserted_hm = print_record(recid, 'hm') inserted_hm = print_record(recid, 'hm') # Compare if the two MARCXML are the same self.assertEqual(compare_xmbuffers(remove_tag_001_from_xmbuffer(inserted_xm), self.test), '') self.assertEqual(compare_hmbuffers(remove_tag_001_from_hmbuffer(inserted_hm), self.test_hm), '') def test_retrieve_005_tag(self): """bibupload - insert mode, verifying insertion of 005 control field for record """ # Convert marc xml into record 
structure recs = bibupload.xml_marc_to_records(self.test) dummy, recid, dummy = bibupload.bibupload(recs[0], opt_mode='insert') self.check_record_consistency(recid) # Retrive the inserted record based on the record id rec = get_record(recid) # We retrieve the creationdate date from the database query = """SELECT DATE_FORMAT(last_updated,'%%Y%%m%%d%%H%%i%%s') FROM bibfmt where id_bibrec=%s AND format='xm'""" res = run_sql(query, (recid, )) self.assertEqual(record_has_field(rec, '005'), True) self.assertEqual(str(res[0][0]) + '.0', record_get_field_value(rec, '005', '', '')) class BibUploadAppendModeTest(GenericBibUploadTest): """Testing append mode.""" def setUp(self): # pylint: disable=C0103 """Initialize the MARCXML variable""" GenericBibUploadTest.setUp(self) self.test_existing = """ 123456789 Tester, T DESY 0003719PHOPHO """ self.test_to_append = """ 123456789 Tester, U CERN 0003719PHOPHO """ self.test_expected_xm = """ 123456789 Tester, T DESY Tester, U CERN 0003719PHOPHO """ self.test_expected_hm = """ 001__ 123456789 100__ $$aTester, T$$uDESY 100__ $$aTester, U$$uCERN 970__ $$a0003719PHOPHO """ # insert test record: test_to_upload = self.test_existing.replace('123456789', '') recs = bibupload.xml_marc_to_records(test_to_upload) _, recid, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) self.test_recid = recid # replace test buffers with real recid of inserted test record: self.test_existing = self.test_existing.replace('123456789', str(self.test_recid)) self.test_to_append = self.test_to_append.replace('123456789', str(self.test_recid)) self.test_expected_xm = self.test_expected_xm.replace('123456789', str(self.test_recid)) self.test_expected_hm = self.test_expected_hm.replace('123456789', str(self.test_recid)) def test_retrieve_record_id(self): """bibupload - append mode, the input file should contain a record ID""" # We create create the record out of the xml marc recs = bibupload.xml_marc_to_records(self.test_to_append) # We call the function which should retrieve the record id rec_id = bibupload.retrieve_rec_id(recs[0], 'append') # We compare the value found with None self.assertEqual(self.test_recid, rec_id) # clean up after ourselves: def test_update_modification_record_date(self): """bibupload - append mode, checking the update of the modification date""" # Initialize the global variable # We create create the record out of the xml marc recs = bibupload.xml_marc_to_records(self.test_existing) # We call the function which should retrieve the record id rec_id = bibupload.retrieve_rec_id(recs[0], opt_mode='append') # Retrieve current localtime record_modification_date = time.localtime() # We update the modification date bibupload.update_bibrec_date(convert_datestruct_to_datetext(record_modification_date), rec_id, False) # We retrieve the modification date from the database query = """SELECT DATE_FORMAT(modification_date,'%%Y-%%m-%%d %%H:%%i:%%s') FROM bibrec where id = %s""" res = run_sql(query, (str(rec_id), )) # We compare the two results self.assertEqual(res[0][0], convert_datestruct_to_datetext(record_modification_date)) # clean up after ourselves: def test_append_complete_xml_marc(self): """bibupload - append mode, appending complete MARCXML file""" # Now we append a datafield # We create create the record out of the xml marc recs = bibupload.xml_marc_to_records(self.test_to_append) # We call the main function with the record as a parameter _, recid, _ = bibupload.bibupload_records(recs, opt_mode='append')[0] 
self.check_record_consistency(recid) # We retrieve the inserted xm after_append_xm = print_record(recid, 'xm') after_append_hm = print_record(recid, 'hm') # Compare if the two MARCXML are the same self.assertEqual(compare_xmbuffers(after_append_xm, self.test_expected_xm), '') self.assertEqual(compare_hmbuffers(after_append_hm, self.test_expected_hm), '') def test_retrieve_updated_005_tag(self): """bibupload - append mode, updating 005 control tag after modifiction """ recs = bibupload.xml_marc_to_records(self.test_to_append) _, recid, _ = bibupload.bibupload(recs[0], opt_mode='append') self.check_record_consistency(recid) rec = get_record(recid) query = """SELECT DATE_FORMAT(MAX(job_date),'%%Y%%m%%d%%H%%i%%s') FROM hstRECORD where id_bibrec = %s""" res = run_sql(query, (str(recid), )) self.assertEqual(str(res[0][0])+'.0',record_get_field_value(rec,'005','','')) class BibUploadCorrectModeTest(GenericBibUploadTest): """ Testing correcting a record containing similar tags (identical tag, different indicators). Currently Invenio replaces only those tags that have matching indicators too, unlike ALEPH500 that does not pay attention to indicators, it corrects all fields with the same tag, regardless of the indicator values. """ def setUp(self): """Initialize the MARCXML test record.""" GenericBibUploadTest.setUp(self) self.testrec1_xm = """ 123456789 SzGeCERN Test, Jane Test Institute Test, John Test University Cool Test, Jim Test Laboratory """ self.testrec1_hm = """ 001__ 123456789 003__ SzGeCERN 100__ $$aTest, Jane$$uTest Institute 10047 $$aTest, John$$uTest University 10048 $$aCool 10047 $$aTest, Jim$$uTest Laboratory """ self.testrec1_xm_to_correct = """ 123456789 Test, Joseph Test Academy Test2, Joseph Test2 Academy """ self.testrec1_corrected_xm = """ 123456789 SzGeCERN Test, Jane Test Institute Cool Test, Joseph Test Academy Test2, Joseph Test2 Academy """ self.testrec1_corrected_hm = """ 001__ 123456789 003__ SzGeCERN 100__ $$aTest, Jane$$uTest Institute 10048 $$aCool 10047 $$aTest, Joseph$$uTest Academy 10047 $$aTest2, Joseph$$uTest2 Academy """ # insert test record: test_record_xm = self.testrec1_xm.replace('123456789', '') recs = bibupload.xml_marc_to_records(test_record_xm) _, recid, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recID: self.testrec1_xm = self.testrec1_xm.replace('123456789', str(recid)) self.testrec1_hm = self.testrec1_hm.replace('123456789', str(recid)) self.testrec1_xm_to_correct = self.testrec1_xm_to_correct.replace('123456789', str(recid)) self.testrec1_corrected_xm = self.testrec1_corrected_xm.replace('123456789', str(recid)) self.testrec1_corrected_hm = self.testrec1_corrected_hm.replace('123456789', str(recid)) # test of the inserted record: inserted_xm = print_record(recid, 'xm') inserted_hm = print_record(recid, 'hm') self.assertEqual(compare_xmbuffers(inserted_xm, self.testrec1_xm), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.testrec1_hm), '') def test_record_correction(self): """bibupload - correct mode, similar MARCXML tags/indicators""" # correct some tags: recs = bibupload.xml_marc_to_records(self.testrec1_xm_to_correct) _, self.recid, _ = bibupload.bibupload_records(recs, opt_mode='correct')[0] self.check_record_consistency(self.recid) corrected_xm = print_record(self.recid, 'xm') corrected_hm = print_record(self.recid, 'hm') # did it work? 
self.assertEqual(compare_xmbuffers(corrected_xm, self.testrec1_corrected_xm), '') self.assertEqual(compare_hmbuffers(corrected_hm, self.testrec1_corrected_hm), '') # clean up after ourselves: return class BibUploadDeleteModeTest(GenericBibUploadTest): """ Testing deleting specific tags from a record while keeping anything else untouched. Currently Invenio deletes only those tags that have matching indicators too, unlike ALEPH500 that does not pay attention to indicators, it corrects all fields with the same tag, regardless of the indicator values. """ def setUp(self): """Initialize the MARCXML test record.""" GenericBibUploadTest.setUp(self) self.testrec1_xm = """ 123456789 SzGeCERN Test, Jane Test Institute Test, John Test University Cool Test, Jim Test Laboratory dumb text """ self.testrec1_hm = """ 001__ 123456789 003__ SzGeCERN 100__ $$aTest, Jane$$uTest Institute 10047 $$aTest, John$$uTest University 10048 $$aCool 10047 $$aTest, Jim$$uTest Laboratory 888__ $$adumb text """ self.testrec1_xm_to_delete = """ 123456789 Test, Jane Test Institute Test, Johnson Test University Cool dumb text """ self.testrec1_corrected_xm = """ 123456789 SzGeCERN Test, John Test University Test, Jim Test Laboratory """ self.testrec1_corrected_hm = """ 001__ 123456789 003__ SzGeCERN 10047 $$aTest, John$$uTest University 10047 $$aTest, Jim$$uTest Laboratory """ # insert test record: test_record_xm = self.testrec1_xm.replace('123456789', '') recs = bibupload.xml_marc_to_records(test_record_xm) _, recid, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recID: self.testrec1_xm = self.testrec1_xm.replace('123456789', str(recid)) self.testrec1_hm = self.testrec1_hm.replace('123456789', str(recid)) self.testrec1_xm_to_delete = self.testrec1_xm_to_delete.replace('123456789', str(recid)) self.testrec1_corrected_xm = self.testrec1_corrected_xm.replace('123456789', str(recid)) self.testrec1_corrected_hm = self.testrec1_corrected_hm.replace('123456789', str(recid)) # test of the inserted record: inserted_xm = print_record(recid, 'xm') inserted_hm = print_record(recid, 'hm') self.assertEqual(compare_xmbuffers(inserted_xm, self.testrec1_xm), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.testrec1_hm), '') # Checking dumb text is in bibxxx self.failUnless(run_sql("SELECT id_bibrec from bibrec_bib88x WHERE id_bibrec=%s", (recid, ))) def test_record_tags_deletion(self): """bibupload - delete mode, deleting specific tags""" # correct some tags: recs = bibupload.xml_marc_to_records(self.testrec1_xm_to_delete) _, recid, _ = bibupload.bibupload_records(recs, opt_mode='delete')[0] self.check_record_consistency(recid) corrected_xm = print_record(recid, 'xm') corrected_hm = print_record(recid, 'hm') # did it work? 
self.assertEqual(compare_xmbuffers(corrected_xm, self.testrec1_corrected_xm), '') self.assertEqual(compare_hmbuffers(corrected_hm, self.testrec1_corrected_hm), '') # Checking dumb text is no more in bibxxx self.failIf(run_sql("SELECT id_bibrec from bibrec_bib88x WHERE id_bibrec=%s", (recid, ))) # clean up after ourselves: class BibUploadReplaceModeTest(GenericBibUploadTest): """Testing replace mode.""" def test_record_replace(self): """bibupload - replace mode, similar MARCXML tags/indicators""" # replace some tags: testrec1_xm = """ 123456789 SzGeCERN Test, Jane Test Institute Test, John Test University Cool Test, Jim Test Laboratory """ testrec1_hm = """ 001__ 123456789 003__ SzGeCERN 100__ $$aTest, Jane$$uTest Institute 10047 $$aTest, John$$uTest University 10048 $$aCool 10047 $$aTest, Jim$$uTest Laboratory """ testrec1_xm_to_replace = """ 123456789 Test, Joseph Test Academy Test2, Joseph Test2 Academy """ testrec1_replaced_xm = """ 123456789 Test, Joseph Test Academy Test2, Joseph Test2 Academy """ testrec1_replaced_hm = """ 001__ 123456789 10047 $$aTest, Joseph$$uTest Academy 10047 $$aTest2, Joseph$$uTest2 Academy """ # insert test record: test_record_xm = testrec1_xm.replace('123456789', '') recs = bibupload.xml_marc_to_records(test_record_xm) _, recid, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recID: testrec1_xm = testrec1_xm.replace('123456789', str(recid)) testrec1_hm = testrec1_hm.replace('123456789', str(recid)) testrec1_xm_to_replace = testrec1_xm_to_replace.replace('123456789', str(recid)) testrec1_replaced_xm = testrec1_replaced_xm.replace('123456789', str(recid)) testrec1_replaced_hm = testrec1_replaced_hm.replace('123456789', str(recid)) # test of the inserted record: inserted_xm = print_record(recid, 'xm') inserted_hm = print_record(recid, 'hm') self.assertEqual(compare_xmbuffers(inserted_xm, testrec1_xm), '') self.assertEqual(compare_hmbuffers(inserted_hm, testrec1_hm), '') recs = bibupload.xml_marc_to_records(testrec1_xm_to_replace) _, recid, _ = bibupload.bibupload(recs[0], opt_mode='replace') self.check_record_consistency(recid) replaced_xm = print_record(recid, 'xm') replaced_hm = print_record(recid, 'hm') # did it work? self.assertEqual(compare_xmbuffers(replaced_xm, testrec1_replaced_xm), '') self.assertEqual(compare_hmbuffers(replaced_hm, testrec1_replaced_hm), '') def test_record_replace_force_non_existing(self): """bibupload - replace mode, force non existing recid""" # replace some tags: the_recid = self.last_recid + 1 testrec1_xm = """ %s SzGeCERN Test, Jane Test Institute Test, John Test University Cool Test, Jim Test Laboratory """ % the_recid testrec1_hm = """ 001__ %s 003__ SzGeCERN 100__ $$aTest, Jane$$uTest Institute 10047 $$aTest, John$$uTest University 10048 $$aCool 10047 $$aTest, Jim$$uTest Laboratory """ % the_recid recs = bibupload.xml_marc_to_records(testrec1_xm) task_set_option('force', True) try: err, recid, msg = bibupload.bibupload_records(recs, opt_mode='replace')[0] self.check_record_consistency(recid) finally: task_set_option('force', False) replaced_xm = print_record(recid, 'xm') replaced_hm = print_record(recid, 'hm') # did it work? 
self.assertEqual(compare_xmbuffers(replaced_xm, testrec1_xm), '') self.assertEqual(compare_hmbuffers(replaced_hm, testrec1_hm), '') self.assertEqual(recid, the_recid) def test_record_replace_non_existing(self): """bibupload - replace mode, non existing recid""" # replace some tags: the_recid = self.last_recid + 1 testrec1_xm = """ %s SzGeCERN Test, Jane Test Institute Test, John Test University Cool Test, Jim Test Laboratory """ % the_recid recs = bibupload.xml_marc_to_records(testrec1_xm) err, recid, _ = bibupload.bibupload(recs[0], opt_mode='replace') self.assertEqual((err, recid), (1, -1)) def test_record_replace_two_recids(self): """bibupload - replace mode, two recids""" # replace some tags: testrec1_xm = """ 300 305 SzGeCERN Test, Jane Test Institute Test, John Test University Cool Test, Jim Test Laboratory """ recs = bibupload.xml_marc_to_records(testrec1_xm) err, recid, _ = bibupload.bibupload(recs[0], opt_mode='replace') # did it work? self.assertEqual((err, recid), (1, -1)) class BibUploadReferencesModeTest(GenericBibUploadTest): """Testing references mode. NOTE: in the past this was done by calling bibupload --reference|-z which is now simply implying bibupload --correct. """ def setUp(self): """Initialize the MARCXML variable""" GenericBibUploadTest.setUp(self) self.test_insert = """ 123456789 Tester, T CERN """ self.test_reference = """ 123456789 M. Lüscher and P. Weisz, String excitation energies in SU(N) gauge theories beyond the free-string approximation, J. High Energy Phys. 07 (2004) 014 """ self.test_reference_expected_xm = """ 123456789 Tester, T CERN M. Lüscher and P. Weisz, String excitation energies in SU(N) gauge theories beyond the free-string approximation, J. High Energy Phys. 07 (2004) 014 """ self.test_insert_hm = """ 001__ 123456789 100__ $$aTester, T$$uCERN """ self.test_reference_expected_hm = """ 001__ 123456789 100__ $$aTester, T$$uCERN %(reference_tag)sC5 $$mM. Lüscher and P. Weisz, String excitation energies in SU(N) gauge theories beyond the free-string approximation,$$sJ. High Energy Phys. 
07 (2004) 014 """ % {'reference_tag': CFG_BIBUPLOAD_REFERENCE_TAG} # insert test record: test_insert = self.test_insert.replace('123456789', '') recs = bibupload.xml_marc_to_records(test_insert) _, recid, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recID: self.test_insert = self.test_insert.replace('123456789', str(recid)) self.test_insert_hm = self.test_insert_hm.replace('123456789', str(recid)) self.test_reference = self.test_reference.replace('123456789', str(recid)) self.test_reference_expected_xm = self.test_reference_expected_xm.replace('123456789', str(recid)) self.test_reference_expected_hm = self.test_reference_expected_hm.replace('123456789', str(recid)) # test of the inserted record: inserted_xm = print_record(recid, 'xm') inserted_hm = print_record(recid, 'hm') self.assertEqual(compare_xmbuffers(inserted_xm, self.test_insert), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.test_insert_hm), '') self.test_recid = recid def test_reference_complete_xml_marc(self): """bibupload - reference mode, inserting references MARCXML file""" # We create the record out of the XML MARC recs = bibupload.xml_marc_to_records(self.test_reference) # We call the main function with the record as a parameter dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='reference')[0] self.check_record_consistency(recid) # We retrieve the inserted xml reference_xm = print_record(recid, 'xm') reference_hm = print_record(recid, 'hm') # Compare whether the two MARCXML buffers are the same self.assertEqual(compare_xmbuffers(reference_xm, self.test_reference_expected_xm), '') self.assertEqual(compare_hmbuffers(reference_hm, self.test_reference_expected_hm), '') class BibUploadRecordsWithSYSNOTest(GenericBibUploadTest): """Testing uploading of records that have external SYSNO present.""" def setUp(self): # pylint: disable=C0103 """Initialize the MARCXML test records.""" GenericBibUploadTest.setUp(self) # Note that SYSNO fields are repeated but with different # subfields, this is to test whether bibupload would not # mistakenly pick up wrong values.
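# The MARCXML/HM buffers below are parameterised by slicing the configured MARC
# tag string into its components. As an illustration only (the actual value of
# CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG depends on the site configuration; "970__a"
# is a purely hypothetical example):
#   tag  = CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[0:3]   # e.g. "970"
#   ind1 = CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4]   # "_" is rendered as " " in MARCXML
#   ind2 = CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5]   # "_" is rendered as " " in MARCXML
#   code = CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[5:6]   # e.g. "a"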
self.xm_testrec1 = """ 123456789 SzGeCERN Bar, Baz Foo On the quux and huux 1 sysno1 sysno2 """ % {'sysnotag': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[0:3], 'sysnoind1': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4] or " ", 'sysnoind2': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5] or " ", 'sysnosubfieldcode': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[5:6], } self.hm_testrec1 = """ 001__ 123456789 003__ SzGeCERN 100__ $$aBar, Baz$$uFoo 245__ $$aOn the quux and huux 1 %(sysnotag)s%(sysnoind1)s%(sysnoind2)s $$%(sysnosubfieldcode)ssysno1 %(sysnotag)s%(sysnoind1)s%(sysnoind2)s $$0sysno2 """ % {'sysnotag': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[0:3], 'sysnoind1': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4], 'sysnoind2': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5], 'sysnosubfieldcode': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[5:6], } self.xm_testrec1_to_update = """ SzGeCERN Bar, Baz Foo On the quux and huux 1 Updated sysno1 sysno2 """ % {'sysnotag': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[0:3], 'sysnoind1': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4] or " ", 'sysnoind2': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5] or " ", 'sysnosubfieldcode': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[5:6], } self.xm_testrec1_updated = """ 123456789 SzGeCERN Bar, Baz Foo On the quux and huux 1 Updated sysno1 sysno2 """ % {'sysnotag': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[0:3], 'sysnoind1': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4] or " ", 'sysnoind2': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5] or " ", 'sysnosubfieldcode': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[5:6], } self.hm_testrec1_updated = """ 001__ 123456789 003__ SzGeCERN 100__ $$aBar, Baz$$uFoo 245__ $$aOn the quux and huux 1 Updated %(sysnotag)s%(sysnoind1)s%(sysnoind2)s $$%(sysnosubfieldcode)ssysno1 %(sysnotag)s%(sysnoind1)s%(sysnoind2)s $$0sysno2 """ % {'sysnotag': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[0:3], 'sysnoind1': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4], 'sysnoind2': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5], 'sysnosubfieldcode': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[5:6], } self.xm_testrec2 = """ 987654321 SzGeCERN Bar, Baz Foo On the quux and huux 2 sysno2 sysno1 """ % {'sysnotag': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[0:3], 'sysnoind1': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4] or " ", 'sysnoind2': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5] or " ", 'sysnosubfieldcode': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[5:6], } self.hm_testrec2 = """ 001__ 987654321 003__ SzGeCERN 100__ $$aBar, Baz$$uFoo 245__ $$aOn the quux and huux 2 %(sysnotag)s%(sysnoind1)s%(sysnoind2)s $$%(sysnosubfieldcode)ssysno2 %(sysnotag)s%(sysnoind1)s%(sysnoind2)s $$0sysno1 """ % {'sysnotag': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[0:3], 'sysnoind1': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4], 'sysnoind2': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5], 'sysnosubfieldcode': CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[5:6], } def test_insert_the_same_sysno_record(self): """bibupload - SYSNO tag, refuse to insert the same SYSNO record""" # initialize bibupload mode: if self.verbose: print "test_insert_the_same_sysno_record() started" # insert record 1 first time: testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) dummyerr1, recid1, _ = bibupload.bibupload_records(recs, 
opt_mode='insert')[0] self.check_record_consistency(recid1) inserted_xm = print_record(recid1, 'xm') inserted_hm = print_record(recid1, 'hm') # use real recID when comparing whether it worked: self.xm_testrec1 = self.xm_testrec1.replace('123456789', str(recid1)) self.hm_testrec1 = self.hm_testrec1.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1), '') # insert record 2 first time: testrec_to_insert_first = self.xm_testrec2.replace('987654321', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) dummyerr2, recid2, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid2) inserted_xm = print_record(recid2, 'xm') inserted_hm = print_record(recid2, 'hm') # use real recID when comparing whether it worked: self.xm_testrec2 = self.xm_testrec2.replace('987654321', str(recid2)) self.hm_testrec2 = self.hm_testrec2.replace('987654321', str(recid2)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec2), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec2), '') # try to insert updated record 1, it should fail: recs = bibupload.xml_marc_to_records(self.xm_testrec1_to_update) dummyerr1_updated, recid1_updated, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.assertEqual(-1, recid1_updated) if self.verbose: print "test_insert_the_same_sysno_record() finished" def test_insert_or_replace_the_same_sysno_record(self): """bibupload - SYSNO tag, allow to insert or replace the same SYSNO record""" # initialize bibupload mode: if self.verbose: print "test_insert_or_replace_the_same_sysno_record() started" # insert/replace record 1 first time: testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) dummyerr1, recid1, _ = bibupload.bibupload_records(recs, opt_mode='replace_or_insert')[0] self.check_record_consistency(recid1) inserted_xm = print_record(recid1, 'xm') inserted_hm = print_record(recid1, 'hm') # use real recID in test buffers when comparing whether it worked: self.xm_testrec1 = self.xm_testrec1.replace('123456789', str(recid1)) self.hm_testrec1 = self.hm_testrec1.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1), '') # try to insert/replace updated record 1, it should be okay: recs = bibupload.xml_marc_to_records(self.xm_testrec1_to_update) dummyerr1_updated, recid1_updated, _ = bibupload.bibupload_records(recs, opt_mode='replace_or_insert')[0] self.check_record_consistency(recid1_updated) inserted_xm = print_record(recid1_updated, 'xm') inserted_hm = print_record(recid1_updated, 'hm') self.assertEqual(recid1, recid1_updated) # use real recID in test buffers when comparing whether it worked: self.xm_testrec1_updated = self.xm_testrec1_updated.replace('123456789', str(recid1)) self.hm_testrec1_updated = self.hm_testrec1_updated.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1_updated), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1_updated), '') if self.verbose: print "test_insert_or_replace_the_same_sysno_record() finished" def test_replace_nonexisting_sysno_record(self): """bibupload - SYSNO tag, refuse to replace non-existing SYSNO record""" # initialize bibupload mode: if self.verbose: print 
"test_replace_nonexisting_sysno_record() started" # insert record 1 first time: testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) dummy, recid1, _ = bibupload.bibupload_records(recs, opt_mode='replace_or_insert')[0] self.check_record_consistency(recid1) inserted_xm = print_record(recid1, 'xm') inserted_hm = print_record(recid1, 'hm') # use real recID in test buffers when comparing whether it worked: self.xm_testrec1 = self.xm_testrec1.replace('123456789', str(recid1)) self.hm_testrec1 = self.hm_testrec1.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1), '') # try to replace record 2 it should fail: testrec_to_insert_first = self.xm_testrec2.replace('987654321', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) dummy, recid2, _ = bibupload.bibupload_records(recs, opt_mode='replace')[0] self.assertEqual(-1, recid2) if self.verbose: print "test_replace_nonexisting_sysno_record() finished" class BibUploadRecordsWithEXTOAIIDTest(GenericBibUploadTest): """Testing uploading of records that have external EXTOAIID present.""" def setUp(self): # pylint: disable=C0103 """Initialize the MARCXML test records.""" GenericBibUploadTest.setUp(self) # Note that EXTOAIID fields are repeated but with different # subfields, this is to test whether bibupload would not # mistakenly pick up wrong values. self.xm_testrec1 = """ 123456789 SzGeCERN extoaiid1 extoaisrc1 extoaiid2 Bar, Baz Foo On the quux and huux 1 """ % {'extoaiidtag': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[0:3], 'extoaiidind1': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4] or " ", 'extoaiidind2': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5] or " ", 'extoaiidsubfieldcode': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5:6], 'extoaisrcsubfieldcode' : CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG[5:6], } self.hm_testrec1 = """ 001__ 123456789 003__ SzGeCERN %(extoaiidtag)s%(extoaiidind1)s%(extoaiidind2)s $$%(extoaisrcsubfieldcode)sextoaisrc1$$%(extoaiidsubfieldcode)sextoaiid1 %(extoaiidtag)s%(extoaiidind1)s%(extoaiidind2)s $$0extoaiid2 100__ $$aBar, Baz$$uFoo 245__ $$aOn the quux and huux 1 """ % {'extoaiidtag': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[0:3], 'extoaiidind1': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4], 'extoaiidind2': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5], 'extoaiidsubfieldcode': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5:6], 'extoaisrcsubfieldcode' : CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG[5:6], } self.xm_testrec1_to_update = """ SzGeCERN extoaiid1 extoaisrc1 extoaiid2 Bar, Baz Foo On the quux and huux 1 Updated """ % {'extoaiidtag': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[0:3], 'extoaiidind1': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4] or " ", 'extoaiidind2': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5] or " ", 'extoaiidsubfieldcode': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5:6], 'extoaisrcsubfieldcode' : CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG[5:6], } self.xm_testrec1_updated = """ 123456789 SzGeCERN extoaiid1 extoaisrc1 extoaiid2 Bar, Baz Foo On the quux and huux 1 Updated """ % {'extoaiidtag': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[0:3], 'extoaiidind1': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4] or " ", 'extoaiidind2': 
CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5] or " ", 'extoaiidsubfieldcode': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5:6], 'extoaisrcsubfieldcode' : CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG[5:6], } self.hm_testrec1_updated = """ 001__ 123456789 003__ SzGeCERN %(extoaiidtag)s%(extoaiidind1)s%(extoaiidind2)s $$%(extoaisrcsubfieldcode)sextoaisrc1$$%(extoaiidsubfieldcode)sextoaiid1 %(extoaiidtag)s%(extoaiidind1)s%(extoaiidind2)s $$0extoaiid2 100__ $$aBar, Baz$$uFoo 245__ $$aOn the quux and huux 1 Updated """ % {'extoaiidtag': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[0:3], 'extoaiidind1': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4], 'extoaiidind2': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5], 'extoaiidsubfieldcode': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5:6], 'extoaisrcsubfieldcode' : CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG[5:6], } self.xm_testrec2 = """ 987654321 SzGeCERN extoaiid2 extoaisrc1 extoaiid1 Bar, Baz Foo On the quux and huux 2 """ % {'extoaiidtag': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[0:3], 'extoaiidind1': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4] or " ", 'extoaiidind2': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5] or " ", 'extoaiidsubfieldcode': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5:6], 'extoaisrcsubfieldcode' : CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG[5:6], } self.hm_testrec2 = """ 001__ 987654321 003__ SzGeCERN %(extoaiidtag)s%(extoaiidind1)s%(extoaiidind2)s $$%(extoaisrcsubfieldcode)sextoaisrc1$$%(extoaiidsubfieldcode)sextoaiid2 %(extoaiidtag)s%(extoaiidind1)s%(extoaiidind2)s $$0extoaiid1 100__ $$aBar, Baz$$uFoo 245__ $$aOn the quux and huux 2 """ % {'extoaiidtag': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[0:3], 'extoaiidind1': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4], 'extoaiidind2': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5], 'extoaiidsubfieldcode': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5:6], 'extoaisrcsubfieldcode' : CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG[5:6], } def test_insert_the_same_extoaiid_record(self): """bibupload - EXTOAIID tag, refuse to insert the same EXTOAIID record""" # initialize bibupload mode: if self.verbose: print "test_insert_the_same_extoaiid_record() started" # insert record 1 first time: testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) dummyerr1, recid1, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid1) inserted_xm = print_record(recid1, 'xm') inserted_hm = print_record(recid1, 'hm') # use real recID when comparing whether it worked: self.xm_testrec1 = self.xm_testrec1.replace('123456789', str(recid1)) self.hm_testrec1 = self.hm_testrec1.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1), '') # insert record 2 first time: testrec_to_insert_first = self.xm_testrec2.replace('987654321', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) dummyerr2, recid2, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid2) inserted_xm = print_record(recid2, 'xm') inserted_hm = print_record(recid2, 'hm') # use real recID when comparing whether it worked: self.xm_testrec2 = self.xm_testrec2.replace('987654321', str(recid2)) self.hm_testrec2 = self.hm_testrec2.replace('987654321', str(recid2)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec2), '') 
self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec2), '') # try to insert updated record 1, it should fail: recs = bibupload.xml_marc_to_records(self.xm_testrec1_to_update) dummyerr1_updated, recid1_updated, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.assertEqual(-1, recid1_updated) if self.verbose: print "test_insert_the_same_extoaiid_record() finished" def test_insert_or_replace_the_same_extoaiid_record(self): """bibupload - EXTOAIID tag, allow to insert or replace the same EXTOAIID record""" # initialize bibupload mode: if self.verbose: print "test_insert_or_replace_the_same_extoaiid_record() started" # insert/replace record 1 first time: testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) dummyerr1, recid1, _ = bibupload.bibupload_records(recs, opt_mode='replace_or_insert')[0] self.check_record_consistency(recid1) inserted_xm = print_record(recid1, 'xm') inserted_hm = print_record(recid1, 'hm') # use real recID in test buffers when comparing whether it worked: self.xm_testrec1 = self.xm_testrec1.replace('123456789', str(recid1)) self.hm_testrec1 = self.hm_testrec1.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1), '') # try to insert/replace updated record 1, it should be okay: recs = bibupload.xml_marc_to_records(self.xm_testrec1_to_update) dummyerr1_updated, recid1_updated, _ = bibupload.bibupload_records(recs, opt_mode='replace_or_insert')[0] self.check_record_consistency(recid1_updated) inserted_xm = print_record(recid1_updated, 'xm') inserted_hm = print_record(recid1_updated, 'hm') self.assertEqual(recid1, recid1_updated) # use real recID in test buffers when comparing whether it worked: self.xm_testrec1_updated = self.xm_testrec1_updated.replace('123456789', str(recid1)) self.hm_testrec1_updated = self.hm_testrec1_updated.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1_updated), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1_updated), '') if self.verbose: print "test_insert_or_replace_the_same_extoaiid_record() finished" def test_replace_nonexisting_extoaiid_record(self): """bibupload - EXTOAIID tag, refuse to replace non-existing EXTOAIID record""" # initialize bibupload mode: if self.verbose: print "test_replace_nonexisting_extoaiid_record() started" # insert record 1 first time: testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) dummyerr1, recid1, _ = bibupload.bibupload_records(recs, opt_mode='replace_or_insert')[0] self.check_record_consistency(recid1) inserted_xm = print_record(recid1, 'xm') inserted_hm = print_record(recid1, 'hm') # use real recID in test buffers when comparing whether it worked: self.xm_testrec1 = self.xm_testrec1.replace('123456789', str(recid1)) self.hm_testrec1 = self.hm_testrec1.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1), '') # try to replace record 2 it should fail: testrec_to_insert_first = self.xm_testrec2.replace('987654321', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) dummyerr2, recid2, _ = bibupload.bibupload_records(recs, opt_mode='replace')[0] self.assertEqual(-1, recid2) if self.verbose: print 
"test_replace_nonexisting_extoaiid_record() finished" class BibUploadRecordsWithOAIIDTest(GenericBibUploadTest): """Testing uploading of records that have OAI ID present.""" def setUp(self): """Initialize the MARCXML test records.""" GenericBibUploadTest.setUp(self) # Note that OAI fields are repeated but with different # subfields, this is to test whether bibupload would not # mistakenly pick up wrong values. self.xm_testrec1 = """ 123456789 SzGeCERN Bar, Baz Foo On the quux and huux 1 oai:foo:1 oai:foo:2 """ % {'oaitag': CFG_OAI_ID_FIELD[0:3], 'oaiind1': CFG_OAI_ID_FIELD[3:4] != "_" and \ CFG_OAI_ID_FIELD[3:4] or " ", 'oaiind2': CFG_OAI_ID_FIELD[4:5] != "_" and \ CFG_OAI_ID_FIELD[4:5] or " ", 'oaisubfieldcode': CFG_OAI_ID_FIELD[5:6], } self.hm_testrec1 = """ 001__ 123456789 003__ SzGeCERN 100__ $$aBar, Baz$$uFoo 245__ $$aOn the quux and huux 1 %(oaitag)s%(oaiind1)s%(oaiind2)s $$%(oaisubfieldcode)soai:foo:1 %(oaitag)s%(oaiind1)s%(oaiind2)s $$0oai:foo:2 """ % {'oaitag': CFG_OAI_ID_FIELD[0:3], 'oaiind1': CFG_OAI_ID_FIELD[3:4], 'oaiind2': CFG_OAI_ID_FIELD[4:5], 'oaisubfieldcode': CFG_OAI_ID_FIELD[5:6], } self.xm_testrec1_to_update = """ SzGeCERN Bar, Baz Foo On the quux and huux 1 Updated oai:foo:1 oai:foo:2 """ % {'oaitag': CFG_OAI_ID_FIELD[0:3], 'oaiind1': CFG_OAI_ID_FIELD[3:4] != "_" and \ CFG_OAI_ID_FIELD[3:4] or " ", 'oaiind2': CFG_OAI_ID_FIELD[4:5] != "_" and \ CFG_OAI_ID_FIELD[4:5] or " ", 'oaisubfieldcode': CFG_OAI_ID_FIELD[5:6], } self.xm_testrec1_updated = """ 123456789 SzGeCERN Bar, Baz Foo On the quux and huux 1 Updated oai:foo:1 oai:foo:2 """ % {'oaitag': CFG_OAI_ID_FIELD[0:3], 'oaiind1': CFG_OAI_ID_FIELD[3:4] != "_" and \ CFG_OAI_ID_FIELD[3:4] or " ", 'oaiind2': CFG_OAI_ID_FIELD[4:5] != "_" and \ CFG_OAI_ID_FIELD[4:5] or " ", 'oaisubfieldcode': CFG_OAI_ID_FIELD[5:6], } self.hm_testrec1_updated = """ 001__ 123456789 003__ SzGeCERN 100__ $$aBar, Baz$$uFoo 245__ $$aOn the quux and huux 1 Updated %(oaitag)s%(oaiind1)s%(oaiind2)s $$%(oaisubfieldcode)soai:foo:1 %(oaitag)s%(oaiind1)s%(oaiind2)s $$0oai:foo:2 """ % {'oaitag': CFG_OAI_ID_FIELD[0:3], 'oaiind1': CFG_OAI_ID_FIELD[3:4], 'oaiind2': CFG_OAI_ID_FIELD[4:5], 'oaisubfieldcode': CFG_OAI_ID_FIELD[5:6], } self.xm_testrec2 = """ 987654321 SzGeCERN Bar, Baz Foo On the quux and huux 2 oai:foo:2 oai:foo:1 """ % {'oaitag': CFG_OAI_ID_FIELD[0:3], 'oaiind1': CFG_OAI_ID_FIELD[3:4] != "_" and \ CFG_OAI_ID_FIELD[3:4] or " ", 'oaiind2': CFG_OAI_ID_FIELD[4:5] != "_" and \ CFG_OAI_ID_FIELD[4:5] or " ", 'oaisubfieldcode': CFG_OAI_ID_FIELD[5:6], } self.hm_testrec2 = """ 001__ 987654321 003__ SzGeCERN 100__ $$aBar, Baz$$uFoo 245__ $$aOn the quux and huux 2 %(oaitag)s%(oaiind1)s%(oaiind2)s $$%(oaisubfieldcode)soai:foo:2 %(oaitag)s%(oaiind1)s%(oaiind2)s $$0oai:foo:1 """ % {'oaitag': CFG_OAI_ID_FIELD[0:3], 'oaiind1': CFG_OAI_ID_FIELD[3:4], 'oaiind2': CFG_OAI_ID_FIELD[4:5], 'oaisubfieldcode': CFG_OAI_ID_FIELD[5:6], } def test_insert_the_same_oai_record(self): """bibupload - OAIID tag, refuse to insert the same OAI record""" # insert record 1 first time: testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) dummyerr1, recid1, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid1) inserted_xm = print_record(recid1, 'xm') inserted_hm = print_record(recid1, 'hm') # use real recID when comparing whether it worked: self.xm_testrec1 = self.xm_testrec1.replace('123456789', str(recid1)) self.hm_testrec1 = self.hm_testrec1.replace('123456789', 
str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1), '') # insert record 2 first time: testrec_to_insert_first = self.xm_testrec2.replace('987654321', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) dummyerr2, recid2, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid2) inserted_xm = print_record(recid2, 'xm') inserted_hm = print_record(recid2, 'hm') # use real recID when comparing whether it worked: self.xm_testrec2 = self.xm_testrec2.replace('987654321', str(recid2)) self.hm_testrec2 = self.hm_testrec2.replace('987654321', str(recid2)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec2), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec2), '') # try to insert updated record 1, it should fail: recs = bibupload.xml_marc_to_records(self.xm_testrec1_to_update) dummyerr1_updated, recid1_updated, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.assertEqual(-1, recid1_updated) def test_insert_or_replace_the_same_oai_record(self): """bibupload - OAIID tag, allow to insert or replace the same OAI record""" # initialize bibupload mode: # insert/replace record 1 first time: testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) dummyerr1, recid1, _ = bibupload.bibupload_records(recs, opt_mode='replace_or_insert')[0] self.check_record_consistency(recid1) inserted_xm = print_record(recid1, 'xm') inserted_hm = print_record(recid1, 'hm') # use real recID in test buffers when comparing whether it worked: self.xm_testrec1 = self.xm_testrec1.replace('123456789', str(recid1)) self.hm_testrec1 = self.hm_testrec1.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1), '') # try to insert/replace updated record 1, it should be okay: recs = bibupload.xml_marc_to_records(self.xm_testrec1_to_update) dummyerr1_updated, recid1_updated, _ = bibupload.bibupload_records(recs, opt_mode='replace_or_insert')[0] self.check_record_consistency(recid1_updated) inserted_xm = print_record(recid1_updated, 'xm') inserted_hm = print_record(recid1_updated, 'hm') self.assertEqual(recid1, recid1_updated) # use real recID in test buffers when comparing whether it worked: self.xm_testrec1_updated = self.xm_testrec1_updated.replace('123456789', str(recid1)) self.hm_testrec1_updated = self.hm_testrec1_updated.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1_updated), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1_updated), '') def test_replace_nonexisting_oai_record(self): """bibupload - OAIID tag, refuse to replace non-existing OAI record""" # insert record 1 first time: testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) dummyerr1, recid1, _ = bibupload.bibupload_records(recs, opt_mode='replace_or_insert')[0] self.check_record_consistency(recid1) inserted_xm = print_record(recid1, 'xm') inserted_hm = print_record(recid1, 'hm') # use real recID in test buffers when comparing whether it worked: self.xm_testrec1 = self.xm_testrec1.replace('123456789', str(recid1)) self.hm_testrec1 = self.hm_testrec1.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, 
self.xm_testrec1), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1), '') # try to replace record 2 it should fail: testrec_to_insert_first = self.xm_testrec2.replace('987654321', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) dummyerr2, recid2, _ = bibupload.bibupload_records(recs, opt_mode='replace')[0] self.assertEqual(-1, recid2) class BibUploadRecordsWithDOITest(GenericBibUploadTest): """Testing uploading of records with DOI.""" def setUp(self): """Initialize the MARCXML test records.""" GenericBibUploadTest.setUp(self) self.xm_testrec1 = """ 123456789 SzGeCERN doi 10.5170/123-456-789 nondoi 10.5170/123-456-789-0 Bar, Baz Foo On the quux and huux 1 """ % {'doitag': '024', 'doiind1': '7', 'doiind2': ' ', 'doisubfieldcodevalue': 'a', 'doisubfieldcodesource': '2' } self.hm_testrec1 = """ 001__ 123456789 003__ SzGeCERN %(doitag)s%(doiind1)s%(doiind2)s $$%(doisubfieldcodesource)sdoi$$%(doisubfieldcodevalue)s10.5170/123-456-789 %(doitag)s%(doiind1)s%(doiind2)s $$%(doisubfieldcodesource)snondoi$$%(doisubfieldcodevalue)s10.5170/123-456-789-0 100__ $$aBar, Baz$$uFoo 245__ $$aOn the quux and huux 1 """ % {'doitag': '024', 'doiind1': '7', 'doiind2': '_', 'doisubfieldcodevalue': 'a', 'doisubfieldcodesource': '2' } self.xm_testrec1_to_update = """ SzGeCERN doi 10.5170/123-456-789 nondoi 10.5170/123-456-789-0 Bar, Baz Foo On the quux and huux 1 Updated """ % {'doitag': '024', 'doiind1': '7', 'doiind2': ' ', 'doisubfieldcodevalue': 'a', 'doisubfieldcodesource': '2' } self.xm_testrec1_updated = """ 123456789 SzGeCERN doi 10.5170/123-456-789 nondoi 10.5170/123-456-789-0 Bar, Baz Foo On the quux and huux 1 Updated """ % {'doitag': '024', 'doiind1': '7', 'doiind2': ' ', 'doisubfieldcodevalue': 'a', 'doisubfieldcodesource': '2' } self.hm_testrec1_updated = """ 001__ 123456789 003__ SzGeCERN %(doitag)s%(doiind1)s%(doiind2)s $$%(doisubfieldcodesource)sdoi$$%(doisubfieldcodevalue)s10.5170/123-456-789 %(doitag)s%(doiind1)s%(doiind2)s $$%(doisubfieldcodesource)snondoi$$%(doisubfieldcodevalue)s10.5170/123-456-789-0 100__ $$aBar, Baz$$uFoo 245__ $$aOn the quux and huux 1 Updated """ % {'doitag': '024', 'doiind1': '7', 'doiind2': '_', 'doisubfieldcodevalue': 'a', 'doisubfieldcodesource': '2' } self.xm_testrec2 = """ 987654321 SzGeCERN doi 10.5170/987-654-321 Bar, Baz Foo On the quux and huux 2 """ % {'doitag': '024', 'doiind1': '7', 'doiind2': ' ', 'doisubfieldcodevalue': 'a', 'doisubfieldcodesource': '2' } self.hm_testrec2 = """ 001__ 987654321 003__ SzGeCERN %(doitag)s%(doiind1)s%(doiind2)s $$%(doisubfieldcodesource)sdoi$$%(doisubfieldcodevalue)s10.5170/987-654-321 100__ $$aBar, Baz$$uFoo 245__ $$aOn the quux and huux 2 """ % {'doitag': '024', 'doiind1': '7', 'doiind2': '_', 'doisubfieldcodevalue': 'a', 'doisubfieldcodesource': '2' } self.xm_testrec2_to_update = """ 987654321 SzGeCERN doi 10.5170/123-456-789 Bar, Baz Foo """ % {'doitag': '024', 'doiind1': '7', 'doiind2': ' ', 'doisubfieldcodevalue': 'a', 'doisubfieldcodesource': '2' } self.xm_testrec3 = """ 192837645 SzGeCERN doi 10.5170/123-456-789-0 Bar, Baz Foo On the quux and huux 4 """ % {'doitag': '024', 'doiind1': '7', 'doiind2': ' ', 'doisubfieldcodevalue': 'a', 'doisubfieldcodesource': '2' } self.hm_testrec3 = """ 001__ 192837645 003__ SzGeCERN %(doitag)s%(doiind1)s%(doiind2)s $$%(doisubfieldcodesource)sdoi$$%(doisubfieldcodevalue)s10.5170/123-456-789-0 100__ $$aBar, Baz$$uFoo 245__ $$aOn the quux and huux 4 """ % {'doitag': '024', 'doiind1': '7', 'doiind2': '_', 'doisubfieldcodevalue': 'a', 
'doisubfieldcodesource': '2' } self.xm_testrec4 = """ SzGeCERN doi 10.5170/123-456-789-non-existing Bar, Baz Foo On the quux and huux 5 """ % {'doitag': '024', 'doiind1': '7', 'doiind2': ' ', 'doisubfieldcodevalue': 'a', 'doisubfieldcodesource': '2' } self.xm_testrec5 = """ 123456789 SzGeCERN doi 10.5170/123-456-789 doi 10.5170/987-654-321 Bar, Baz Foo On the quux and huux 6 """ % {'doitag': '024', 'doiind1': '7', 'doiind2': ' ', 'doisubfieldcodevalue': 'a', 'doisubfieldcodesource': '2' } def test_insert_the_same_doi_matching_on_doi(self): """bibupload - DOI tag, refuse to "insert" twice same DOI (matching on DOI)""" # insert record 1 first time: testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) err1, recid1, msg1 = bibupload.bibupload(recs[0], opt_mode='insert') self.check_record_consistency(recid1) inserted_xm = print_record(recid1, 'xm') inserted_hm = print_record(recid1, 'hm') # use real recID when comparing whether it worked: self.xm_testrec1 = self.xm_testrec1.replace('123456789', str(recid1)) self.hm_testrec1 = self.hm_testrec1.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1), '') # insert record 2 first time: testrec_to_insert_first = self.xm_testrec2.replace('987654321', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) err2, recid2, msg2 = bibupload.bibupload(recs[0], opt_mode='insert') self.check_record_consistency(recid2) inserted_xm = print_record(recid2, 'xm') inserted_hm = print_record(recid2, 'hm') # use real recID when comparing whether it worked: self.xm_testrec2 = self.xm_testrec2.replace('987654321', str(recid2)) self.hm_testrec2 = self.hm_testrec2.replace('987654321', str(recid2)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec2), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec2), '') # try to insert again record 1 (without recid, matching on DOI) testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) err1_updated, recid1_updated, msg1_updated = bibupload.bibupload(recs[0], opt_mode='insert') self.assertEqual(-1, recid1_updated) # if we try to update, append or correct, the same record is matched recs = bibupload.xml_marc_to_records(self.xm_testrec1_to_update) err1_updated, recid1_updated, msg1_updated = bibupload.bibupload(recs[0], opt_mode='correct') self.check_record_consistency(recid1_updated) self.assertEqual(recid1, recid1_updated) err1_updated, recid1_updated, msg1_updated = bibupload.bibupload(recs[0], opt_mode='append') self.check_record_consistency(recid1_updated) self.assertEqual(recid1, recid1_updated) err1_updated, recid1_updated, msg1_updated = bibupload.bibupload(recs[0], opt_mode='replace') self.check_record_consistency(recid1_updated) self.assertEqual(recid1, recid1_updated) def test_insert_the_same_doi_matching_on_recid(self): """bibupload - DOI tag, refuse to "insert" twice same DOI (matching on recid)""" # First upload 2 test records testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) err1, recid1, msg1 = bibupload.bibupload(recs[0], opt_mode='insert') self.check_record_consistency(recid1) testrec_to_insert_first = self.xm_testrec2.replace('987654321', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) err2, recid2, msg2 = 
bibupload.bibupload(recs[0], opt_mode='insert') self.check_record_consistency(recid2) # try to update record 2 with DOI already in record 1. It must fail: testrec_to_update = self.xm_testrec2_to_update.replace('987654321', '%s' % recid2) recs = bibupload.xml_marc_to_records(testrec_to_update) err, recid, msg = bibupload.bibupload(recs[0], opt_mode='replace') self.check_record_consistency(recid) self.assertEqual(1, err) # Ditto in correct and append mode recs = bibupload.xml_marc_to_records(testrec_to_update) err, recid, msg = bibupload.bibupload(recs[0], opt_mode='correct') self.check_record_consistency(recid) self.assertEqual(1, err) recs = bibupload.xml_marc_to_records(testrec_to_update) err, recid, msg = bibupload.bibupload(recs[0], opt_mode='append') self.check_record_consistency(recid) self.assertEqual(1, err) def test_insert_or_replace_the_same_doi_record(self): """bibupload - DOI tag, allow to insert or replace matching on DOI""" # insert/replace record 1 first time: testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) err1, recid1, msg1 = bibupload.bibupload(recs[0], opt_mode='replace_or_insert') self.check_record_consistency(recid1) inserted_xm = print_record(recid1, 'xm') inserted_hm = print_record(recid1, 'hm') # use real recID in test buffers when comparing whether it worked: self.xm_testrec1 = self.xm_testrec1.replace('123456789', str(recid1)) self.hm_testrec1 = self.hm_testrec1.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1), '') # try to insert/replace updated record 1, it should be okay: recs = bibupload.xml_marc_to_records(self.xm_testrec1_to_update) err1_updated, recid1_updated, msg1_updated = bibupload.bibupload(recs[0], opt_mode='replace_or_insert') self.check_record_consistency(recid1_updated) inserted_xm = print_record(recid1_updated, 'xm') inserted_hm = print_record(recid1_updated, 'hm') self.assertEqual(recid1, recid1_updated) # use real recID in test buffers when comparing whether it worked: self.xm_testrec1_updated = self.xm_testrec1_updated.replace('123456789', str(recid1)) self.hm_testrec1_updated = self.hm_testrec1_updated.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1_updated), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1_updated), '') def test_correct_the_same_doi_record(self): """bibupload - DOI tag, allow to correct matching on DOI""" # insert/replace record 1 first time: testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) err1, recid1, msg1 = bibupload.bibupload(recs[0], opt_mode='replace_or_insert') self.check_record_consistency(recid1) inserted_xm = print_record(recid1, 'xm') inserted_hm = print_record(recid1, 'hm') # use real recID in test buffers when comparing whether it worked: self.xm_testrec1 = self.xm_testrec1.replace('123456789', str(recid1)) self.hm_testrec1 = self.hm_testrec1.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1), '') # try to correct updated record 1, it should be okay: recs = bibupload.xml_marc_to_records(self.xm_testrec1_to_update) err1_updated, recid1_updated, msg1_updated = bibupload.bibupload(recs[0], opt_mode='correct') 
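# the incoming MARCXML carries no recid, so correct mode is expected to match
# the existing record via its DOI (recid1 == recid1_updated is asserted below)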
self.check_record_consistency(recid1_updated) inserted_xm = print_record(recid1_updated, 'xm') inserted_hm = print_record(recid1_updated, 'hm') self.assertEqual(recid1, recid1_updated) # use real recID in test buffers when comparing whether it worked: self.xm_testrec1_updated = self.xm_testrec1_updated.replace('123456789', str(recid1)) self.hm_testrec1_updated = self.hm_testrec1_updated.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1_updated), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1_updated), '') def test_replace_nonexisting_doi_record(self): """bibupload - DOI tag, refuse to replace non-existing DOI record (matching on DOI)""" testrec_to_insert_first = self.xm_testrec4 recs = bibupload.xml_marc_to_records(testrec_to_insert_first) err4, recid4, msg4 = bibupload.bibupload(recs[0], opt_mode='replace') self.assertEqual(-1, recid4) def test_matching_on_doi_source_field(self): """bibupload - DOI tag, test matching records using DOI value AND source field ($2)""" # insert test record 1, with a "fake" doi (not "doi" in source field): testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) err1, recid1, msg1 = bibupload.bibupload(recs[0], opt_mode='insert') self.check_record_consistency(recid1) inserted_xm = print_record(recid1, 'xm') inserted_hm = print_record(recid1, 'hm') # use real recID when comparing whether it worked: self.xm_testrec1 = self.xm_testrec1.replace('123456789', str(recid1)) self.hm_testrec1 = self.hm_testrec1.replace('123456789', str(recid1)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec1), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec1), '') # insert record 3, which matches record 1 "fake" doi, so it # should work. testrec_to_insert_first = self.xm_testrec3.replace('192837645', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) err3, recid3, msg3 = bibupload.bibupload(recs[0], opt_mode='insert') self.check_record_consistency(recid3) inserted_xm = print_record(recid3, 'xm') inserted_hm = print_record(recid3, 'hm') # use real recID when comparing whether it worked: self.xm_testrec3 = self.xm_testrec3.replace('192837645', str(recid3)) self.hm_testrec3 = self.hm_testrec3.replace('192837645', str(recid3)) self.assertEqual(compare_xmbuffers(inserted_xm, self.xm_testrec3), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.hm_testrec3), '') def test_replace_or_update_record__with_ambiguous_doi(self): """bibupload - DOI tag, refuse to replace/correct/append on the basis of ambiguous DOI""" # First upload 2 test records with two different DOIs: testrec_to_insert_first = self.xm_testrec1.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) err1, recid1, msg1 = bibupload.bibupload(recs[0], opt_mode='insert') self.check_record_consistency(recid1) self.assertEqual(0, err1) testrec_to_insert_first = self.xm_testrec2.replace('987654321', '') recs = bibupload.xml_marc_to_records(testrec_to_insert_first) err2, recid2, msg2 = bibupload.bibupload(recs[0], opt_mode='insert') self.check_record_consistency(recid2) self.assertEqual(0, err2) # Now try to insert record with DOIs matching the records # previously uploaded. It must fail. 
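# xm_testrec5 carries both DOIs at once (10.5170/123-456-789 from record 1 and
# 10.5170/987-654-321 from record 2), so DOI matching resolves to two different
# existing records; bibupload is expected to report an error (err == 1) in
# every mode exercised below.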
testrec = self.xm_testrec5.replace('123456789', '') recs = bibupload.xml_marc_to_records(testrec) err5, recid5, msg5 = bibupload.bibupload(recs[0], opt_mode='insert') self.assertEqual(1, err5) # Ditto for other modes: recs = bibupload.xml_marc_to_records(testrec) err5, recid5, msg5 = bibupload.bibupload(recs[0], opt_mode='replace_or_insert') self.assertEqual(1, err5) recs = bibupload.xml_marc_to_records(testrec) err5, recid5, msg5 = bibupload.bibupload(recs[0], opt_mode='replace') self.assertEqual(1, err5) recs = bibupload.xml_marc_to_records(testrec) err5, recid5, msg5 = bibupload.bibupload(recs[0], opt_mode='correct') self.assertEqual(1, err5) recs = bibupload.xml_marc_to_records(testrec) err5, recid5, msg5 = bibupload.bibupload(recs[0], opt_mode='append') self.assertEqual(1, err5) # The same is true if a recid exists in the input MARCXML (as # long as DOIs are ambiguous): testrec = self.xm_testrec5.replace('123456789', '%s' % recid1) recs = bibupload.xml_marc_to_records(testrec) err5, recid5, msg5 = bibupload.bibupload(recs[0], opt_mode='replace_or_insert') self.assertEqual(1, err5) recs = bibupload.xml_marc_to_records(testrec) err5, recid5, msg5 = bibupload.bibupload(recs[0], opt_mode='replace') self.assertEqual(1, err5) recs = bibupload.xml_marc_to_records(testrec) err5, recid5, msg5 = bibupload.bibupload(recs[0], opt_mode='correct') self.assertEqual(1, err5) recs = bibupload.xml_marc_to_records(testrec) err5, recid5, msg5 = bibupload.bibupload(recs[0], opt_mode='append') self.assertEqual(1, err5) class BibUploadIndicatorsTest(GenericBibUploadTest): """ Testing uploading of a MARCXML record with indicators having either blank space (as per MARC schema) or empty string value (old behaviour). """ def setUp(self): """Initialize the MARCXML test record.""" GenericBibUploadTest.setUp(self) self.testrec1_xm = """ SzGeCERN Test, John Test University """ self.testrec1_hm = """ 003__ SzGeCERN 100__ $$aTest, John$$uTest University """ self.testrec2_xm = """ SzGeCERN Test, John Test University """ self.testrec2_hm = """ 003__ SzGeCERN 100__ $$aTest, John$$uTest University """ def test_record_with_spaces_in_indicators(self): """bibupload - inserting MARCXML with spaces in indicators""" recs = bibupload.xml_marc_to_records(self.testrec1_xm) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) inserted_xm = print_record(recid, 'xm') inserted_hm = print_record(recid, 'hm') self.assertEqual(compare_xmbuffers(remove_tag_001_from_xmbuffer(inserted_xm), self.testrec1_xm), '') self.assertEqual(compare_hmbuffers(remove_tag_001_from_hmbuffer(inserted_hm), self.testrec1_hm), '') def test_record_with_no_spaces_in_indicators(self): """bibupload - inserting MARCXML with no spaces in indicators""" recs = bibupload.xml_marc_to_records(self.testrec2_xm) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) inserted_xm = print_record(recid, 'xm') inserted_hm = print_record(recid, 'hm') self.assertEqual(compare_xmbuffers(remove_tag_001_from_xmbuffer(inserted_xm), self.testrec2_xm), '') self.assertEqual(compare_hmbuffers(remove_tag_001_from_hmbuffer(inserted_hm), self.testrec2_hm), '') class BibUploadUpperLowerCaseTest(GenericBibUploadTest): """ Testing treatment of similar records with only upper and lower case value differences in the bibxxx table. 
""" def setUp(self): """Initialize the MARCXML test records.""" GenericBibUploadTest.setUp(self) self.testrec1_xm = """ SzGeCERN Test, John Test University """ self.testrec1_hm = """ 003__ SzGeCERN 100__ $$aTest, John$$uTest University """ self.testrec2_xm = """ SzGeCERN TeSt, JoHn Test UniVeRsity """ self.testrec2_hm = """ 003__ SzGeCERN 100__ $$aTeSt, JoHn$$uTest UniVeRsity """ def test_record_with_upper_lower_case_letters(self): """bibupload - inserting similar MARCXML records with upper/lower case""" # insert test record #1: recs = bibupload.xml_marc_to_records(self.testrec1_xm) dummyerr1, recid1, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid1) recid1_inserted_xm = print_record(recid1, 'xm') recid1_inserted_hm = print_record(recid1, 'hm') # insert test record #2: recs = bibupload.xml_marc_to_records(self.testrec2_xm) dummyerr1, recid2, _ = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid2) recid2_inserted_xm = print_record(recid2, 'xm') recid2_inserted_hm = print_record(recid2, 'hm') # let us compare stuff now: self.assertEqual(compare_xmbuffers(remove_tag_001_from_xmbuffer(recid1_inserted_xm), self.testrec1_xm), '') self.assertEqual(compare_hmbuffers(remove_tag_001_from_hmbuffer(recid1_inserted_hm), self.testrec1_hm), '') self.assertEqual(compare_xmbuffers(remove_tag_001_from_xmbuffer(recid2_inserted_xm), self.testrec2_xm), '') self.assertEqual(compare_hmbuffers(remove_tag_001_from_hmbuffer(recid2_inserted_hm), self.testrec2_hm), '') class BibUploadControlledProvenanceTest(GenericBibUploadTest): """Testing treatment of tags under controlled provenance in the correct mode.""" def setUp(self): """Initialize the MARCXML test record.""" GenericBibUploadTest.setUp(self) self.testrec1_xm = """ 123456789 SzGeCERN Test, Jane Test Institute Test title blabla sam blublu sim human """ self.testrec1_hm = """ 001__ 123456789 003__ SzGeCERN 100__ $$aTest, Jane$$uTest Institute 245__ $$aTest title 6531_ $$9sam$$ablabla 6531_ $$9sim$$ablublu 6531_ $$ahuman """ self.testrec1_xm_to_correct = """ 123456789 bleble sim bloblo som """ self.testrec1_corrected_xm = """ 123456789 SzGeCERN Test, Jane Test Institute Test title blabla sam human bleble sim bloblo som """ self.testrec1_corrected_hm = """ 001__ 123456789 003__ SzGeCERN 100__ $$aTest, Jane$$uTest Institute 245__ $$aTest title 6531_ $$9sam$$ablabla 6531_ $$ahuman 6531_ $$9sim$$ableble 6531_ $$9som$$abloblo """ # insert test record: test_record_xm = self.testrec1_xm.replace('123456789', '') recs = bibupload.xml_marc_to_records(test_record_xm) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recID: self.testrec1_xm = self.testrec1_xm.replace('123456789', str(recid)) self.testrec1_hm = self.testrec1_hm.replace('123456789', str(recid)) self.testrec1_xm_to_correct = self.testrec1_xm_to_correct.replace('123456789', str(recid)) self.testrec1_corrected_xm = self.testrec1_corrected_xm.replace('123456789', str(recid)) self.testrec1_corrected_hm = self.testrec1_corrected_hm.replace('123456789', str(recid)) # test of the inserted record: inserted_xm = print_record(recid, 'xm') inserted_hm = print_record(recid, 'hm') self.assertEqual(compare_xmbuffers(inserted_xm, self.testrec1_xm), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.testrec1_hm), '') def test_controlled_provenance_persistence(self): """bibupload - correct mode, tags with controlled provenance""" # 
correct metadata tags; will the protected tags be kept? recs = bibupload.xml_marc_to_records(self.testrec1_xm_to_correct) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='correct')[0] self.check_record_consistency(recid) corrected_xm = print_record(recid, 'xm') corrected_hm = print_record(recid, 'hm') # did it work? self.assertEqual(compare_xmbuffers(corrected_xm, self.testrec1_corrected_xm), '') self.assertEqual(compare_hmbuffers(corrected_hm, self.testrec1_corrected_hm), '') class BibUploadStrongTagsTest(GenericBibUploadTest): """Testing treatment of strong tags and the replace mode.""" def setUp(self): """Initialize the MARCXML test record.""" GenericBibUploadTest.setUp(self) self.testrec1_xm = """ 123456789 SzGeCERN Test, Jane Test Institute Test title A value Another value """ % {'strong_tag': bibupload.CFG_BIBUPLOAD_STRONG_TAGS[0]} self.testrec1_hm = """ 001__ 123456789 003__ SzGeCERN 100__ $$aTest, Jane$$uTest Institute 245__ $$aTest title %(strong_tag)s__ $$aA value$$bAnother value """ % {'strong_tag': bibupload.CFG_BIBUPLOAD_STRONG_TAGS[0]} self.testrec1_xm_to_replace = """ 123456789 Test, Joseph Test Academy """ self.testrec1_replaced_xm = """ 123456789 Test, Joseph Test Academy A value Another value """ % {'strong_tag': bibupload.CFG_BIBUPLOAD_STRONG_TAGS[0]} self.testrec1_replaced_hm = """ 001__ 123456789 100__ $$aTest, Joseph$$uTest Academy %(strong_tag)s__ $$aA value$$bAnother value """ % {'strong_tag': bibupload.CFG_BIBUPLOAD_STRONG_TAGS[0]} # insert test record: test_record_xm = self.testrec1_xm.replace('123456789', '') recs = bibupload.xml_marc_to_records(test_record_xm) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recID: self.testrec1_xm = self.testrec1_xm.replace('123456789', str(recid)) self.testrec1_hm = self.testrec1_hm.replace('123456789', str(recid)) self.testrec1_xm_to_replace = self.testrec1_xm_to_replace.replace('123456789', str(recid)) self.testrec1_replaced_xm = self.testrec1_replaced_xm.replace('123456789', str(recid)) self.testrec1_replaced_hm = self.testrec1_replaced_hm.replace('123456789', str(recid)) # test of the inserted record: inserted_xm = print_record(recid, 'xm') inserted_hm = print_record(recid, 'hm') self.assertEqual(compare_xmbuffers(inserted_xm, self.testrec1_xm), '') self.assertEqual(compare_hmbuffers(inserted_hm, self.testrec1_hm), '') def test_strong_tags_persistence(self): """bibupload - strong tags, persistence in replace mode""" # replace all metadata tags; will the strong tags be kept? recs = bibupload.xml_marc_to_records(self.testrec1_xm_to_replace) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='replace')[0] self.check_record_consistency(recid) replaced_xm = print_record(recid, 'xm') replaced_hm = print_record(recid, 'hm') # did it work? self.assertEqual(compare_xmbuffers(replaced_xm, self.testrec1_replaced_xm), '') self.assertEqual(compare_hmbuffers(replaced_hm, self.testrec1_replaced_hm), '') class BibUploadPretendTest(GenericBibUploadTest): """ Testing bibupload --pretend correctness. 
""" def setUp(self): GenericBibUploadTest.setUp(self) self.demo_data = bibupload.xml_marc_to_records(open(os.path.join(CFG_TMPDIR, 'demobibdata.xml')).read())[0] self.before = self._get_tables_fingerprint() task_set_task_param('pretend', True) def tearDown(self): GenericBibUploadTest.tearDown(self) task_set_task_param('pretend', False) @staticmethod def _get_tables_fingerprint(): """ Take lenght and last modification time of all the tables that might be touched by bibupload and return them in a nice structure. """ fingerprint = {} tables = ['bibrec', 'bibdoc', 'bibrec_bibdoc', 'bibdoc_bibdoc', 'bibfmt', 'hstDOCUMENT', 'hstRECORD', 'bibHOLDINGPEN', 'bibdocmoreinfo', 'bibdocfsinfo'] for i in xrange(100): tables.append('bib%02dx' % i) tables.append('bibrec_bib%02dx' % i) for table in tables: fingerprint[table] = get_table_status_info(table) return fingerprint @staticmethod def _checks_tables_fingerprints(before, after): """ Checks differences in table_fingerprints. """ for table in before.keys(): if before[table] != after[table]: raise StandardError("Table %s has been modified: before was [%s], after was [%s]" % (table, pprint.pformat(before[table]), pprint.pformat(after[table]))) return True def test_pretend_insert(self): """bibupload - pretend insert""" bibupload.bibupload_records([self.demo_data], opt_mode='insert', pretend=True) self.failUnless(self._checks_tables_fingerprints(self.before, self._get_tables_fingerprint())) def test_pretend_correct(self): """bibupload - pretend correct""" bibupload.bibupload_records([self.demo_data], opt_mode='correct', pretend=True) self.failUnless(self._checks_tables_fingerprints(self.before, self._get_tables_fingerprint())) def test_pretend_replace(self): """bibupload - pretend replace""" bibupload.bibupload_records([self.demo_data], opt_mode='replace', pretend=True) self.failUnless(self._checks_tables_fingerprints(self.before, self._get_tables_fingerprint())) def test_pretend_append(self): """bibupload - pretend append""" bibupload.bibupload_records([self.demo_data], opt_mode='append', pretend=True) self.failUnless(self._checks_tables_fingerprints(self.before, self._get_tables_fingerprint())) def test_pretend_replace_or_insert(self): """bibupload - pretend replace or insert""" bibupload.bibupload_records([self.demo_data], opt_mode='replace_or_insert', pretend=True) self.failUnless(self._checks_tables_fingerprints(self.before, self._get_tables_fingerprint())) def test_pretend_holdingpen(self): """bibupload - pretend holdingpen""" bibupload.bibupload_records([self.demo_data], opt_mode='holdingpen', pretend=True) self.failUnless(self._checks_tables_fingerprints(self.before, self._get_tables_fingerprint())) def test_pretend_delete(self): """bibupload - pretend delete""" bibupload.bibupload_records([self.demo_data], opt_mode='delete', pretend=True) self.failUnless(self._checks_tables_fingerprints(self.before, self._get_tables_fingerprint())) def test_pretend_reference(self): """bibupload - pretend reference""" bibupload.bibupload_records([self.demo_data], opt_mode='reference', pretend=True) self.failUnless(self._checks_tables_fingerprints(self.before, self._get_tables_fingerprint())) class BibUploadHoldingPenTest(GenericBibUploadTest): """ Testing the Holding Pen usage. 
""" def setUp(self): GenericBibUploadTest.setUp(self) self.verbose = 9 setup_loggers() task_set_task_param('verbose', self.verbose) self.recid = 10 self.oai_id = "oai:cds.cern.ch:CERN-EP-2001-094" def test_holding_pen_upload_with_recid(self): """bibupload - holding pen upload with recid""" test_to_upload = """ %s Kleefeld, F Newcomer, Y Rupp, G Scadron, M D """ % self.recid recs = bibupload.xml_marc_to_records(test_to_upload) bibupload.insert_record_into_holding_pen(recs[0], "") res = run_sql("SELECT changeset_xml FROM bibHOLDINGPEN WHERE id_bibrec=%s", (self.recid, )) self.failUnless("Rupp, G" in zlib.decompress(res[0][0])) def test_holding_pen_upload_with_oai_id(self): """bibupload - holding pen upload with oai_id""" test_to_upload = """ Kleefeld, F Newcomer, Y Rupp, G Scadron, M D %(value)s """ % {'extoaiidtag': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[0:3], 'extoaiidind1': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4] or " ", 'extoaiidind2': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5] or " ", 'extoaiidsubfieldcode': CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5:6], 'value': self.oai_id } recs = bibupload.xml_marc_to_records(test_to_upload) bibupload.insert_record_into_holding_pen(recs[0], self.oai_id) res = run_sql("SELECT changeset_xml FROM bibHOLDINGPEN WHERE id_bibrec=%s AND oai_id=%s", (self.recid, self.oai_id)) self.failUnless("Rupp, G" in zlib.decompress(res[0][0])) def tearDown(self): GenericBibUploadTest.tearDown(self) run_sql("DELETE FROM bibHOLDINGPEN WHERE id_bibrec=%s", (self.recid, )) class BibUploadFFTModeTest(GenericBibUploadTest): """ Testing treatment of fulltext file transfer import mode. """ def _test_bibdoc_status(self, recid, docname, status): res = run_sql('SELECT bd.status FROM bibrec_bibdoc as bb JOIN bibdoc as bd ON bb.id_bibdoc = bd.id WHERE bb.id_bibrec = %s AND bb.docname = %s', (recid, docname)) self.failUnless(res) self.assertEqual(status, res[0][0]) def test_writing_rights(self): """bibupload - FFT has writing rights""" self.failUnless(bibupload.writing_rights_p()) def test_simple_fft_insert(self): """bibupload - simple FFT insert""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University %(siteurl)s/img/site_logo.gif """ % { 'siteurl': CFG_SITE_URL } testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University 2032 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif """ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) self.failUnless(try_url_download(testrec_expected_url)) def test_fft_insert_with_valid_embargo(self): """bibupload - FFT insert with valid embargo""" # define the test case: future_date = time.strftime('%Y-%m-%d', time.gmtime(time.time() + 24 
* 3600 * 2)) test_to_upload = """ SzGeCERN Test, John Test University %(siteurl)s/img/site_logo.gif firerole: deny until '%(future_date)s' allow any """ % { 'future_date': future_date, 'siteurl': CFG_SITE_URL } testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University 2032 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif """ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) result = urlopen(testrec_expected_url).read() self.failUnless("This file is restricted." in result, result) def test_fft_insert_with_expired_embargo(self): """bibupload - FFT insert with expired embargo""" # define the test case: past_date = time.strftime('%Y-%m-%d', time.gmtime(time.time() - 24 * 3600 * 2)) test_to_upload = """ SzGeCERN Test, John Test University ARTICLE %(siteurl)s/img/site_logo.gif firerole: deny until '%(past_date)s' allow any """ % { 'past_date': past_date, 'siteurl': CFG_SITE_URL } testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University 2032 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif ARTICLE """ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) result = urlopen(testrec_expected_url).read() self.failIf("If you already have an account, please login using the form below." 
in result, result) self.assertEqual(test_web_page_content(testrec_expected_url, 'hyde', 'h123yde', expected_text='Authorization failure'), []) self.force_webcoll(recid) self.assertEqual(test_web_page_content(testrec_expected_url, 'hyde', 'h123yde', expected_text=urlopen("%(siteurl)s/img/site_logo.gif" % { 'siteurl': CFG_SITE_URL }).read()), []) def test_exotic_format_fft_append(self): """bibupload - exotic format FFT append""" # define the test case: testfile = os.path.join(CFG_TMPDIR, 'test.ps.Z') open(testfile, 'w').write('TEST') email_tag = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][0:3] email_ind1 = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][3] email_ind2 = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][4] email_code = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][5] test_to_upload = """ SzGeCERN Test, John Test University jekyll@cds.cern.ch """ % { 'email_tag': email_tag, 'email_ind1': email_ind1 == '_' and ' ' or email_ind1, 'email_ind2': email_ind2 == '_' and ' ' or email_ind2, 'email_code': email_code} testrec_to_append = """ 123456789 %s """ % testfile testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University jekyll@cds.cern.ch 4 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/test.ps.Z """ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD, 'email_tag': email_tag, 'email_ind1': email_ind1 == '_' and ' ' or email_ind1, 'email_ind2': email_ind2 == '_' and ' ' or email_ind2, 'email_code': email_code} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/test.ps.Z" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url2 = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/test?format=ps.Z" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_to_append = testrec_to_append.replace('123456789', str(recid)) testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) testrec_expected_url2 = testrec_expected_url.replace('123456789', str(recid)) recs = bibupload.xml_marc_to_records(testrec_to_append) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='append')[0] self.check_record_consistency(recid) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) self.assertEqual(test_web_page_content(testrec_expected_url, 'jekyll', 'j123ekyll', expected_text='TEST'), []) self.assertEqual(test_web_page_content(testrec_expected_url2, 'jekyll', 'j123ekyll', expected_text='TEST'), []) def test_fft_check_md5_through_bibrecdoc_str(self): """bibupload - simple FFT insert, check md5 through BibRecDocs.str()""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University %s/img/head.gif """ % CFG_SITE_URL # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) original_md5 = md5(urlopen('%s/img/head.gif' % CFG_SITE_URL).read()).hexdigest() bibrec_str = str(BibRecDocs(int(recid))) md5_found = False for row in 
bibrec_str.split('\n'): if 'checksum' in row: if original_md5 in row: md5_found = True self.failUnless(md5_found) def test_detailed_fft_insert(self): """bibupload - detailed FFT insert""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University %(siteurl)s/img/site_logo.gif SuperMain This is a description This is a comment CIDIESSE %(siteurl)s/img/rss.png SuperMain .jpeg This is a description This is a second comment CIDIESSE """ % { 'siteurl': CFG_SITE_URL } testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University 2032 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/CIDIESSE.gif This is a description This is a comment 530 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/CIDIESSE.jpeg This is a description This is a second comment """ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url1 = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/CIDIESSE.gif" % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url2 = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/CIDIESSE.jpeg" % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url1 = testrec_expected_url1.replace('123456789', str(recid)) testrec_expected_url2 = testrec_expected_url1.replace('123456789', str(recid)) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) self.failUnless(try_url_download(testrec_expected_url1)) self.failUnless(try_url_download(testrec_expected_url2)) def test_simple_fft_insert_with_restriction(self): """bibupload - simple FFT insert with restriction""" # define the test case: email_tag = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][0:3] email_ind1 = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][3] email_ind2 = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][4] email_code = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][5] test_to_upload = """ SzGeCERN Test, John Test University jekyll@cds.cern.ch ARTICLE %(siteurl)s/img/site_logo.gif thesis %(siteurl)s/img/sb.gif """ % {'email_tag': email_tag, 'email_ind1': email_ind1 == '_' and ' ' or email_ind1, 'email_ind2': email_ind2 == '_' and ' ' or email_ind2, 'email_code': email_code, 'siteurl': CFG_SITE_URL} testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University jekyll@cds.cern.ch 2032 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif 79 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif?subformat=icon icon ARTICLE """ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD, 'email_tag': email_tag, 'email_ind1': email_ind1 == '_' and ' ' or email_ind1, 'email_ind2': email_ind2 == '_' and ' ' or email_ind2, 'email_code': email_code} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_icon = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif?subformat=icon" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = 
bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) testrec_expected_icon = testrec_expected_icon.replace('123456789', str(recid)) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) self.assertEqual(test_web_page_content(testrec_expected_icon, 'jekyll', 'j123ekyll', expected_text=urlopen('%(siteurl)s/img/sb.gif' % { 'siteurl': CFG_SITE_URL }).read()), []) self.assertEqual(test_web_page_content(testrec_expected_icon, 'hyde', 'h123yde', expected_text='Authorization failure'), []) self.force_webcoll(recid) self.assertEqual(test_web_page_content(testrec_expected_icon, 'hyde', 'h123yde', expected_text=urlopen('%(siteurl)s/img/restricted.gif' % {'siteurl': CFG_SITE_URL}).read()), []) self.failUnless("HTTP Error 401: Unauthorized" in test_web_page_content(testrec_expected_url, 'hyde', 'h123yde')[0]) self.failUnless("This file is restricted." in urlopen(testrec_expected_url).read()) def test_simple_fft_insert_with_icon(self): """bibupload - simple FFT insert with icon""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University %(siteurl)s/img/site_logo.gif %(siteurl)s/img/sb.gif """ % { 'siteurl': CFG_SITE_URL } testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University 2032 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif 79 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif?subformat=icon icon """ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_icon = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif?subformat=icon" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) testrec_expected_icon = testrec_expected_icon.replace('123456789', str(recid)) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) self.failUnless(try_url_download(testrec_expected_url)) self.failUnless(try_url_download(testrec_expected_icon)) def test_multiple_fft_insert(self): """bibupload - multiple FFT insert""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University %(siteurl)s/img/site_logo.gif %(siteurl)s/img/head.gif %(siteurl)s/%(CFG_SITE_RECORD)s/95/files/9809057.pdf %(prefix)s/var/tmp/demobibdata.xml """ % { 'prefix': CFG_PREFIX, 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD, } testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University 295078 
%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/9809057.pdf %(sizeofdemobibdata)s %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/demobibdata.xml 208 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/head.gif 2032 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif """ % { 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD, 'sizeofdemobibdata': os.path.getsize(os.path.join(CFG_TMPDIR, "demobibdata.xml"))} # insert test record: testrec_expected_urls = [] for files in ('site_logo.gif', 'head.gif', '9809057.pdf', 'demobibdata.xml'): testrec_expected_urls.append('%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/%(files)s' % {'siteurl' : CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD, 'files' : files}) recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_urls = [] for files in ('site_logo.gif', 'head.gif', '9809057.pdf', 'demobibdata.xml'): testrec_expected_urls.append('%(siteurl)s/%(CFG_SITE_RECORD)s/%(recid)s/files/%(files)s' % {'siteurl' : CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD, 'files' : files, 'recid' : recid}) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) for url in testrec_expected_urls: self.failUnless(try_url_download(url)) self._test_bibdoc_status(recid, 'head', '') self._test_bibdoc_status(recid, '9809057', '') self._test_bibdoc_status(recid, 'site_logo', '') self._test_bibdoc_status(recid, 'demobibdata', '') def test_simple_fft_correct(self): """bibupload - simple FFT correct""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University %(siteurl)s/img/site_logo.gif """ % { 'siteurl': CFG_SITE_URL } test_to_correct = """ 123456789 %(siteurl)s/img/sb.gif site_logo """ % { 'siteurl': CFG_SITE_URL } testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University 79 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif """ % { 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) test_to_correct = test_to_correct.replace('123456789', str(recid)) # correct test record with new FFT: recs = bibupload.xml_marc_to_records(test_to_correct) bibupload.bibupload_records(recs, opt_mode='correct')[0] self.check_record_consistency(recid) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(try_url_download(testrec_expected_url)) self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) self._test_bibdoc_status(recid, 'site_logo', '') def test_fft_correct_already_exists(self): """bibupload - FFT correct with 
already identical existing file""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University %(siteurl)s/img/site_logo.gif a description %(siteurl)s/img/help.png site_logo another description %(siteurl)s/img/rss.png %(siteurl)s/img/line.gif %(siteurl)s/img/merge.png line """ % { 'siteurl': CFG_SITE_URL } test_to_correct = """ 123456789 %(siteurl)s/img/site_logo.gif a second description %(siteurl)s/img/help.png site_logo another second description %(siteurl)s/img/refresh.png rss %(siteurl)s/img/line.gif %(siteurl)s/img/merge-small.png line """ % { 'siteurl': CFG_SITE_URL } testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University 35 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/line.gif 626 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/line.png 432 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/rss.png 2032 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif a second description 786 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.png another second description """ % { 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url2 = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/rss.png" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url3 = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.png" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url4 = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/line.png" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url5 = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/line.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) _, recid, _ = bibupload.bibupload(recs[0], opt_mode='insert') self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) testrec_expected_url2 = testrec_expected_url2.replace('123456789', str(recid)) testrec_expected_url3 = testrec_expected_url3.replace('123456789', str(recid)) testrec_expected_url4 = testrec_expected_url4.replace('123456789', str(recid)) testrec_expected_url5 = testrec_expected_url5.replace('123456789', str(recid)) test_to_correct = test_to_correct.replace('123456789', str(recid)) # correct test record with new FFT: recs = bibupload.xml_marc_to_records(test_to_correct) bibupload.bibupload(recs[0], opt_mode='correct') self.check_record_consistency(recid) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(try_url_download(testrec_expected_url)) self.failUnless(try_url_download(testrec_expected_url2)) self.failUnless(try_url_download(testrec_expected_url3)) self.failUnless(try_url_download(testrec_expected_url4)) self.failUnless(try_url_download(testrec_expected_url5)) self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) bibrecdocs = BibRecDocs(recid) self.failUnless(bibrecdocs.get_bibdoc('rss').list_versions(), [1, 2]) self.failUnless(bibrecdocs.get_bibdoc('site_logo').list_versions(), [1]) self.failUnless(bibrecdocs.get_bibdoc('line').list_versions(), 
[1, 2]) def test_fft_correct_modify_doctype(self): """bibupload - FFT correct with different doctype""" test_to_upload = """ SzGeCERN %(siteurl)s/img/site_logo.gif a description TEST1 """ % { 'siteurl': CFG_SITE_URL } test_to_correct = """ 123456789 site_logo TEST2 """ testrec_expected_xm = """ 123456789 SzGeCERN 2032 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif a description """ % { 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) _, recid, _ = bibupload.bibupload(recs[0], opt_mode='insert') # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) bibrecdocs = BibRecDocs(recid) self.failUnless(bibrecdocs.get_bibdoc('site_logo').doctype, 'TEST1') # correct test record with new FFT: recs = bibupload.xml_marc_to_records(test_to_correct) bibupload.bibupload(recs[0], opt_mode='correct') # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) bibrecdocs = BibRecDocs(recid) self.failUnless(bibrecdocs.get_bibdoc('site_logo').doctype, 'TEST2') def test_fft_append_already_exists(self): """bibupload - FFT append with already identical existing file""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University %(siteurl)s/img/site_logo.gif a description """ % { 'siteurl': CFG_SITE_URL } test_to_append = """ 123456789 %(siteurl)s/img/site_logo.gif a second description %(siteurl)s/img/help.png site_logo another second description """ % { 'siteurl': CFG_SITE_URL } testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University 2032 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif a description 786 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.png another second description """ % { 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url2 = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.png" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) _, recid, _ = bibupload.bibupload(recs[0], opt_mode='insert') self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) test_to_append = test_to_append.replace('123456789', str(recid)) # correct test record with new FFT: recs = bibupload.xml_marc_to_records(test_to_append) err, recid, msg = bibupload.bibupload(recs[0], opt_mode='append') self.check_record_consistency(recid) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(try_url_download(testrec_expected_url)) self.failUnless(try_url_download(testrec_expected_url2)) self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) def test_fft_implicit_fix_marc(self): """bibupload - FFT implicit FIX-MARC""" test_to_upload = """ SzGeCERN Test, John Test University foo@bar.com %(siteurl)s/img/site_logo.gif """ % { 'siteurl': 
CFG_SITE_URL } test_to_correct = """ 123456789 foo@bar.com %(siteurl)s/img/site_logo.gif %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif """ % { 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University foo@bar.com %(siteurl)s/img/site_logo.gif """ % { 'siteurl': CFG_SITE_URL } recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: test_to_correct = test_to_correct.replace('123456789', str(recid)) testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) # correct test record with implicit FIX-MARC: recs = bibupload.xml_marc_to_records(test_to_correct) bibupload.bibupload_records(recs, opt_mode='correct')[0] self.check_record_consistency(recid) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) def test_fft_vs_bibedit(self): """bibupload - FFT Vs. BibEdit compatibility""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University %(siteurl)s/img/site_logo.gif """ % { 'siteurl': CFG_SITE_URL } test_to_replace = """ 123456789 SzGeCERN Test, John Test University http://www.google.com/ 2032 BibEdit Comment %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif BibEdit Description 01 http://cern.ch/ """ % { 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_xm = str(test_to_replace) testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) test_to_replace = test_to_replace.replace('123456789', str(recid)) # correct test record with new FFT: recs = bibupload.xml_marc_to_records(test_to_replace) bibupload.bibupload_records(recs, opt_mode='replace')[0] self.check_record_consistency(recid) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(try_url_download(testrec_expected_url)) self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) self._test_bibdoc_status(recid, 'site_logo', '') bibrecdocs = BibRecDocs(recid) bibdoc = bibrecdocs.get_bibdoc('site_logo') self.assertEqual(bibdoc.get_description('.gif'), 'BibEdit Description') def test_detailed_fft_correct(self): """bibupload - detailed FFT correct """ # define the test case: test_to_upload = """ SzGeCERN Test, John Test University %(siteurl)s/img/site_logo.gif Try Comment """ % { 'siteurl': CFG_SITE_URL } test_to_correct = """ 123456789 %(siteurl)s/img/head.gif site_logo patata Next Try KEEP-OLD-VALUE """ % { 'siteurl': CFG_SITE_URL } testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University 208 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/patata.gif Next Try Comment """ % { 'siteurl': CFG_SITE_URL, 
'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/patata.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) test_to_correct = test_to_correct.replace('123456789', str(recid)) # correct test record with new FFT: recs = bibupload.xml_marc_to_records(test_to_correct) bibupload.bibupload_records(recs, opt_mode='correct') self.check_record_consistency(recid) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(try_url_download(testrec_expected_url)) self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) self._test_bibdoc_status(recid, 'patata', '') def test_no_url_fft_correct(self): """bibupload - no_url FFT correct""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University %(siteurl)s/img/site_logo.gif Try Comment """ % { 'siteurl': CFG_SITE_URL } test_to_correct = """ 123456789 site_logo patata .gif KEEP-OLD-VALUE Next Comment """ testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University 2032 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/patata.gif Try Next Comment """ % { 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/patata.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) test_to_correct = test_to_correct.replace('123456789', str(recid)) # correct test record with new FFT: recs = bibupload.xml_marc_to_records(test_to_correct) bibupload.bibupload_records(recs, opt_mode='correct')[0] self.check_record_consistency(recid) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(try_url_download(testrec_expected_url)) self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) self._test_bibdoc_status(recid, 'patata', '') def test_new_icon_fft_append(self): """bibupload - new icon FFT append""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University """ test_to_correct = """ 123456789 site_logo %(siteurl)s/img/site_logo.gif """ % { 'siteurl': CFG_SITE_URL } testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University 2032 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif?subformat=icon icon """ % { 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = 
bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) test_to_correct = test_to_correct.replace('123456789', str(recid)) # correct test record with new FFT: recs = bibupload.xml_marc_to_records(test_to_correct) bibupload.bibupload_records(recs, opt_mode='append')[0] self.check_record_consistency(recid) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) self.failUnless(try_url_download(testrec_expected_url)) self._test_bibdoc_status(recid, 'site_logo', '') def test_multiple_fft_correct(self): """bibupload - multiple FFT correct""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University %(siteurl)s/img/site_logo.gif Try Comment Restricted %(siteurl)s/img/okay.gif site_logo .jpeg Try jpeg Comment jpeg Restricted """ % { 'siteurl': CFG_SITE_URL } test_to_correct = """ 123456789 %(siteurl)s/img/loading.gif site_logo patata .gif New restricted """ % { 'siteurl': CFG_SITE_URL } testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University 9427 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/patata.gif """ % { 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/patata.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) test_to_correct = test_to_correct.replace('123456789', str(recid)) # correct test record with new FFT: recs = bibupload.xml_marc_to_records(test_to_correct) bibupload.bibupload_records(recs, opt_mode='correct')[0] self.check_record_consistency(recid) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless("This file is restricted." 
in urlopen(testrec_expected_url).read()) self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) self._test_bibdoc_status(recid, 'patata', 'New restricted') def test_purge_fft_correct(self): """bibupload - purge FFT correct""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University %(siteurl)s/img/site_logo.gif %(siteurl)s/img/head.gif """ % { 'siteurl': CFG_SITE_URL } test_to_correct = """ 123456789 %(siteurl)s/img/site_logo.gif """ % { 'siteurl': CFG_SITE_URL } test_to_purge = """ 123456789 %(siteurl)s/img/site_logo.gif PURGE """ % { 'siteurl': CFG_SITE_URL } testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University 2032 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif 208 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/head.gif """ % { 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) test_to_correct = test_to_correct.replace('123456789', str(recid)) test_to_purge = test_to_purge.replace('123456789', str(recid)) # correct test record with new FFT: recs = bibupload.xml_marc_to_records(test_to_correct) bibupload.bibupload_records(recs, opt_mode='correct')[0] self.check_record_consistency(recid) # purge test record with new FFT: recs = bibupload.xml_marc_to_records(test_to_purge) bibupload.bibupload_records(recs, opt_mode='correct') self.check_record_consistency(recid) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(try_url_download(testrec_expected_url)) self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) self._test_bibdoc_status(recid, 'site_logo', '') self._test_bibdoc_status(recid, 'head', '') def test_revert_fft_correct(self): """bibupload - revert FFT correct""" # define the test case: email_tag = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][0:3] email_ind1 = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][3] email_ind2 = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][4] email_code = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][5] test_to_upload = """ SzGeCERN Test, John Test University jekyll@cds.cern.ch %(siteurl)s/img/iconpen.gif site_logo """ % { 'siteurl': CFG_SITE_URL, 'email_tag': email_tag, 'email_ind1': email_ind1 == '_' and ' ' or email_ind1, 'email_ind2': email_ind2 == '_' and ' ' or email_ind2, 'email_code': email_code} test_to_correct = """ 123456789 %s/img/head.gif site_logo """ % CFG_SITE_URL test_to_revert = """ 123456789 site_logo REVERT 1 """ testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University jekyll@cds.cern.ch 171 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif """ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD, 'email_tag': email_tag, 'email_ind1': email_ind1 == '_' and ' ' or email_ind1, 'email_ind2': email_ind2 == '_' and ' ' or email_ind2, 
'email_code': email_code} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif" % { 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) test_to_correct = test_to_correct.replace('123456789', str(recid)) test_to_revert = test_to_revert.replace('123456789', str(recid)) # correct test record with new FFT: recs = bibupload.xml_marc_to_records(test_to_correct) bibupload.bibupload_records(recs, opt_mode='correct') self.check_record_consistency(recid) # revert test record with new FFT: recs = bibupload.xml_marc_to_records(test_to_revert) bibupload.bibupload_records(recs, opt_mode='correct') self.check_record_consistency(recid) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(try_url_download(testrec_expected_url)) self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) self._test_bibdoc_status(recid, 'site_logo', '') expected_content_version1 = urlopen('%s/img/iconpen.gif' % CFG_SITE_URL).read() expected_content_version2 = urlopen('%s/img/head.gif' % CFG_SITE_URL).read() expected_content_version3 = expected_content_version1 self.assertEqual(test_web_page_content('%s/%s/%s/files/site_logo.gif?version=1' % (CFG_SITE_URL, CFG_SITE_RECORD, recid), 'jekyll', 'j123ekyll', expected_content_version1), []) self.assertEqual(test_web_page_content('%s/%s/%s/files/site_logo.gif?version=2' % (CFG_SITE_URL, CFG_SITE_RECORD, recid), 'jekyll', 'j123ekyll', expected_content_version2), []) self.assertEqual(test_web_page_content('%s/%s/%s/files/site_logo.gif?version=3' % (CFG_SITE_URL, CFG_SITE_RECORD, recid), 'jekyll', 'j123ekyll', expected_content_version3), []) def test_simple_fft_replace(self): """bibupload - simple FFT replace""" # define the test case: email_tag = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][0:3] email_ind1 = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][3] email_ind2 = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][4] email_code = CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS[0][5] test_to_upload = """ SzGeCERN Test, John Test University jekyll@cds.cern.ch %(siteurl)s/img/iconpen.gif site_logo """ % {'siteurl': CFG_SITE_URL, 'email_tag': email_tag, 'email_ind1': email_ind1 == '_' and ' ' or email_ind1, 'email_ind2': email_ind2 == '_' and ' ' or email_ind2, 'email_code': email_code} test_to_replace = """ 123456789 SzGeCERN Test, John Test University jekyll@cds.cern.ch %(siteurl)s/img/head.gif """ % {'siteurl': CFG_SITE_URL, 'email_tag': email_tag, 'email_ind1': email_ind1 == '_' and ' ' or email_ind1, 'email_ind2': email_ind2 == '_' and ' ' or email_ind2, 'email_code': email_code} testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University jekyll@cds.cern.ch 208 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/head.gif """ % { 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD, 'email_tag': email_tag, 'email_ind1': email_ind1 == '_' and ' ' or email_ind1, 'email_ind2': email_ind2 == '_' and ' ' or email_ind2, 'email_code': email_code} testrec_expected_url = 
"%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/head.gif" % { 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) test_to_replace = test_to_replace.replace('123456789', str(recid)) # replace test record with new FFT: recs = bibupload.xml_marc_to_records(test_to_replace) bibupload.bibupload_records(recs, opt_mode='replace') self.check_record_consistency(recid) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(try_url_download(testrec_expected_url)) self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) expected_content_version = urlopen('%s/img/head.gif' % CFG_SITE_URL).read() self.assertEqual(test_web_page_content(testrec_expected_url, 'hyde', 'h123yde', expected_text='Authorization failure'), []) self.assertEqual(test_web_page_content(testrec_expected_url, 'jekyll', 'j123ekyll', expected_text=expected_content_version), []) def test_simple_fft_replace_or_insert(self): """bibupload - simple FFT replace_or_insert""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University %(siteurl)s/img/iconpen.gif site_logo """ % {'siteurl': CFG_SITE_URL,} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='replace_or_insert')[0] self.check_record_consistency(recid) ## When insert_or_replace a record for the first time, it should be like ## a simple insert, hence affected_fields should be empty. ## This also for the special case of FFT. 
affected_fields = run_sql("SELECT affected_fields FROM hstRECORD where id_bibrec=%s", (recid,)) self.assertEqual(len(affected_fields), 1) self.failIf(affected_fields[0][0]) def test_simple_fft_insert_with_modification_time(self): """bibupload - simple FFT insert with modification time""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University ARTICLE %(siteurl)s/img/site_logo.gif 2006-05-04 03:02:01 """ % { 'siteurl': CFG_SITE_URL } testrec_expected_xm = """ 123456789 SzGeCERN Test, John Test University 2032 %(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif ARTICLE """ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/site_logo.gif" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} testrec_expected_url2 = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} # insert test record: recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload_records(recs, opt_mode='insert')[0] self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_xm = testrec_expected_xm.replace('123456789', str(recid)) testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) testrec_expected_url2 = testrec_expected_url2.replace('123456789', str(recid)) # compare expected results: inserted_xm = print_record(recid, 'xm') self.failUnless(records_identical(create_record(inserted_xm)[0], create_record(testrec_expected_xm)[0], ignore_subfield_order=True, ignore_field_order=True)) self.failUnless(try_url_download(testrec_expected_url)) self.force_webcoll(recid) self.tear_down = True self.assertEqual(test_web_page_content(testrec_expected_url2, expected_text='04 May 2006, 03:02'), []) def test_multiple_fft_insert_with_modification_time(self): """bibupload - multiple FFT insert with modification time""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University ARTICLE %(siteurl)s/img/site_logo.gif 2006-05-04 03:02:01 %(siteurl)s/img/head.gif 2007-05-04 03:02:01 %(siteurl)s/%(CFG_SITE_RECORD)s/95/files/9809057.pdf 2008-05-04 03:02:01 %(prefix)s/var/tmp/demobibdata.xml 2009-05-04 03:02:01 """ % { 'prefix': CFG_PREFIX, 'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD, } testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/" \ % {'siteurl': CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD} recs = bibupload.xml_marc_to_records(test_to_upload) dummy, recid, dummy = bibupload.bibupload(recs[0], opt_mode='insert') self.check_record_consistency(recid) # replace test buffers with real recid of inserted test record: testrec_expected_url = testrec_expected_url.replace('123456789', str(recid)) self.force_webcoll(recid) self.assertEqual(test_web_page_content(testrec_expected_url, expected_text=['04 May 2006, 03:02', '04 May 2007, 03:02', '04 May 2008, 03:02', '04 May 2009, 03:02']), []) def test_simple_fft_correct_with_modification_time(self): """bibupload - simple FFT correct with modification time""" # define the test case: test_to_upload = """ SzGeCERN Test, John Test University ARTICLE %(siteurl)s/img/site_logo.gif 2007-05-04 03:02:01 """ % { 'siteurl': CFG_SITE_URL } test_to_correct = """ 123456789 %(siteurl)s/img/sb.gif site_logo 2008-05-04 03:02:01 """ % { 'siteurl': CFG_SITE_URL } testrec_expected_url = "%(siteurl)s/%(CFG_SITE_RECORD)s/123456789/files/" \ % {'siteurl': 
CFG_SITE_URL, 'CFG_SITE_RECORD': CFG_SITE_RECORD}
        # insert test record:
        recs = bibupload.xml_marc_to_records(test_to_upload)
        dummy, recid, dummy = bibupload.bibupload(recs[0], opt_mode='insert')
        self.check_record_consistency(recid)
        # replace test buffers with real recid of inserted test record:
        testrec_expected_url = testrec_expected_url.replace('123456789', str(recid))
        test_to_correct = test_to_correct.replace('123456789', str(recid))
        # correct test record with new FFT:
        recs = bibupload.xml_marc_to_records(test_to_correct)
        err, recid, msg = bibupload.bibupload(recs[0], opt_mode='correct')
        self.check_record_consistency(recid)
        self.force_webcoll(recid)
        self.assertEqual(test_web_page_content(testrec_expected_url,
                                               expected_text=['04 May 2008, 03:02']), [])

TEST_SUITE = make_test_suite(BibUploadNoUselessHistoryTest,
                             BibUploadHoldingPenTest,
                             BibUploadInsertModeTest,
                             BibUploadAppendModeTest,
                             BibUploadCorrectModeTest,
                             BibUploadDeleteModeTest,
                             BibUploadReplaceModeTest,
                             BibUploadReferencesModeTest,
                             BibUploadRecordsWithSYSNOTest,
                             BibUploadRecordsWithEXTOAIIDTest,
                             BibUploadRecordsWithOAIIDTest,
                             BibUploadIndicatorsTest,
                             BibUploadUpperLowerCaseTest,
                             BibUploadControlledProvenanceTest,
                             BibUploadStrongTagsTest,
                             BibUploadFFTModeTest,
                             BibUploadPretendTest,
                             BibUploadCallbackURLTest,
                             BibUploadMoreInfoTest,
                             BibUploadBibRelationsTest,
                             BibUploadRecordsWithDOITest,
                             BibUploadTypicalBibEditSessionTest,
                             BibUploadRealCaseRemovalDOIViaBibEdit,
                             )

if __name__ == "__main__":
    run_test_suite(TEST_SUITE, warn_user=True)
diff --git a/modules/miscutil/lib/inveniocfg.py b/modules/miscutil/lib/inveniocfg.py
index 78c1caa2d..3f1bee5f1 100644
--- a/modules/miscutil/lib/inveniocfg.py
+++ b/modules/miscutil/lib/inveniocfg.py
@@ -1,1785 +1,1785 @@
# -*- coding: utf-8 -*-
#
# This file is part of Invenio.
# Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016 CERN.
#
# Invenio is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Invenio is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Invenio; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

"""
Invenio configuration and administration CLI tool.

Usage: inveniocfg [options]

General options:
   -h, --help               print this help
   -V, --version            print version number

Options to finish your installation:
   --create-apache-conf     create Apache configuration files
   --create-tables          create DB tables for Invenio
   --load-bibfield-conf     load the BibField configuration
   --load-webstat-conf      load the WebStat configuration
   --drop-tables            drop DB tables of Invenio
   --check-openoffice       check that the openoffice temporary directory is correctly set up

Options to set up and test a demo site:
   --create-demo-site       create demo site
   --load-demo-records      load demo records
   --remove-demo-records    remove demo records, keeping demo site
   --drop-demo-site         drop demo site configurations too
   --run-unit-tests         run unit test suite (needs demo site)
   --run-regression-tests   run regression test suite (needs demo site)
   --run-web-tests          run web tests in a browser (needs demo site, Firefox, Selenium IDE)

Options to update config files in situ:
   --update-all             perform all the update options
   --update-config-py       update config.py file from invenio.conf file
   --update-dbquery-py      update dbquery.py with DB credentials from invenio.conf
   --update-dbexec          update dbexec with DB credentials from invenio.conf
   --update-bibconvert-tpl  update bibconvert templates with CFG_SITE_URL from invenio.conf
   --update-web-tests       update web test cases with CFG_SITE_URL from invenio.conf

Options to update DB tables:
   --reset-all              perform all the reset options
   --reset-sitename         reset tables to take account of new CFG_SITE_NAME*
   --reset-siteadminemail   reset tables to take account of new CFG_SITE_ADMIN_EMAIL
   --reset-fieldnames       reset tables to take account of new I18N names from PO files
   --reset-recstruct-cache  reset record structure cache according to CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE
   --reset-recjson-cache    reset record json cache according to CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE

Options to upgrade your installation:
   --upgrade                         apply all pending upgrades
   --upgrade-check                   run pre-upgrade checks for all pending upgrades
   --upgrade-show-pending            show pending upgrades ready to be applied
   --upgrade-show-applied            show history of applied upgrades
   --upgrade-create-standard-recipe  create a new upgrade recipe (for developers)
   --upgrade-create-release-recipe   create a new release upgrade recipe (for developers)

Options to help the work:
   --list                   print names and values of all options from conf files
   --get                    get value of a given option from conf files
   --conf-dir               path to directory where invenio*.conf files are [optional]
   --detect-system-details  print system details such as Apache/Python/MySQL versions
"""

__revision__ = "$Id$"

from ConfigParser import ConfigParser
from optparse import OptionParser, OptionGroup, IndentedHelpFormatter, Option, \
    OptionError
import os
import re
import shutil
import socket
import sys
import zlib

def print_usage():
    """Print help."""
    print __doc__

def get_version():
    """ Get running version of Invenio """
    from invenio.config import CFG_VERSION
    return CFG_VERSION

def print_version():
    """Print version information."""
    print get_version()

def convert_conf_option(option_name, option_value):
    """
    Convert conf option into Python config.py line,
    converting values to ints or strings as appropriate.
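
    For illustration only (sample values assumed here, not taken from a
    real invenio.conf), the conversion behaves roughly like this:

        >>> convert_conf_option('CFG_SITE_LANG', 'en')
        'CFG_SITE_LANG = "en"'
        >>> convert_conf_option('CFG_SITE_LANGS', 'en,fr')
        "CFG_SITE_LANGS = ['en', 'fr', ]"
        >>> convert_conf_option('CFG_BIBSCHED_MAX_NUMBER_CONCURRENT_TASKS', '1')
        'CFG_BIBSCHED_MAX_NUMBER_CONCURRENT_TASKS = 1'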
""" ## 1) convert option name to uppercase: option_name = option_name.upper() ## 1a) adjust renamed variables: if option_name in ['CFG_WEBSUBMIT_DOCUMENT_FILE_MANAGER_DOCTYPES', 'CFG_WEBSUBMIT_DOCUMENT_FILE_MANAGER_RESTRICTIONS', 'CFG_WEBSUBMIT_DOCUMENT_FILE_MANAGER_MISC', 'CFG_WEBSUBMIT_FILESYSTEM_BIBDOC_GROUP_LIMIT', 'CFG_WEBSUBMIT_ADDITIONAL_KNOWN_FILE_EXTENSIONS', 'CFG_WEBSUBMIT_DESIRED_CONVERSIONS']: new_option_name = option_name.replace('WEBSUBMIT', 'BIBDOCFILE') print >> sys.stderr, ("""WARNING: %s has been renamed to %s. Please, update your invenio-local.conf file accordingly.""" % (option_name, new_option_name)) option_name = new_option_name ## 2) convert option value to int or string: if option_name in ['CFG_BIBUPLOAD_REFERENCE_TAG', 'CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG', 'CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG', 'CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG', 'CFG_BIBUPLOAD_STRONG_TAGS', 'CFG_BIBFORMAT_HIDDEN_TAGS']: # some options are supposed be string even when they look like # numeric option_value = '"' + option_value + '"' else: try: option_value = int(option_value) except ValueError: option_value = '"' + option_value + '"' ## 3a) special cases: chars regexps if option_name in ['CFG_BIBINDEX_CHARS_ALPHANUMERIC_SEPARATORS', 'CFG_BIBINDEX_CHARS_PUNCTUATION']: option_value = 'r"[' + option_value[1:-1] + ']"' ## 3abis) special cases: real regexps if option_name in ['CFG_BIBINDEX_PERFORM_OCR_ON_DOCNAMES', 'CFG_BATCHUPLOADER_WEB_ROBOT_AGENTS', 'CFG_BIBUPLOAD_INTERNAL_DOI_PATTERN']: option_value = 'r"' + option_value[1:-1] + '"' ## 3b) special cases: True, False, None if option_value in ['"True"', '"False"', '"None"']: option_value = option_value[1:-1] ## 3c) special cases: dicts and real pythonic lists if option_name in ['CFG_WEBSEARCH_FIELDS_CONVERT', 'CFG_BATCHUPLOADER_WEB_ROBOT_RIGHTS', 'CFG_WEBSEARCH_FULLTEXT_SNIPPETS', 'CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS', 'CFG_SITE_EMERGENCY_EMAIL_ADDRESSES', 'CFG_BIBMATCH_FUZZY_WORDLIMITS', 'CFG_BIBMATCH_QUERY_TEMPLATES', 'CFG_WEBSEARCH_SYNONYM_KBRS', 'CFG_BIBINDEX_SYNONYM_KBRS', 'CFG_WEBCOMMENT_EMAIL_REPLIES_TO', 'CFG_WEBCOMMENT_RESTRICTION_DATAFIELD', 'CFG_WEBCOMMENT_ROUND_DATAFIELD', 'CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS', 'CFG_BIBSCHED_NODE_TASKS', 'CFG_BIBEDIT_EXTEND_RECORD_WITH_COLLECTION_TEMPLATE', 'CFG_OAI_METADATA_FORMATS', 'CFG_BIBDOCFILE_DESIRED_CONVERSIONS', 'CFG_BIBDOCFILE_BEST_FORMATS_TO_EXTRACT_TEXT_FROM', 'CFG_WEB_API_KEY_ALLOWED_URL', 'CFG_BIBDOCFILE_DOCUMENT_FILE_MANAGER_MISC', 'CFG_BIBDOCFILE_DOCUMENT_FILE_MANAGER_DOCTYPES', 'CFG_BIBDOCFILE_DOCUMENT_FILE_MANAGER_RESTRICTIONS', 'CFG_REFEXTRACT_KBS_OVERRIDE', 'CFG_OPENID_CONFIGURATIONS', 'CFG_OAUTH1_CONFIGURATIONS', 'CFG_OAUTH2_CONFIGURATIONS', 'CFG_BIBDOCFILE_ADDITIONAL_KNOWN_MIMETYPES', 'CFG_BIBDOCFILE_PREFERRED_MIMETYPES_MAPPING', 'CFG_BIBSCHED_NON_CONCURRENT_TASKS', 'CFG_REDIS_HOSTS', 'CFG_BIBSCHED_INCOMPATIBLE_TASKS', 'CFG_ICON_CREATION_FORMAT_MAPPINGS', 'CFG_BIBEDIT_AUTOCOMPLETE']: try: option_value = option_value[1:-1] if option_name == "CFG_BIBEDIT_EXTEND_RECORD_WITH_COLLECTION_TEMPLATE" and option_value.strip().startswith("{"): print >> sys.stderr, ("""ERROR: CFG_BIBEDIT_EXTEND_RECORD_WITH_COLLECTION_TEMPLATE now accepts only a list of tuples, not a dictionary. Check invenio.conf for an example. 
Please, update your invenio-local.conf file accordingly.""") sys.exit(1) except TypeError: if option_name in ('CFG_WEBSEARCH_FULLTEXT_SNIPPETS',): print >> sys.stderr, """WARNING: CFG_WEBSEARCH_FULLTEXT_SNIPPETS has changed syntax: it can be customised to display different snippets for different document types. See the corresponding documentation in invenio.conf. You may want to customise your invenio-local.conf configuration accordingly.""" option_value = """{'': %s}""" % option_value else: print >> sys.stderr, "ERROR: type error in %s value %s." % \ (option_name, option_value) sys.exit(1) ## 3cbis) very special cases: dicts with backward compatible string if option_name in ['CFG_BIBINDEX_SPLASH_PAGES']: if option_value.startswith('"{') and option_value.endswith('}"'): option_value = option_value[1:-1] else: option_value = """{%s: ".*"}""" % option_value ## 3d) special cases: comma-separated lists if option_name in ['CFG_SITE_LANGS', 'CFG_BIBDOCFILE_ADDITIONAL_KNOWN_FILE_EXTENSIONS', 'CFG_WEBSEARCH_USE_MATHJAX_FOR_FORMATS', 'CFG_BIBUPLOAD_STRONG_TAGS', 'CFG_BIBFORMAT_HIDDEN_TAGS', 'CFG_BIBFORMAT_HIDDEN_RECJSON_FIELDS', 'CFG_BIBSCHED_GC_TASKS_TO_REMOVE', 'CFG_BIBSCHED_GC_TASKS_TO_ARCHIVE', 'CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS', 'CFG_BIBUPLOAD_CONTROLLED_PROVENANCE_TAGS', 'CFG_BIBUPLOAD_DELETE_FORMATS', 'CFG_WEBSTYLE_HTTP_STATUS_ALERT_LIST', 'CFG_WEBSEARCH_RSS_I18N_COLLECTIONS', 'CFG_BATCHUPLOADER_FILENAME_MATCHING_POLICY', 'CFG_BIBAUTHORID_EXTERNAL_CLAIMED_RECORDS_KEY', 'CFG_BIBCIRCULATION_ITEM_STATUS_OPTIONAL', 'CFG_PLOTEXTRACTOR_DISALLOWED_TEX', 'CFG_OAI_FRIENDS', 'CFG_WEBSTYLE_REVERSE_PROXY_IPS', 'CFG_BIBEDIT_AUTOCOMPLETE_INSTITUTIONS_FIELDS', 'CFG_BIBFORMAT_DISABLE_I18N_FOR_CACHED_FORMATS', 'CFG_BIBFORMAT_HIDDEN_FILE_FORMATS', 'CFG_BIBFIELD_MASTER_FORMATS', 'CFG_OPENID_PROVIDERS', 'CFG_OAUTH1_PROVIDERS', 'CFG_OAUTH2_PROVIDERS', 'CFG_BIBFORMAT_CACHED_FORMATS', 'CFG_BIBEDIT_ADD_TICKET_RT_QUEUES', 'CFG_BIBAUTHORID_ENABLED_REMOTE_LOGIN_SYSTEMS',]: out = "[" for elem in option_value[1:-1].split(","): if elem: elem = elem.strip() # string values out += "'%s', " % elem out += "]" option_value = out ## 3e) special cases: multiline if option_name == 'CFG_OAI_IDENTIFY_DESCRIPTION': # make triple quotes option_value = '""' + option_value + '""' ## 3f) ignore some options: if option_name.startswith('CFG_SITE_NAME_INTL'): # treated elsewhere return ## 3g) special cases: float if option_name in ['CFG_BIBDOCFILE_MD5_CHECK_PROBABILITY', 'CFG_BIBMATCH_LOCAL_SLEEPTIME', 'CFG_BIBMATCH_REMOTE_SLEEPTIME', 'CFG_PLOTEXTRACTOR_DOWNLOAD_TIMEOUT', 'CFG_BIBMATCH_FUZZY_MATCH_VALIDATION_LIMIT']: option_value = float(option_value[1:-1]) ## 3h) special cases: bibmatch validation list if option_name in ['CFG_BIBMATCH_MATCH_VALIDATION_RULESETS']: option_value = option_value[1:-1] ## 4a) dropped variables if option_name in ['CFG_BATCHUPLOADER_WEB_ROBOT_AGENT']: print >> sys.stderr, ("""ERROR: CFG_BATCHUPLOADER_WEB_ROBOT_AGENT has been dropped in favour of CFG_BATCHUPLOADER_WEB_ROBOT_AGENTS. Please, update your invenio-local.conf file accordingly.""") sys.exit(1) option_value = option_value[1:-1] elif option_name in ['CFG_WEBSUBMIT_DOCUMENT_FILE_MANAGER_DOCTYPES', 'CFG_WEBSUBMIT_DOCUMENT_FILE_MANAGER_RESTRICTIONS', 'CFG_WEBSUBMIT_DOCUMENT_FILE_MANAGER_MISC', 'CFG_WEBSUBMIT_FILESYSTEM_BIBDOC_GROUP_LIMIT', 'CFG_WEBSUBMIT_ADDITIONAL_KNOWN_FILE_EXTENSIONS', 'CFG_WEBSUBMIT_DESIRED_CONVERSIONS']: new_option_name = option_name.replace('WEBSUBMIT', 'BIBDOCFILE') print >> sys.stderr, ("""ERROR: %s has been renamed to %s. 
Please, update your invenio-local.conf file accordingly.""" % (option_name, new_option_name)) option_name = new_option_name ## 5) finally, return output line: return '%s = %s' % (option_name, option_value) def cli_cmd_update_config_py(conf): """ Update new config.py from conf options, keeping previous config.py in a backup copy. """ ## NOTE: the following function exists also in urlutils.py ## However we can't import urlutils here, as it depends on config.py ## to already exist, while we are in the process of creating it. def get_relative_url(url): """ Returns the relative URL from a URL. For example: 'http://web.net' -> '' 'http://web.net/' -> '' 'http://web.net/1222' -> '/1222' 'http://web.net/wsadas/asd' -> '/wsadas/asd' It will never return a trailing "/". @param url: A url to transform @type url: str @return: relative URL """ # remove any protocol info before stripped_site_url = url.replace("://", "") baseurl = "/" + "/".join(stripped_site_url.split("/")[1:]) # remove any trailing slash ("/") if baseurl[-1] == "/": return baseurl[:-1] else: return baseurl print ">>> Going to update config.py..." ## location where config.py is: configpyfile = conf.get("Invenio", "CFG_PYLIBDIR") + \ os.sep + 'invenio' + os.sep + 'config.py' ## backup current config.py file: if os.path.exists(configpyfile): shutil.copy(configpyfile, configpyfile + '.OLD') ## here we go: fdesc = open(configpyfile, 'w') ## generate preamble: fdesc.write("# -*- coding: utf-8 -*-\n") fdesc.write("# DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED\n") fdesc.write("# FROM INVENIO.CONF BY EXECUTING:\n") fdesc.write("# " + " ".join(sys.argv) + "\n") ## special treatment for CFG_SITE_NAME_INTL options: fdesc.write("CFG_SITE_NAME_INTL = {}\n") for lang in conf.get("Invenio", "CFG_SITE_LANGS").split(","): fdesc.write("CFG_SITE_NAME_INTL['%s'] = \"%s\"\n" % (lang, conf.get("Invenio", "CFG_SITE_NAME_INTL_" + lang))) ## special treatment for CFG_SITE_SECURE_URL that may be empty, in ## which case it should be put equal to CFG_SITE_URL: if not conf.get("Invenio", "CFG_SITE_SECURE_URL"): conf.set("Invenio", "CFG_SITE_SECURE_URL", conf.get("Invenio", "CFG_SITE_URL")) ## Special treatment of base URL, adding CFG_BASE_URL base_url = get_relative_url(conf.get("Invenio", "CFG_SITE_URL")) fdesc.write("CFG_BASE_URL = \"%s\"\n" % (base_url,)) ## process all the options normally: sections = conf.sections() sections.sort() for section in sections: options = conf.options(section) options.sort() for option in options: if not option.upper().startswith('CFG_DATABASE_'): # put all options except for db credentials into config.py line_out = convert_conf_option(option, conf.get(section, option)) if line_out: fdesc.write(line_out + "\n") ## generate postamble: fdesc.write("") fdesc.write("# END OF GENERATED FILE") ## we are done: fdesc.close() print "You may want to restart Apache now." print ">>> config.py updated successfully." def cli_cmd_update_dbquery_py(conf): """ Update lib/dbquery.py file with DB parameters read from conf file. Note: this edits dbquery.py in situ, taking a backup first. Use only when you know what you are doing. """ print ">>> Going to update dbquery.py..." 
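    ## NOTE: the block below regenerates dbquery_config.py from the conf
    ## files, keeping only the CFG_DATABASE_* credentials
    ## (cli_cmd_update_config_py above deliberately skips CFG_DATABASE_*
    ## when writing config.py, so the credentials end up only here).
    ## For illustration only, with hypothetical values, the generated file
    ## looks like:
    ##
    ##   # -*- coding: utf-8 -*-
    ##   CFG_DATABASE_HOST = 'localhost'
    ##   CFG_DATABASE_PORT = '3306'
    ##   CFG_DATABASE_NAME = 'invenio'
    ##   CFG_DATABASE_USER = 'invenio'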
## location where dbquery.py is: dbqueryconfigpyfile = conf.get("Invenio", "CFG_PYLIBDIR") + \ os.sep + 'invenio' + os.sep + 'dbquery_config.py' ## backup current dbquery.py file: if os.path.exists(dbqueryconfigpyfile + 'c'): shutil.copy(dbqueryconfigpyfile + 'c', dbqueryconfigpyfile + 'c.OLD') out = ["%s = '%s'\n" % (item.upper(), value) \ for item, value in conf.items('Invenio') \ if item.upper().startswith('CFG_DATABASE_')] fdesc = open(dbqueryconfigpyfile, 'w') fdesc.write("# -*- coding: utf-8 -*-\n") fdesc.writelines(out) fdesc.close() print "You may want to restart Apache now." print ">>> dbquery.py updated successfully." def cli_cmd_update_dbexec(conf): """ Update bin/dbexec file with DB parameters read from conf file. Note: this edits dbexec in situ, taking a backup first. Use only when you know what you are doing. """ print ">>> Going to update dbexec..." ## location where dbexec is: dbexecfile = conf.get("Invenio", "CFG_BINDIR") + \ os.sep + 'dbexec' ## backup current dbexec file: if os.path.exists(dbexecfile): shutil.copy(dbexecfile, dbexecfile + '.OLD') ## replace db parameters via sed: out = '' for line in open(dbexecfile, 'r').readlines(): match = re.search(r'^CFG_DATABASE_(HOST|PORT|NAME|USER|PASS|SLAVE)(\s*=\s*)\'.*\'$', line) if match: dbparam = 'CFG_DATABASE_' + match.group(1) out += "%s%s'%s'\n" % (dbparam, match.group(2), conf.get("Invenio", dbparam)) else: out += line fdesc = open(dbexecfile, 'w') fdesc.write(out) fdesc.close() print ">>> dbexec updated successfully." def cli_cmd_update_bibconvert_tpl(conf): """ Update bibconvert/config/*.tpl files looking for 856 http://.../CFG_SITE_RECORD lines, replacing URL with CFG_SITE_URL taken from conf file. Note: this edits tpl files in situ, taking a backup first. Use only when you know what you are doing. """ print ">>> Going to update bibconvert templates..." ## location where bibconvert/config/*.tpl are: tpldir = conf.get("Invenio", 'CFG_ETCDIR') + \ os.sep + 'bibconvert' + os.sep + 'config' ## find all *.tpl files: for tplfilename in os.listdir(tpldir): if tplfilename.endswith(".tpl"): ## change tpl file: tplfile = tpldir + os.sep + tplfilename shutil.copy(tplfile, tplfile + '.OLD') out = '' for line in open(tplfile, 'r').readlines(): match = re.search(r'^(.*)http://.*?/%s/(.*)$' % conf.get("Invenio", 'CFG_SITE_RECORD'), line) if match: out += "%s%s/%s/%s\n" % (match.group(1), conf.get("Invenio", 'CFG_SITE_URL'), conf.get("Invenio", 'CFG_SITE_RECORD'), match.group(2)) else: out += line fdesc = open(tplfile, 'w') fdesc.write(out) fdesc.close() print ">>> bibconvert templates updated successfully." def cli_cmd_update_web_tests(conf): """ Update web test cases lib/webtest/test_*.html looking for http://.+?[>> Going to update web tests..." ## location where test_*.html files are: testdir = conf.get("Invenio", 'CFG_PREFIX') + os.sep + \ 'lib' + os.sep + 'webtest' + os.sep + 'invenio' ## find all test_*.html files: for testfilename in os.listdir(testdir): if testfilename.startswith("test_") and \ testfilename.endswith(".html"): ## change test file: testfile = testdir + os.sep + testfilename shutil.copy(testfile, testfile + '.OLD') out = '' for line in open(testfile, 'r').readlines(): match = re.search(r'^(.*)http://.+?([)/opt/invenio(.*)$', line) if match: out += "%s%s%s\n" % (match.group(1), conf.get("Invenio", 'CFG_PREFIX'), match.group(2)) else: out += line fdesc = open(testfile, 'w') fdesc.write(out) fdesc.close() print ">>> web tests updated successfully." 
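The three update commands above (dbexec, bibconvert templates, web tests) all follow the same in-situ editing pattern: copy the target file to a .OLD backup, read it line by line, rewrite the lines that match a regular expression, and write everything back. A minimal standalone sketch of that pattern follows; the helper name, file path and pattern are hypothetical and not part of inveniocfg:

import re
import shutil

def rewrite_in_place(path, pattern, replacement):
    """Back up PATH to PATH.OLD, then apply PATTERN -> REPLACEMENT to every line."""
    shutil.copy(path, path + '.OLD')          # keep a backup, as the commands above do
    out = ''
    for line in open(path, 'r').readlines():
        out += re.sub(pattern, replacement, line)
    fdesc = open(path, 'w')
    fdesc.write(out)
    fdesc.close()

# hypothetical usage, pointing a dbexec-style credential line at a new host:
# rewrite_in_place('/opt/invenio/bin/dbexec',
#                  r"^(CFG_DATABASE_HOST\s*=\s*)'.*'$",
#                  r"\1'db.example.org'")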
def cli_cmd_reset_sitename(conf): """ Reset collection-related tables with new CFG_SITE_NAME and CFG_SITE_NAME_INTL* read from conf files. """ print ">>> Going to reset CFG_SITE_NAME and CFG_SITE_NAME_INTL..." from invenio.dbquery import run_sql, IntegrityError # reset CFG_SITE_NAME: sitename = conf.get("Invenio", "CFG_SITE_NAME") try: run_sql("""INSERT INTO collection (id, name, dbquery, reclist) VALUES (1,%s,NULL,NULL)""", (sitename,)) except IntegrityError: run_sql("""UPDATE collection SET name=%s WHERE id=1""", (sitename,)) # reset CFG_SITE_NAME_INTL: for lang in conf.get("Invenio", "CFG_SITE_LANGS").split(","): sitename_lang = conf.get("Invenio", "CFG_SITE_NAME_INTL_" + lang) try: run_sql("""INSERT INTO collectionname (id_collection, ln, type, value) VALUES (%s,%s,%s,%s)""", (1, lang, 'ln', sitename_lang)) except IntegrityError: run_sql("""UPDATE collectionname SET value=%s WHERE ln=%s AND id_collection=1 AND type='ln'""", (sitename_lang, lang)) print "You may want to restart Apache now." print ">>> CFG_SITE_NAME and CFG_SITE_NAME_INTL* reset successfully." def cli_cmd_reset_recstruct_cache(conf): """If CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE is changed, this function will adapt the database to either store or not store the recstruct format.""" from invenio.intbitset import intbitset from invenio.dbquery import run_sql, serialize_via_marshal from invenio.search_engine import get_record, print_record from invenio.bibsched import server_pid, pidfile enable_recstruct_cache = conf.get("Invenio", "CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE") enable_recstruct_cache = enable_recstruct_cache in ('True', '1') pid = server_pid(ping_the_process=False) if pid: print >> sys.stderr, "ERROR: bibsched seems to run with pid %d, according to %s." % (pid, pidfile) print >> sys.stderr, " Please stop bibsched before running this procedure." sys.exit(1) if enable_recstruct_cache: print ">>> Searching records which need recstruct cache resetting; this may take a while..." all_recids = intbitset(run_sql("SELECT id FROM bibrec")) good_recids = intbitset(run_sql("SELECT bibrec.id FROM bibrec JOIN bibfmt ON bibrec.id = bibfmt.id_bibrec WHERE format='recstruct' AND modification_date < last_updated")) recids = all_recids - good_recids print ">>> Generating recstruct cache..." tot = len(recids) count = 0 for recid in recids: try: value = serialize_via_marshal(get_record(recid)) except zlib.error, err: print >> sys.stderr, "Looks like XM is corrupted for record %s. Let's recover it from bibxxx" % recid run_sql("DELETE FROM bibfmt WHERE id_bibrec=%s AND format='xm'", (recid, )) xm_value = zlib.compress(print_record(recid, 'xm')) run_sql("INSERT INTO bibfmt(id_bibrec, format, last_updated, value) VALUES(%s, 'xm', NOW(), %s)", (recid, xm_value)) value = serialize_via_marshal(get_record(recid)) run_sql("DELETE FROM bibfmt WHERE id_bibrec=%s AND format='recstruct'", (recid, )) - run_sql("INSERT INTO bibfmt(id_bibrec, format, last_updated, value) VALUES(%s, 'recstruct', NOW(), %s)", (recid, value)) + run_sql("INSERT INTO bibfmt(id_bibrec, format, last_updated, value) VALUES(%s, 'recstruct', NOW(), _binary %s)", (recid, value)) count += 1 if count % 1000 == 0: print " ... done records %s/%s" % (count, tot) if count % 1000 != 0: print " ... done records %s/%s" % (count, tot) print ">>> recstruct cache generated successfully." else: print ">>> Cleaning recstruct cache..." 
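    ## NOTE: the "_binary" introducer added to the INSERT above tells MySQL
    ## to treat the %s placeholder as binary-charset data, so the marshalled
    ## record blob is stored verbatim instead of being interpreted under the
    ## connection character set.  The same change is applied to
    ## firerole_def_ser in access_control_firerole.py later in this diff.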
run_sql("DELETE FROM bibfmt WHERE format='recstruct'") def cli_cmd_reset_recjson_cache(conf): """If CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE is changed, this function will adapt the database to either store or not store the recjson format.""" try: import cPickle as pickle except: import pickle from invenio.intbitset import intbitset from invenio.dbquery import run_sql from invenio.bibfield import get_record from invenio.bibsched import server_pid, pidfile enable_recjson_cache = conf.get("Invenio", "CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE") enable_recjson_cache = enable_recjson_cache in ('True', '1') pid = server_pid(ping_the_process=False) if pid: print >> sys.stderr, "ERROR: bibsched seems to run with pid %d, according to %s." % (pid, pidfile) print >> sys.stderr, " Please stop bibsched before running this procedure." sys.exit(1) if enable_recjson_cache: print ">>> Searching records which need recjson cache resetting; this may take a while..." all_recids = intbitset(run_sql("SELECT id FROM bibrec")) #TODO: prevent doing all records? recids = all_recids print ">>> Generating recjson cache..." tot = len(recids) count = 0 cli_cmd_load_bibfield_config(conf) for recid in recids: run_sql("DELETE FROM bibfmt WHERE id_bibrec=%s AND format='recjson'", (recid,)) #TODO: Update the cache or wait for the first access get_record(recid) count += 1 if count % 1000 == 0: print " ... done records %s/%s" % (count, tot) if count % 1000 != 0: print " ... done records %s/%s" % (count, tot) print ">>> recjson cache generated successfully." def cli_cmd_reset_siteadminemail(conf): """ Reset user-related tables with new CFG_SITE_ADMIN_EMAIL read from conf files. """ print ">>> Going to reset CFG_SITE_ADMIN_EMAIL..." from invenio.dbquery import run_sql siteadminemail = conf.get("Invenio", "CFG_SITE_ADMIN_EMAIL") run_sql("DELETE FROM user WHERE id=1") run_sql("""INSERT INTO user (id, email, password, note, nickname) VALUES (1, %s, AES_ENCRYPT(email, ''), 1, 'admin')""", (siteadminemail,)) print "You may want to restart Apache now." print ">>> CFG_SITE_ADMIN_EMAIL reset successfully." def cli_cmd_reset_fieldnames(conf): """ Reset I18N field names such as author, title, etc and other I18N ranking method names such as word similarity. Their translations are taken from the PO files. """ print ">>> Going to reset I18N field names..." 
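    ## NOTE: like cli_cmd_reset_sitename above, this command uses an
    ## "INSERT first, UPDATE on IntegrityError" upsert so that it works both
    ## on a freshly created database and on one where the rows already exist.
    ## Illustrative shape only (the real statements follow below):
    ##
    ##   try:
    ##       run_sql("INSERT INTO fieldname (...) VALUES (...)", params)
    ##   except IntegrityError:
    ##       run_sql("UPDATE fieldname SET value=%s WHERE ...", params)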
from invenio.messages import gettext_set_language, language_list_long from invenio.dbquery import run_sql, IntegrityError ## get field id and name list: field_id_name_list = run_sql("SELECT id, name FROM field") ## get rankmethod id and name list: rankmethod_id_name_list = run_sql("SELECT id, name FROM rnkMETHOD") ## update names for every language: for lang, dummy in language_list_long(): _ = gettext_set_language(lang) ## this list is put here in order for PO system to pick names ## suitable for translation field_name_names = {"any field": _("any field"), "title": _("title"), "author": _("author"), "abstract": _("abstract"), "keyword": _("keyword"), "report number": _("report number"), "subject": _("subject"), "reference": _("reference"), "fulltext": _("fulltext"), "collection": _("collection"), "division": _("division"), "year": _("year"), "journal": _("journal"), "experiment": _("experiment"), "record ID": _("record ID")} ## update I18N names for every language: for (field_id, field_name) in field_id_name_list: if field_name_names.has_key(field_name): try: run_sql("""INSERT INTO fieldname (id_field,ln,type,value) VALUES (%s,%s,%s,%s)""", (field_id, lang, 'ln', field_name_names[field_name])) except IntegrityError: run_sql("""UPDATE fieldname SET value=%s WHERE id_field=%s AND ln=%s AND type=%s""", (field_name_names[field_name], field_id, lang, 'ln',)) ## ditto for rank methods: rankmethod_name_names = {"wrd": _("word similarity"), "demo_jif": _("journal impact factor"), "citation": _("times cited"), "citerank_citation_t": _("time-decay cite count"), "citerank_pagerank_c": _("all-time-best cite rank"), "citerank_pagerank_t": _("time-decay cite rank"),} for (rankmethod_id, rankmethod_name) in rankmethod_id_name_list: if rankmethod_name_names.has_key(rankmethod_name): try: run_sql("""INSERT INTO rnkMETHODNAME (id_rnkMETHOD,ln,type,value) VALUES (%s,%s,%s,%s)""", (rankmethod_id, lang, 'ln', rankmethod_name_names[rankmethod_name])) except IntegrityError: run_sql("""UPDATE rnkMETHODNAME SET value=%s WHERE id_rnkMETHOD=%s AND ln=%s AND type=%s""", (rankmethod_name_names[rankmethod_name], rankmethod_id, lang, 'ln',)) print ">>> I18N field names reset successfully." def cli_check_openoffice(conf): """ If OpenOffice.org integration is enabled, checks whether the system is properly configured. """ from invenio.bibtask import check_running_process_user from invenio.websubmit_file_converter import can_unoconv, get_file_converter_logger logger = get_file_converter_logger() for handler in logger.handlers: logger.removeHandler(handler) check_running_process_user() print ">>> Checking if Libre/OpenOffice.org is correctly integrated...", sys.stdout.flush() if can_unoconv(True): print "ok" else: sys.exit(1) def test_db_connection(): """ Test DB connection, and if fails, advise user how to set it up. Useful to be called during table creation. """ print "Testing DB connection...", from invenio.textutils import wrap_text_in_a_box from invenio.dbquery import run_sql, Error ## first, test connection to the DB server: try: run_sql("SHOW TABLES") except Error, err: from invenio.dbquery import CFG_DATABASE_HOST, CFG_DATABASE_PORT, \ CFG_DATABASE_NAME, CFG_DATABASE_USER, CFG_DATABASE_PASS print wrap_text_in_a_box("""\ DATABASE CONNECTIVITY ERROR %(errno)d: %(errmsg)s.\n Perhaps you need to set up database and connection rights? 
If yes, then please login as MySQL admin user and run the following commands now: $ mysql -h %(dbhost)s -P %(dbport)s -u root -p mysql mysql> CREATE DATABASE %(dbname)s DEFAULT CHARACTER SET utf8; mysql> GRANT ALL PRIVILEGES ON %(dbname)s.* TO %(dbuser)s@%(webhost)s IDENTIFIED BY '%(dbpass)s'; mysql> QUIT The values printed above were detected from your configuration. If they are not right, then please edit your invenio-local.conf file and rerun 'inveniocfg --update-all' first. If the problem is of different nature, then please inspect the above error message and fix the problem before continuing.""" % \ {'errno': err.args[0], 'errmsg': err.args[1], 'dbname': CFG_DATABASE_NAME, 'dbhost': CFG_DATABASE_HOST, 'dbport': CFG_DATABASE_PORT, 'dbuser': CFG_DATABASE_USER, 'dbpass': CFG_DATABASE_PASS, 'webhost': CFG_DATABASE_HOST == 'localhost' and 'localhost' or os.popen('hostname -f', 'r').read().strip(), }) sys.exit(1) print "ok" ## second, test insert/select of a Unicode string to detect ## possible Python/MySQL/MySQLdb mis-setup: print "Testing Python/MySQL/MySQLdb UTF-8 chain...", try: try: beta_in_utf8 = "β" # Greek beta in UTF-8 is 0xCEB2 run_sql("CREATE TABLE test__invenio__utf8 (x char(1), y varbinary(2)) DEFAULT CHARACTER SET utf8 ENGINE=MyISAM;") run_sql("INSERT INTO test__invenio__utf8 (x, y) VALUES (%s, %s)", (beta_in_utf8, beta_in_utf8)) res = run_sql("SELECT x,y,HEX(x),HEX(y),LENGTH(x),LENGTH(y),CHAR_LENGTH(x),CHAR_LENGTH(y) FROM test__invenio__utf8") assert res[0] == ('\xce\xb2', '\xce\xb2', 'CEB2', 'CEB2', 2L, 2L, 1L, 2L) run_sql("DROP TABLE test__invenio__utf8") except Exception, err: print wrap_text_in_a_box("""\ DATABASE RELATED ERROR %s\n A problem was detected with the UTF-8 treatment in the chain between the Python application, the MySQLdb connector, and the MySQL database. You may perhaps have installed older versions of some prerequisite packages?\n Please check the INSTALL file and please fix this problem before continuing.""" % err) sys.exit(1) finally: run_sql("DROP TABLE IF EXISTS test__invenio__utf8") print "ok" def cli_cmd_create_tables(conf): """Create and fill Invenio DB tables. Useful for the installation process.""" print ">>> Going to create and fill tables..." from invenio.config import CFG_PREFIX test_db_connection() for cmd in ["%s/bin/dbexec < %s/lib/sql/invenio/tabcreate.sql" % (CFG_PREFIX, CFG_PREFIX), "%s/bin/dbexec < %s/lib/sql/invenio/tabfill.sql" % (CFG_PREFIX, CFG_PREFIX)]: if os.system(cmd): print "ERROR: failed execution of", cmd sys.exit(1) cli_cmd_reset_sitename(conf) cli_cmd_reset_siteadminemail(conf) cli_cmd_reset_fieldnames(conf) for cmd in ["%s/bin/webaccessadmin -u admin -c -a" % CFG_PREFIX]: if os.system(cmd): print "ERROR: failed execution of", cmd sys.exit(1) print ">>> Tables created and filled successfully." def cli_cmd_load_webstat_conf(conf): print ">>> Going to load WebStat config..." from invenio.config import CFG_PREFIX cmd = "%s/bin/webstatadmin --load-config" % CFG_PREFIX if os.system(cmd): print "ERROR: failed execution of", cmd sys.exit(1) print ">>> WebStat config load successfully." def cli_cmd_load_bibfield_config(conf): print ">>> Going to load BibField config..." from invenio.bibfield_config_engine import BibFieldParser BibFieldParser.reparse() print ">>> BibField config load successfully." def cli_cmd_drop_tables(conf): """Drop Invenio DB tables. Useful for the uninstallation process.""" print ">>> Going to drop tables..." 
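    ## NOTE: the UTF-8 round-trip in test_db_connection() above inserts a
    ## Greek beta (U+03B2, UTF-8 bytes 0xCE 0xB2) into a CHAR(1) utf8 column
    ## and a VARBINARY(2) column, then compares HEX(), LENGTH() and
    ## CHAR_LENGTH().  The expected row ('\xce\xb2', '\xce\xb2', 'CEB2',
    ## 'CEB2', 2, 2, 1, 2) means: the same two bytes are stored in both
    ## columns, counted as one character in the utf8 column and as two in
    ## the binary one, which is what a correctly configured
    ## Python/MySQLdb/MySQL chain should return.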
from invenio.config import CFG_PREFIX from invenio.textutils import wrap_text_in_a_box, wait_for_user from invenio.webstat import destroy_customevents wait_for_user(wrap_text_in_a_box("""WARNING: You are going to destroy your database tables!""")) msg = destroy_customevents() if msg: print msg cmd = "%s/bin/dbexec < %s/lib/sql/invenio/tabdrop.sql" % (CFG_PREFIX, CFG_PREFIX) if os.system(cmd): print "ERROR: failed execution of", cmd sys.exit(1) print ">>> Tables dropped successfully." def cli_cmd_create_demo_site(conf): """Create demo site. Useful for testing purposes.""" print ">>> Going to create demo site..." from invenio.config import CFG_PREFIX from invenio.dbquery import run_sql run_sql("TRUNCATE schTASK") run_sql("TRUNCATE session") run_sql("DELETE FROM user WHERE email=''") for cmd in ["%s/bin/dbexec < %s/lib/sql/invenio/democfgdata.sql" % \ (CFG_PREFIX, CFG_PREFIX),]: if os.system(cmd): print "ERROR: failed execution of", cmd sys.exit(1) cli_cmd_reset_fieldnames(conf) # needed for I18N demo ranking method names for cmd in ["%s/bin/webaccessadmin -u admin -c -r -D" % CFG_PREFIX, "%s/bin/webcoll -u admin" % CFG_PREFIX, "%s/bin/webcoll 1" % CFG_PREFIX, "%s/bin/bibsort -u admin --load-config" % CFG_PREFIX, "%s/bin/bibsort 2" % CFG_PREFIX, ]: if os.system(cmd): print "ERROR: failed execution of", cmd sys.exit(1) print ">>> Demo site created successfully." def cli_cmd_load_demo_records(conf): """Load demo records. Useful for testing purposes.""" from invenio.config import CFG_PREFIX from invenio.dbquery import run_sql print ">>> Going to load demo records..." run_sql("TRUNCATE schTASK") for cmd in ["%s/bin/bibupload -u admin -i %s/var/tmp/demobibdata.xml" % (CFG_PREFIX, CFG_PREFIX), "%s/bin/bibupload 1" % CFG_PREFIX, "%s/bin/bibdocfile --textify --with-ocr --recid 97" % CFG_PREFIX, "%s/bin/bibdocfile --textify --all" % CFG_PREFIX, "%s/bin/bibindex -u admin" % CFG_PREFIX, "%s/bin/bibindex 2" % CFG_PREFIX, "%s/bin/bibindex -u admin -w global" % CFG_PREFIX, "%s/bin/bibindex 3" % CFG_PREFIX, "%s/bin/bibreformat -u admin -o HB" % CFG_PREFIX, "%s/bin/bibreformat 4" % CFG_PREFIX, "%s/bin/webcoll -u admin" % CFG_PREFIX, "%s/bin/webcoll 5" % CFG_PREFIX, "%s/bin/bibrank -u admin" % CFG_PREFIX, "%s/bin/bibrank 6" % CFG_PREFIX, "%s/bin/bibsort -u admin -R" % CFG_PREFIX, "%s/bin/bibsort 7" % CFG_PREFIX, "%s/bin/oairepositoryupdater -u admin" % CFG_PREFIX, "%s/bin/oairepositoryupdater 8" % CFG_PREFIX, "%s/bin/bibupload 9" % CFG_PREFIX,]: if os.system(cmd): print "ERROR: failed execution of", cmd sys.exit(1) print ">>> Demo records loaded successfully." def cli_cmd_remove_demo_records(conf): """Remove demo records. Useful when you are finished testing.""" print ">>> Going to remove demo records..." from invenio.config import CFG_PREFIX from invenio.dbquery import run_sql from invenio.textutils import wrap_text_in_a_box, wait_for_user wait_for_user(wrap_text_in_a_box("""WARNING: You are going to destroy your records and documents!""")) if os.path.exists(CFG_PREFIX + os.sep + 'var' + os.sep + 'data'): shutil.rmtree(CFG_PREFIX + os.sep + 'var' + os.sep + 'data') run_sql("TRUNCATE schTASK") for cmd in ["%s/bin/dbexec < %s/lib/sql/invenio/tabbibclean.sql" % (CFG_PREFIX, CFG_PREFIX), "%s/bin/webcoll -u admin" % CFG_PREFIX, "%s/bin/webcoll 1" % CFG_PREFIX,]: if os.system(cmd): print "ERROR: failed execution of", cmd sys.exit(1) print ">>> Demo records removed successfully." def cli_cmd_drop_demo_site(conf): """Drop demo site completely. 
Useful when you are finished testing.""" print ">>> Going to drop demo site..." from invenio.textutils import wrap_text_in_a_box, wait_for_user wait_for_user(wrap_text_in_a_box("""WARNING: You are going to destroy your site and documents!""")) cli_cmd_drop_tables(conf) cli_cmd_create_tables(conf) cli_cmd_remove_demo_records(conf) print ">>> Demo site dropped successfully." def cli_cmd_run_unit_tests(conf): """Run unit tests, usually on the working demo site.""" from invenio.testutils import build_and_run_unit_test_suite if not build_and_run_unit_test_suite(): sys.exit(1) def cli_cmd_run_js_unit_tests(conf): """Run JavaScript unit tests, usually on the working demo site.""" from invenio.testutils import build_and_run_js_unit_test_suite if not build_and_run_js_unit_test_suite(): sys.exit(1) def cli_cmd_run_regression_tests(conf): """Run regression tests, usually on the working demo site.""" from invenio.testutils import build_and_run_regression_test_suite if not build_and_run_regression_test_suite(): sys.exit(1) def cli_cmd_run_web_tests(conf): """Run web tests in a browser. Requires Firefox with Selenium.""" from invenio.testutils import build_and_run_web_test_suite if not build_and_run_web_test_suite(): sys.exit(1) def _detect_ip_address(conf): """Detect IP address of this computer. Useful for creating Apache vhost conf snippet on RHEL like machines. However, if wanted site is 0.0.0.0, then use that, since we are running inside Docker. @return: IP address, or '*' if cannot detect @rtype: string @note: creates socket for real in order to detect real IP address, not the loopback one. """ if '0.0.0.0' in conf.get('Invenio', 'CFG_SITE_URL'): return '0.0.0.0' try: s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) s.connect(('invenio-software.org', 0)) return s.getsockname()[0] except: return '*' def cli_cmd_create_apache_conf(conf): """ Create Apache conf files for this site, keeping previous files in a backup copy. """ print ">>> Going to create Apache conf files..." 
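    ## NOTE: when a concrete vhost IP is needed (RHEL/SLC systems other than
    ## CentOS 6/7), the templates below use _detect_ip_address() above, which
    ## "connects" a UDP socket (no packet is sent for a SOCK_DGRAM connect)
    ## and reads the chosen local address back with getsockname(), falling
    ## back to '*'.  A standalone sketch of the same trick, with a
    ## hypothetical probe host:
    ##
    ##   import socket
    ##   s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    ##   s.connect(('example.org', 53))     # needs name resolution only, no traffic
    ##   print s.getsockname()[0]           # outward-facing address, not the loopback one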
from invenio.textutils import wrap_text_in_a_box from invenio.access_control_config import CFG_EXTERNAL_AUTH_USING_SSO apache_conf_dir = conf.get("Invenio", 'CFG_ETCDIR') + \ os.sep + 'apache' if guess_apache_24(): directory_www_directive = """ # Uncomment the following on Apache < 2.4 # # Options FollowSymLinks MultiViews # AllowOverride None # Order allow,deny # Allow from all # # Comment the following on Apache < 2.4 Options FollowSymLinks MultiViews AllowOverride None Require all granted """ % {'webdir': conf.get('Invenio', 'CFG_WEBDIR')} directory_wsgi_directive = """ # Uncomment the following on Apache < 2.4 # # WSGIProcessGroup invenio # WSGIApplicationGroup %%{GLOBAL} # Options FollowSymLinks MultiViews # AllowOverride None # Order allow,deny # Allow from all # # Comment the following on Apache < 2.4 WSGIProcessGroup invenio WSGIApplicationGroup %%{GLOBAL} Options FollowSymLinks MultiViews AllowOverride None Require all granted """ % {'wsgidir': os.path.join(conf.get('Invenio', 'CFG_PREFIX'), 'var', 'www-wsgi')} else: directory_www_directive = """ # Comment the following on Apache >= 2.4 Options FollowSymLinks MultiViews AllowOverride None Order allow,deny Allow from all # Uncomment the following on Apache >= 2.4 # # Options FollowSymLinks MultiViews # AllowOverride None # Require all granted # """ % {'webdir': conf.get('Invenio', 'CFG_WEBDIR')} directory_wsgi_directive = """ # Comment the following on Apache >= 2.4 WSGIProcessGroup invenio WSGIApplicationGroup %%{GLOBAL} Options FollowSymLinks MultiViews AllowOverride None Order allow,deny Allow from all # Uncomment the following on Apache >= 2.4 # # WSGIProcessGroup invenio # WSGIApplicationGroup %%{GLOBAL} # Options FollowSymLinks MultiViews # AllowOverride None # Require all granted # """ % {'wsgidir': os.path.join(conf.get('Invenio', 'CFG_PREFIX'), 'var', 'www-wsgi')} ## Preparation of XSendFile directive xsendfile_directive_needed = int(conf.get("Invenio", 'CFG_BIBDOCFILE_USE_XSENDFILE')) != 0 if xsendfile_directive_needed: xsendfile_directive = "XSendFile On\n" else: xsendfile_directive = "#XSendFile On\n" for path in (conf.get('Invenio', 'CFG_BIBDOCFILE_FILEDIR'), # BibDocFile conf.get('Invenio', 'CFG_WEBDIR'), conf.get('Invenio', 'CFG_WEBSUBMIT_STORAGEDIR'), # WebSubmit conf.get('Invenio', 'CFG_TMPDIR'), os.path.join(conf.get('Invenio', 'CFG_PREFIX'), 'var', 'tmp', 'attachfile'), os.path.join(conf.get('Invenio', 'CFG_PREFIX'), 'var', 'data', 'comments'), os.path.join(conf.get('Invenio', 'CFG_PREFIX'), 'var', 'data', 'baskets', 'comments'), os.path.join(conf.get('Invenio', 'CFG_PREFIX'), 'lib', 'webdoc', 'invenio', 'info'), '/tmp'): # BibExport if xsendfile_directive_needed: xsendfile_directive += ' XSendFilePath %s\n' % path else: xsendfile_directive += ' #XSendFilePath %s\n' % path xsendfile_directive = xsendfile_directive.strip() ## Preparation of deflate directive deflate_directive_needed = int(conf.get("Invenio", 'CFG_WEBSTYLE_HTTP_USE_COMPRESSION')) != 0 if deflate_directive_needed: deflate_directive = r""" ## Configuration snippet taken from: ## SetOutputFilter DEFLATE # Netscape 4.x has some problems... BrowserMatch ^Mozilla/4 gzip-only-text/html # Netscape 4.06-4.08 have some more problems BrowserMatch ^Mozilla/4\.0[678] no-gzip # MSIE masquerades as Netscape, but it is fine # BrowserMatch \bMSIE !no-gzip !gzip-only-text/html # NOTE: Due to a bug in mod_setenvif up to Apache 2.0.48 # the above regex won't work. 
You can use the following # workaround to get the desired effect: BrowserMatch \bMSI[E] !no-gzip !gzip-only-text/html # Don't compress images SetEnvIfNoCase Request_URI \ \.(?:gif|jpe?g|png)$ no-gzip dont-vary # Make sure proxies don't deliver the wrong content Header append Vary User-Agent env=!dont-vary """ else: deflate_directive = "" if CFG_EXTERNAL_AUTH_USING_SSO: shibboleth_directive = r""" SSLRequireSSL # The modules only work using HTTPS AuthType shibboleth ShibRequireSession On ShibRequireAll On ShibExportAssertion Off require valid-user """ else: shibboleth_directive = "" ## Apache vhost conf file is distro specific, so analyze needs: # Gentoo (and generic defaults): listen_directive_needed = True ssl_pem_directive_needed = False ssl_pem_path = '/etc/apache2/ssl/apache.pem' ssl_crt_path = '/etc/apache2/ssl/server.crt' ssl_key_path = '/etc/apache2/ssl/server.key' vhost_ip_address_needed = False wsgi_socket_directive_needed = False # Debian: if os.path.exists(os.path.sep + 'etc' + os.path.sep + 'debian_version'): listen_directive_needed = False ssl_pem_directive_needed = True # RHEL/SLC: if os.path.exists(os.path.sep + 'etc' + os.path.sep + 'redhat-release'): listen_directive_needed = False ssl_crt_path = '/etc/pki/tls/certs/localhost.crt' ssl_key_path = '/etc/pki/tls/private/localhost.key' vhost_ip_address_needed = True if os.popen('grep -c "CentOS.*[67]\." /etc/redhat-release').read().strip() == '1': vhost_ip_address_needed = False wsgi_socket_directive_needed = True # maybe we are using non-standard ports? vhost_site_url = conf.get('Invenio', 'CFG_SITE_URL').replace("http://", "") if vhost_site_url.startswith("https://"): ## The installation is configured to require HTTPS for any connection vhost_site_url = vhost_site_url.replace("https://", "") vhost_site_url_port = '80' vhost_site_secure_url = conf.get('Invenio', 'CFG_SITE_SECURE_URL').replace("https://", "") vhost_site_secure_url_port = '443' if ':' in vhost_site_url: vhost_site_url, vhost_site_url_port = vhost_site_url.split(':', 1) if ':' in vhost_site_secure_url: vhost_site_secure_url, vhost_site_secure_url_port = vhost_site_secure_url.split(':', 1) if vhost_site_url_port != '80' or vhost_site_secure_url_port != '443': listen_directive_needed = True ## OK, let's create Apache vhost files: if not os.path.exists(apache_conf_dir): os.mkdir(apache_conf_dir) apache_vhost_file = apache_conf_dir + os.sep + \ 'invenio-apache-vhost.conf' apache_vhost_ssl_file = apache_conf_dir + os.sep + \ 'invenio-apache-vhost-ssl.conf' apache_vhost_body = """\ AddDefaultCharset UTF-8 ServerSignature Off ServerTokens Prod NameVirtualHost %(vhost_ip_address)s:%(vhost_site_url_port)s %(listen_directive)s %(wsgi_socket_directive)s WSGIPythonHome %(wsgi_python_home)s WSGIRestrictStdout Off deny from all deny from all ServerName %(servername)s ServerAlias %(serveralias)s ServerAdmin %(serveradmin)s DocumentRoot %(webdir)s %(directory_www_directive)s ErrorLog %(logdir)s/apache.err LogLevel warn LogFormat "%%h %%l %%u %%t \\"%%r\\" %%>s %%b \\"%%{Referer}i\\" \\"%%{User-agent}i\\" %%D" combined_with_timing CustomLog %(logdir)s/apache.log combined_with_timing DirectoryIndex index.en.html index.html Alias /static/ %(webdir)s/static/ Alias /img/ %(webdir)s/img/ Alias /css/ %(webdir)s/css/ Alias /js/ %(webdir)s/js/ Alias /flash/ %(webdir)s/flash/ Alias /export/ %(webdir)s/export/ Alias /MathJax/ %(webdir)s/MathJax/ Alias /jsCalendar/ %(webdir)s/jsCalendar/ Alias /ckeditor/ %(webdir)s/ckeditor/ Alias /mediaelement/ %(webdir)s/mediaelement/ AliasMatch 
/sitemap-(.*) %(webdir)s/sitemap-$1 Alias /robots.txt %(webdir)s/robots.txt Alias /favicon.ico %(webdir)s/favicon.ico WSGIDaemonProcess invenio processes=5 threads=1 display-name=%%{GROUP} inactivity-timeout=3600 maximum-requests=10000 %(wsgiuser)s WSGIImportScript %(wsgidir)s/invenio.wsgi process-group=invenio application-group=%%{GLOBAL} WSGIScriptAlias / %(wsgidir)s/invenio.wsgi WSGIPassAuthorization On %(xsendfile_directive)s %(directory_wsgi_directive)s %(deflate_directive)s """ % {'vhost_site_url_port': vhost_site_url_port, 'servername': vhost_site_url, 'serveralias': vhost_site_url.split('.')[0], 'serveradmin': conf.get('Invenio', 'CFG_SITE_ADMIN_EMAIL'), 'webdir': conf.get('Invenio', 'CFG_WEBDIR'), 'logdir': conf.get('Invenio', 'CFG_LOGDIR'), 'wsgidir': os.path.join(conf.get('Invenio', 'CFG_PREFIX'), 'var', 'www-wsgi'), 'wsgiuser': conf.get('Invenio', 'CFG_BIBSCHED_PROCESS_USER') and 'user='+conf.get('Invenio', 'CFG_BIBSCHED_PROCESS_USER'), 'vhost_ip_address': vhost_ip_address_needed and _detect_ip_address(conf) or '*', 'listen_directive': listen_directive_needed and 'Listen ' + vhost_site_url_port or \ '#Listen ' + vhost_site_url_port, 'wsgi_python_home': sys.prefix, 'wsgi_socket_directive': (wsgi_socket_directive_needed and \ 'WSGISocketPrefix ' or '#WSGISocketPrefix ') + \ conf.get('Invenio', 'CFG_PREFIX') + os.sep + 'var' + os.sep + 'run', 'xsendfile_directive': xsendfile_directive, 'directory_www_directive': directory_www_directive, 'directory_wsgi_directive': directory_wsgi_directive, 'deflate_directive': deflate_directive, } apache_vhost_ssl_body = """\ ServerSignature Off ServerTokens Prod %(listen_directive)s NameVirtualHost %(vhost_ip_address)s:%(vhost_site_secure_url_port)s %(ssl_pem_directive)s %(ssl_crt_directive)s %(ssl_key_directive)s %(ssl_protocol_directive)s %(ssl_cipher_directive)s WSGIRestrictStdout Off deny from all deny from all ServerName %(servername)s ServerAlias %(serveralias)s ServerAdmin %(serveradmin)s SSLEngine on DocumentRoot %(webdir)s %(directory_www_directive)s ErrorLog %(logdir)s/apache-ssl.err LogLevel warn LogFormat "%%h %%l %%u %%t \\"%%r\\" %%>s %%b \\"%%{Referer}i\\" \\"%%{User-agent}i\\" %%D" combined_with_timing CustomLog %(logdir)s/apache-ssl.log combined_with_timing DirectoryIndex index.en.html index.html Alias /static/ %(webdir)s/static/ Alias /img/ %(webdir)s/img/ Alias /css/ %(webdir)s/css/ Alias /js/ %(webdir)s/js/ Alias /flash/ %(webdir)s/flash/ Alias /export/ %(webdir)s/export/ Alias /MathJax/ %(webdir)s/MathJax/ Alias /jsCalendar/ %(webdir)s/jsCalendar/ Alias /ckeditor/ %(webdir)s/ckeditor/ Alias /mediaelement/ %(webdir)s/mediaelement/ AliasMatch /sitemap-(.*) %(webdir)s/sitemap-$1 Alias /robots.txt %(webdir)s/robots.txt Alias /favicon.ico %(webdir)s/favicon.ico RedirectMatch /sslredirect/(.*) http://$1 WSGIScriptAlias / %(wsgidir)s/invenio.wsgi WSGIPassAuthorization On %(xsendfile_directive)s %(directory_wsgi_directive)s %(deflate_directive)s %(shibboleth_directive)s """ % {'vhost_site_secure_url_port': vhost_site_secure_url_port, 'servername': vhost_site_secure_url, 'serveralias': vhost_site_secure_url.split('.')[0], 'serveradmin': conf.get('Invenio', 'CFG_SITE_ADMIN_EMAIL'), 'webdir': conf.get('Invenio', 'CFG_WEBDIR'), 'logdir': conf.get('Invenio', 'CFG_LOGDIR'), 'wsgidir' : os.path.join(conf.get('Invenio', 'CFG_PREFIX'), 'var', 'www-wsgi'), 'vhost_ip_address': vhost_ip_address_needed and _detect_ip_address(conf) or '*', 'listen_directive' : listen_directive_needed and 'Listen ' + vhost_site_secure_url_port or \ '#Listen ' + 
vhost_site_secure_url_port, 'ssl_pem_directive': ssl_pem_directive_needed and \ 'SSLCertificateFile %s' % ssl_pem_path or \ '#SSLCertificateFile %s' % ssl_pem_path, 'ssl_crt_directive': ssl_pem_directive_needed and \ '#SSLCertificateFile %s' % ssl_crt_path or \ 'SSLCertificateFile %s' % ssl_crt_path, 'ssl_key_directive': ssl_pem_directive_needed and \ '#SSLCertificateKeyFile %s' % ssl_key_path or \ 'SSLCertificateKeyFile %s' % ssl_key_path, 'ssl_protocol_directive': ssl_pem_directive_needed and \ 'SSLProtocol all -SSLv2 -SSLv3' or \ '#SSLProtocol all -SSLv2 -SSLv3', 'ssl_cipher_directive': ssl_pem_directive_needed and \ 'SSLCipherSuite HIGH:MEDIUM:!ADH' or \ '#SSLCipherSuite HIGH:MEDIUM:!ADH', 'xsendfile_directive': xsendfile_directive, 'directory_www_directive': directory_www_directive, 'directory_wsgi_directive': directory_wsgi_directive, 'deflate_directive': deflate_directive, 'shibboleth_directive': shibboleth_directive, } # write HTTP vhost snippet: if os.path.exists(apache_vhost_file): shutil.copy(apache_vhost_file, apache_vhost_file + '.OLD') fdesc = open(apache_vhost_file, 'w') fdesc.write(apache_vhost_body) fdesc.close() print print "Created file", apache_vhost_file # write HTTPS vhost snippet: vhost_ssl_created = False if conf.get('Invenio', 'CFG_SITE_SECURE_URL').startswith("https://"): if os.path.exists(apache_vhost_ssl_file): shutil.copy(apache_vhost_ssl_file, apache_vhost_ssl_file + '.OLD') fdesc = open(apache_vhost_ssl_file, 'w') fdesc.write(apache_vhost_ssl_body) fdesc.close() vhost_ssl_created = True print "Created file", apache_vhost_ssl_file print wrap_text_in_a_box("""\ Apache virtual host configuration file(s) for your Invenio site was(were) created. Please check created file(s) and activate virtual host(s). For example, you can put the following include statements in your httpd.conf:\n Include %s %s Please see the INSTALL file for more details. """ % (apache_vhost_file, (vhost_ssl_created and 'Include ' or '#Include ') + apache_vhost_ssl_file)) print ">>> Apache conf files created." def cli_cmd_get(conf, varname): """ Return value of VARNAME read from CONF files. Useful for third-party programs to access values of conf options such as CFG_PREFIX. Return None if VARNAME is not found. """ try: if not varname: raise Exception("ERROR: Please specify a configuration variable.") varname = varname.lower() # do not pay attention to section names yet: all_options = {} for section in conf.sections(): for option in conf.options(section): all_options[option] = conf.get(section, option) varvalue = all_options.get(varname, None) if varvalue is None: raise Exception() print varvalue except Exception, e: if e.message: print e.message sys.exit(1) def cli_cmd_list(conf): """ Print a list of all conf options and values from CONF. """ sections = conf.sections() sections.sort() for section in sections: options = conf.options(section) options.sort() for option in options: print option.upper(), '=', conf.get(section, option) def _grep_version_from_executable(path_to_exec, version_regexp): """ Try to detect a program version by digging into its binary PATH_TO_EXEC and looking for VERSION_REGEXP. Return program version as a string. Return empty string if not succeeded. 
""" from invenio.shellutils import run_shell_command exec_version = "" if os.path.exists(path_to_exec): dummy1, cmd2_out, dummy2 = run_shell_command("strings %s | grep %s", (path_to_exec, version_regexp)) if cmd2_out: for cmd2_out_line in cmd2_out.split("\n"): if len(cmd2_out_line) > len(exec_version): # the longest the better exec_version = cmd2_out_line return exec_version _RE_APACHE_MAJOR_VERSION = re.compile(r"Apache/(\d+\.\d+)") def guess_apache_24(apache_versions=None): """ Returns True if it looks like the system is running Apache 2.4 or later. """ if apache_versions is None: apache_versions = detect_apache_version() for apache_version in apache_versions: g = _RE_APACHE_MAJOR_VERSION.search(apache_version) if g: try: version = float(g.group(1)) except ValueError: continue if version >= 2.4: return True return False def detect_apache_version(): """ Try to detect Apache version by localizing httpd or apache executables and grepping inside binaries. Return list of all found Apache versions and paths. (For a given executable, the returned format is 'apache_version [apache_path]'.) Return empty list if no success. """ from invenio.shellutils import run_shell_command out = [] dummy1, cmd_out, dummy2 = run_shell_command("locate bin/httpd bin/apache") for apache in cmd_out.split("\n"): apache_version = _grep_version_from_executable(apache, '^Apache\/') if apache_version: out.append("%s [%s]" % (apache_version, apache)) return out def cli_cmd_detect_system_details(conf): """ Detect and print system details such as Apache/Python/MySQL versions etc. Useful for debugging problems on various OS. """ import MySQLdb print ">>> Going to detect system details..." print "* Hostname: " + socket.gethostname() print "* Invenio version: " + conf.get("Invenio", "CFG_VERSION") print "* Python version: " + sys.version.replace("\n", " ") print "* Apache version: " + ";\n ".join(detect_apache_version()) print "* MySQLdb version: " + MySQLdb.__version__ try: from invenio.dbquery import run_sql print "* MySQL version:" for key, val in run_sql("SHOW VARIABLES LIKE 'version%'") + \ run_sql("SHOW VARIABLES LIKE 'charact%'") + \ run_sql("SHOW VARIABLES LIKE 'collat%'"): if False: print " - %s: %s" % (key, val) elif key in ['version', 'character_set_client', 'character_set_connection', 'character_set_database', 'character_set_results', 'character_set_server', 'character_set_system', 'collation_connection', 'collation_database', 'collation_server']: print " - %s: %s" % (key, val) except ImportError: print "* ERROR: cannot import dbquery" print ">>> System details detected successfully." def cli_cmd_upgrade(conf): """ Command for applying upgrades """ from invenio.inveniocfg_upgrader import cmd_upgrade cmd_upgrade(conf) def cli_cmd_upgrade_check(conf): """ Command for running pre-upgrade checks """ from invenio.inveniocfg_upgrader import cmd_upgrade_check cmd_upgrade_check(conf) def cli_cmd_upgrade_show_pending(conf): """ Command for showing upgrades ready to be applied """ from invenio.inveniocfg_upgrader import cmd_upgrade_show_pending cmd_upgrade_show_pending(conf) def cli_cmd_upgrade_show_applied(conf): """ Command for showing all upgrades already applied. """ from invenio.inveniocfg_upgrader import cmd_upgrade_show_applied cmd_upgrade_show_applied(conf) def cli_cmd_upgrade_create_release_recipe(conf, path): """ Create a new release upgrade recipe (for developers). 
""" from invenio.inveniocfg_upgrader import cmd_upgrade_create_release_recipe cmd_upgrade_create_release_recipe(conf, path) def cli_cmd_upgrade_create_standard_recipe(conf, path, depends_on=None, release=False): """ Create a new upgrade recipe (for developers). """ from invenio.inveniocfg_upgrader import cmd_upgrade_create_standard_recipe cmd_upgrade_create_standard_recipe(conf, path, depends_on=depends_on, release=release) def prepare_option_parser(): """Parse the command line options.""" class InvenioOption(Option): """ Option class that implements the action 'store_append_const' which will 1) append to list in options. 2) take a value and store in options. Useful for e.g. appending a const to an actions list, while also taking an option value and storing it. This ensures that we can run actions in the order they are given on the command-line. Python 2.4 compatibility note: *append_const* action is not available in Python 2.4, so it is implemented here, together with the new action *store_append_const*. """ ACTIONS = Option.ACTIONS + ("store_append_const", "append_const") STORE_ACTIONS = Option.STORE_ACTIONS + ("store_append_const", "append_const") TYPED_ACTIONS = Option.TYPED_ACTIONS + ("store_append_const", ) ALWAYS_TYPED_ACTIONS = Option.ALWAYS_TYPED_ACTIONS + ("store_append_const", ) CONST_ACTIONS = getattr(Option, 'CONST_ACTIONS', ()) + ("store_append_const", "append_const") def take_action(self, action, dest, opt, value, values, parser): if action == "store_append_const": # Combination of 'store' and 'append_const' actions values.ensure_value(dest, []).append(self.const) value_dest = self.const.replace('-', '_') setattr(values, value_dest, value) elif action == "append_const" and not hasattr(Option, 'CONST_ACTIONS'): values.ensure_value(dest, []).append(self.const) else: Option.take_action(self, action, dest, opt, value, values, parser) def _check_const(self): if self.action not in self.CONST_ACTIONS and self.const is not None: raise OptionError( "'const' must not be supplied for action %r" % self.action, self) CHECK_METHODS = [ Option._check_action, Option._check_type, Option._check_choice, Option._check_dest, _check_const, Option._check_nargs, Option._check_callback, ] parser = OptionParser(option_class=InvenioOption, description="Invenio configuration and administration CLI tool", formatter=IndentedHelpFormatter(max_help_position=31)) parser.add_option("-V", "--version", action="store_true", help="print version number") finish_options = OptionGroup(parser, "Options to finish your installation") finish_options.add_option("", "--create-apache-conf", dest='actions', const='create-apache-conf', action="append_const", help="create Apache configuration files") finish_options.add_option("", "--create-tables", dest='actions', const='create-tables', action="append_const", help="create DB tables for Invenio") finish_options.add_option("", "--load-bibfield-conf", dest='actions', const='load-bibfield-conf', action="append_const", help="load bibfield configuration file") finish_options.add_option("", "--load-webstat-conf", dest='actions', const='load-webstat-conf', action="append_const", help="load the WebStat configuration") finish_options.add_option("", "--drop-tables", dest='actions', const='drop-tables', action="append_const", help="drop DB tables of Invenio") finish_options.add_option("", "--check-openoffice", dest='actions', const='check-openoffice', action="append_const", help="check for correctly set up of openoffice temporary directory") parser.add_option_group(finish_options) 
demotest_options = OptionGroup(parser, "Options to set up and test a demo site") demotest_options.add_option("", "--create-demo-site", dest='actions', const='create-demo-site', action="append_const", help="create demo site") demotest_options.add_option("", "--load-demo-records", dest='actions', const='load-demo-records', action="append_const", help="load demo records") demotest_options.add_option("", "--remove-demo-records", dest='actions', const='remove-demo-records', action="append_const", help="remove demo records, keeping demo site") demotest_options.add_option("", "--drop-demo-site", dest='actions', const='drop-demo-site', action="append_const", help="drop demo site configurations too") demotest_options.add_option("", "--run-unit-tests", dest='actions', const='run-unit-tests', action="append_const", help="run unit test suite (needs demo site)") demotest_options.add_option("", "--run-js-unit-tests", dest='actions', const='run-js-unit-tests', action="append_const", help="run JS unit test suite (needs demo site)") demotest_options.add_option("", "--run-regression-tests", dest='actions', const='run-regression-tests', action="append_const", help="run regression test suite (needs demo site)") demotest_options.add_option("", "--run-web-tests", dest='actions', const='run-web-tests', action="append_const", help="run web tests in a browser (needs demo site, Firefox, Selenium IDE)") parser.add_option_group(demotest_options) config_options = OptionGroup(parser, "Options to update config files in situ") config_options.add_option("", "--update-all", dest='actions', const='update-all', action="append_const", help="perform all the update options") config_options.add_option("", "--update-config-py", dest='actions', const='update-config-py', action="append_const", help="update config.py file from invenio.conf file") config_options.add_option("", "--update-dbquery-py", dest='actions', const='update-dbquery-py', action="append_const", help="update dbquery.py with DB credentials from invenio.conf") config_options.add_option("", "--update-dbexec", dest='actions', const='update-dbexec', action="append_const", help="update dbexec with DB credentials from invenio.conf") config_options.add_option("", "--update-bibconvert-tpl", dest='actions', const='update-bibconvert-tpl', action="append_const", help="update bibconvert templates with CFG_SITE_URL from invenio.conf") config_options.add_option("", "--update-web-tests", dest='actions', const='update-web-tests', action="append_const", help="update web test cases with CFG_SITE_URL from invenio.conf") parser.add_option_group(config_options) reset_options = OptionGroup(parser, "Options to update DB tables") reset_options.add_option("", "--reset-all", dest='actions', const='reset-all', action="append_const", help="perform all the reset options") reset_options.add_option("", "--reset-sitename", dest='actions', const='reset-sitename', action="append_const", help="reset tables to take account of new CFG_SITE_NAME*") reset_options.add_option("", "--reset-siteadminemail", dest='actions', const='reset-siteadminemail', action="append_const", help="reset tables to take account of new CFG_SITE_ADMIN_EMAIL") reset_options.add_option("", "--reset-fieldnames", dest='actions', const='reset-fieldnames', action="append_const", help="reset tables to take account of new I18N names from PO files") reset_options.add_option("", "--reset-recstruct-cache", dest='actions', const='reset-recstruct-cache', action="append_const", help="reset record structure cache according to 
CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE") reset_options.add_option("", "--reset-recjson-cache", dest='actions', const='reset-recjson-cache', action="append_const", help="reset record json structure cache according to CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE") parser.add_option_group(reset_options) upgrade_options = OptionGroup(parser, "Options to upgrade your installation") upgrade_options.add_option("", "--upgrade", dest='actions', const='upgrade', action="append_const", help="apply all pending upgrades") upgrade_options.add_option("", "--upgrade-check", dest='actions', const='upgrade-check', action="append_const", help="run pre-upgrade checks for pending upgrades") upgrade_options.add_option("", "--upgrade-show-pending", dest='actions', const='upgrade-show-pending', action="append_const", help="show pending upgrades") upgrade_options.add_option("", "--upgrade-show-applied", dest='actions', const='upgrade-show-applied', action="append_const", help="show history of applied upgrades") upgrade_options.add_option("", "--upgrade-create-standard-recipe", dest='actions', metavar='REPOSITORY[,DIR]', const='upgrade-create-standard-recipe', action="store_append_const", help="create a new standard upgrade recipe (for developers)") upgrade_options.add_option("", "--upgrade-create-release-recipe", dest='actions', metavar='REPOSITORY[,DIR]', const='upgrade-create-release-recipe', action="store_append_const", help="create a new release upgrade recipe (for developers)") parser.add_option_group(upgrade_options) helper_options = OptionGroup(parser, "Options to help the work") helper_options.add_option("", "--list", dest='actions', const='list', action="append_const", help="print names and values of all options from conf files") helper_options.add_option("", "--get", dest='actions', const='get', action="store_append_const", metavar="OPTION", help="get value of a given option from conf files") helper_options.add_option("", "--conf-dir", action="store", metavar="PATH", help="path to directory where invenio*.conf files are [optional]") helper_options.add_option("", "--detect-system-details", dest='actions', const='detect-system-details', action="append_const", help="print system details such as Apache/Python/MySQL versions") parser.add_option_group(helper_options) parser.add_option('--yes-i-know', action='store_true', dest='yes-i-know', help='use with care!') parser.add_option('-x', '--stop', action='store_true', dest='stop_on_error', help='When running tests, stop at first error') return parser def prepare_conf(options): """ Read configuration files """ conf = ConfigParser() confdir = getattr(options, 'conf_dir', None) if confdir is None: ## try to detect path to conf dir (relative to this bin dir): confdir = re.sub(r'/bin$', '/etc', sys.path[0]) if confdir and not os.path.exists(confdir): raise Exception("ERROR: bad --conf-dir option value - directory does not exists.") sys.exit(1) ## read conf files: for conffile in [confdir + os.sep + 'invenio.conf', confdir + os.sep + 'invenio-autotools.conf', confdir + os.sep + 'invenio-local.conf', ]: if os.path.exists(conffile): conf.read(conffile) else: if not conffile.endswith("invenio-local.conf"): # invenio-local.conf is optional, otherwise stop raise Exception("ERROR: Badly guessed conf file location %s (Please use --conf-dir option.)" % conffile) return conf def main(*cmd_args): """Main entry point.""" # Allow easier testing if not cmd_args: cmd_args = sys.argv[1:] # Parse arguments parser = prepare_option_parser() (options, dummy_args) = 
parser.parse_args(list(cmd_args)) if getattr(options, 'stop_on_error', False): from invenio.testutils import wrap_failfast wrap_failfast() if getattr(options, 'version', False): print_version() else: # Read configuration try: conf = prepare_conf(options) except Exception, e: print e sys.exit(1) ## Decide what to do actions = getattr(options, 'actions', None) if not actions: print """ERROR: Please specify a command. Please see '--help'.""" sys.exit(1) for action in actions: if action == 'get': cli_cmd_get(conf, getattr(options, 'get', None)) elif action == 'list': cli_cmd_list(conf) elif action == 'detect-system-details': cli_cmd_detect_system_details(conf) elif action == 'create-tables': cli_cmd_create_tables(conf) elif action == 'load-webstat-conf': cli_cmd_load_webstat_conf(conf) elif action == 'drop-tables': cli_cmd_drop_tables(conf) elif action == 'check-openoffice': cli_check_openoffice(conf) elif action == 'load-bibfield-conf': cli_cmd_load_bibfield_config(conf) elif action == 'create-demo-site': cli_cmd_create_demo_site(conf) elif action == 'load-demo-records': cli_cmd_load_demo_records(conf) elif action == 'remove-demo-records': cli_cmd_remove_demo_records(conf) elif action == 'drop-demo-site': cli_cmd_drop_demo_site(conf) elif action == 'run-unit-tests': cli_cmd_run_unit_tests(conf) elif action == 'run-js-unit-tests': cli_cmd_run_js_unit_tests(conf) elif action == 'run-regression-tests': cli_cmd_run_regression_tests(conf) elif action == 'run-web-tests': cli_cmd_run_web_tests(conf) elif action == 'update-all': cli_cmd_update_config_py(conf) cli_cmd_update_dbquery_py(conf) cli_cmd_update_dbexec(conf) cli_cmd_update_bibconvert_tpl(conf) cli_cmd_update_web_tests(conf) elif action == 'update-config-py': cli_cmd_update_config_py(conf) elif action == 'update-dbquery-py': cli_cmd_update_dbquery_py(conf) elif action == 'update-dbexec': cli_cmd_update_dbexec(conf) elif action == 'update-bibconvert-tpl': cli_cmd_update_bibconvert_tpl(conf) elif action == 'update-web-tests': cli_cmd_update_web_tests(conf) elif action == 'reset-all': cli_cmd_reset_sitename(conf) cli_cmd_reset_siteadminemail(conf) cli_cmd_reset_fieldnames(conf) cli_cmd_reset_recstruct_cache(conf) elif action == 'reset-sitename': cli_cmd_reset_sitename(conf) elif action == 'reset-siteadminemail': cli_cmd_reset_siteadminemail(conf) elif action == 'reset-fieldnames': cli_cmd_reset_fieldnames(conf) elif action == 'reset-recstruct-cache': cli_cmd_reset_recstruct_cache(conf) elif action == 'reset-recjson-cache': cli_cmd_reset_recjson_cache(conf) elif action == 'create-apache-conf': cli_cmd_create_apache_conf(conf) elif action == 'upgrade': cli_cmd_upgrade(conf) elif action == 'upgrade-check': cli_cmd_upgrade_check(conf) elif action == 'upgrade-show-pending': cli_cmd_upgrade_show_pending(conf) elif action == 'upgrade-show-applied': cli_cmd_upgrade_show_applied(conf) elif action == 'upgrade-create-standard-recipe': cli_cmd_upgrade_create_standard_recipe(conf, getattr(options, 'upgrade_create_standard_recipe', None)) elif action == 'upgrade-create-release-recipe': cli_cmd_upgrade_create_release_recipe(conf, getattr(options, 'upgrade_create_release_recipe', None)) else: print "ERROR: Unknown command", action sys.exit(1) if __name__ == '__main__': main() diff --git a/modules/webaccess/lib/access_control_firerole.py b/modules/webaccess/lib/access_control_firerole.py index 01e1fb756..efd51a505 100644 --- a/modules/webaccess/lib/access_control_firerole.py +++ b/modules/webaccess/lib/access_control_firerole.py @@ -1,337 +1,344 @@ # This file 
is part of Invenio. -# Copyright (C) 2007, 2008, 2009, 2010, 2011, 2013 CERN. +# Copyright (C) 2007, 2008, 2009, 2010, 2011, 2013, 2016 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """Invenio Access Control FireRole.""" __revision__ = "$Id$" __lastupdated__ = """$Date$""" """These functions are for realizing a firewall like role definition for extending webaccess to connect user to roles using every infos about users. """ import re import cPickle from zlib import compress, decompress import sys import time if sys.hexversion < 0x2040000: # pylint: disable=W0622 from sets import Set as set # pylint: enable=W0622 from invenio.access_control_config import InvenioWebAccessFireroleError from invenio.dbquery import run_sql, blob_to_string from invenio.config import CFG_CERN_SITE from invenio.access_control_config import CFG_ACC_EMPTY_ROLE_DEFINITION_SRC, \ CFG_ACC_EMPTY_ROLE_DEFINITION_SER, CFG_ACC_EMPTY_ROLE_DEFINITION_OBJ from invenio.errorlib import register_exception # INTERFACE def compile_role_definition(firerole_def_src): """ Given a text in which every row contains a rule it returns the compiled object definition. Rules have the following syntax: allow|deny [not] field {list of one or more (double)quoted string or regexp} or allow|deny any Every row may contain a # sign followed by a comment which are discarded. Field could be any key contained in a user_info dictionary. If the key does not exist in the dictionary, the rule is skipped. The first rule which matches return. """ line = 0 ret = [] default_allow_p = False if not firerole_def_src or not firerole_def_src.strip(): firerole_def_src = CFG_ACC_EMPTY_ROLE_DEFINITION_SRC for row in firerole_def_src.split('\n'): line += 1 row = row.strip() if not row: continue clean_row = _no_comment_re.sub('', row) if clean_row: g = _any_rule_re.match(clean_row) if g: default_allow_p = g.group('command').lower() == 'allow' break g = _rule_re.match(clean_row) if g: allow_p = g.group('command').lower() == 'allow' not_p = g.group('not') != None field = g.group('field').lower() # Renaming groups to group for alias_item in _aliasTable: if field in alias_item: field = alias_item[0] break if field.startswith('precached_'): raise InvenioWebAccessFireroleError("Error while compiling rule %s (line %s): %s is a reserved key and can not be used in FireRole rules!" % (row, line, field)) expressions = g.group('expression')+g.group('more_expressions') expressions_list = [] for expr in _expressions_re.finditer(expressions): expr = expr.group() if field in ('from', 'until'): try: expressions_list.append((False, time.mktime(time.strptime(expr[1:-1], '%Y-%m-%d')))) except Exception, msg: raise InvenioWebAccessFireroleError("Syntax error while compiling rule %s (line %s): %s is not a valid date with format YYYY-MM-DD because %s!" 
% (row, line, expr, msg)) elif expr[0] == '/': try: expressions_list.append((True, re.compile(expr[1:-1], re.I))) except Exception, msg: raise InvenioWebAccessFireroleError("Syntax error while compiling rule %s (line %s): %s is not a valid re because %s!" % (row, line, expr, msg)) else: if field == 'remote_ip' and '/' in expr[1:-1]: try: expressions_list.append((False, _ip_matcher_builder(expr[1:-1]))) except Exception, msg: raise InvenioWebAccessFireroleError("Syntax error while compiling rule %s (line %s): %s is not a valid ip group because %s!" % (row, line, expr, msg)) else: expressions_list.append((False, expr[1:-1])) expressions_list = tuple(expressions_list) if field in ('from', 'until'): if len(expressions_list) != 1: raise InvenioWebAccessFireroleError("Error when compiling rule %s (line %s): exactly one date is expected when using 'from' or 'until', but %s were found" % (row, line, len(expressions_list))) if not_p: raise InvenioWebAccessFireroleError("Error when compiling rule %s (line %s): 'not' is not allowed when using 'from' or 'until'" % (row, line)) ret.append((allow_p, not_p, field, expressions_list)) else: raise InvenioWebAccessFireroleError("Syntax error while compiling rule %s (line %s): not a valid rule!" % (row, line)) return (default_allow_p, tuple(ret)) def repair_role_definitions(): """ Try to rebuild compiled serialized definitions from their respectives sources. This is needed in case Python break back compatibility. """ definitions = run_sql("SELECT id, firerole_def_src FROM accROLE") for role_id, firerole_def_src in definitions: - run_sql("UPDATE accROLE SET firerole_def_ser=%s WHERE id=%s", (serialize(compile_role_definition(firerole_def_src)), role_id)) + firerole_def_ser = serialize(compile_role_definition(firerole_def_src)) + if firerole_def_ser: + run_sql("UPDATE accROLE SET firerole_def_ser=_binary %s WHERE id=%s", (firerole_def_ser, role_id)) + else: + run_sql("UPDATE accROLE SET firerole_def_ser=%s WHERE id=%s", (firerole_def_ser, role_id)) def store_role_definition(role_id, firerole_def_ser, firerole_def_src): """ Store a compiled serialized definition and its source in the database alongside the role to which it belong. @param role_id: the role_id @param firerole_def_ser: the serialized compiled definition @param firerole_def_src: the sources from which the definition was taken """ - run_sql("UPDATE accROLE SET firerole_def_ser=%s, firerole_def_src=%s WHERE id=%s", (firerole_def_ser, firerole_def_src, role_id)) + if firerole_def_ser: + run_sql("UPDATE accROLE SET firerole_def_ser=_binary %s, firerole_def_src=%s WHERE id=%s", (firerole_def_ser, firerole_def_src, role_id)) + else: + run_sql("UPDATE accROLE SET firerole_def_ser=%s, firerole_def_src=%s WHERE id=%s", (firerole_def_ser, firerole_def_src, role_id)) def load_role_definition(role_id): """ Load the definition corresponding to a role. If the compiled definition is corrupted it try to repairs definitions from their sources and try again to return the definition. @param role_id: @return: a deserialized compiled role definition """ res = run_sql("SELECT firerole_def_ser FROM accROLE WHERE id=%s", (role_id, ), 1, run_on_slave=True) if res: try: return deserialize(res[0][0]) except Exception: ## Something bad might have happened? (Update of Python?) 
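# The stored pickle could not be loaded (e.g. after a Python upgrade), so rebuild every serialized
# definition from its source (repair_role_definitions() re-stores them, using the _binary introducer
# so MySQL keeps the compressed pickle as raw bytes) and then retry the lookup once.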
repair_role_definitions() res = run_sql("SELECT firerole_def_ser FROM accROLE WHERE id=%s", (role_id, ), 1, run_on_slave=True) if res: return deserialize(res[0][0]) return CFG_ACC_EMPTY_ROLE_DEFINITION_OBJ def acc_firerole_extract_emails(firerole_def_obj): """ Best effort function to extract all the possible email addresses authorized by the given firerole. """ authorized_emails = set() try: default_allow_p, rules = firerole_def_obj for (allow_p, not_p, field, expressions_list) in rules: # for every rule if not_p: continue if field == 'group': for reg_p, expr in expressions_list: if reg_p: continue if CFG_CERN_SITE and expr.endswith(' [CERN]'): authorized_emails.add(expr[:-len(' [CERN]')].lower().strip() + '@cern.ch') emails = run_sql("SELECT user.email FROM usergroup JOIN user_usergroup ON usergroup.id=user_usergroup.id_usergroup JOIN user ON user.id=user_usergroup.id_user WHERE usergroup.name=%s", (expr, )) for email in emails: authorized_emails.add(email[0].lower().strip()) elif field == 'email': for reg_p, expr in expressions_list: if reg_p: continue authorized_emails.add(expr.lower().strip()) elif field == 'uid': for reg_p, expr in expressions_list: if reg_p: continue email = run_sql("SELECT email FROM user WHERE id=%s", (expr, )) if email: authorized_emails.add(email[0][0].lower().strip()) return authorized_emails except Exception, msg: raise InvenioWebAccessFireroleError, msg def acc_firerole_check_user(user_info, firerole_def_obj): """ Given a user_info dictionary, it matches the rules inside the deserializez compiled definition in order to discover if the current user match the roles corresponding to this definition. @param user_info: a dict produced by collect_user_info which contains every info about a user @param firerole_def_obj: a compiled deserialized definition produced by compile_role_defintion @return: True if the user match the definition, False otherwise. """ try: default_allow_p, rules = firerole_def_obj for (allow_p, not_p, field, expressions_list) in rules: # for every rule group_p = field == 'group' # Is it related to group? ip_p = field == 'remote_ip' # Is it related to Ips? until_p = field == 'until' # Is it related to dates? from_p = field == 'from' # Idem. next_expr_p = False # Silly flag to break 2 for cycles if not user_info.has_key(field) and not from_p and not until_p: continue for reg_p, expr in expressions_list: # For every element in the rule if group_p: # Special case: groups if reg_p: # When it is a regexp for group in user_info[field]: # iterate over every group if expr.match(group): # if it matches if not_p: # if must not match next_expr_p = True # let's skip to next expr break else: # Ok! return allow_p if next_expr_p: break # I said: let's skip to next rule ;-) elif expr.lower() in [group.lower() for group in user_info[field]]: # Simple expression then just check for expr in groups if not_p: # If expr is in groups then if must not match break # let's skip to next expr else: # Ok! return allow_p elif reg_p: # Not a group, then easier. If it's a regexp if expr.match(user_info[field]): # if it matches if not_p: # If must not match break # Let's skip to next expr else: return allow_p # Ok! elif ip_p and type(expr) == type(()): # If it's just a simple expression but an IP! if _ipmatch(user_info['remote_ip'], expr): # Then if Ip matches if not_p: # If must not match break # let's skip to next expr else: return allow_p # ok! 
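# Date-window rules: 'until' compares against a date the rule is valid up to, 'from' against a date
# it is valid from; when the current time falls outside an allow rule's window (or inside a deny
# rule's window) the whole check fails, otherwise evaluation continues with the next rule.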
elif until_p: if time.time() <= expr: if allow_p: break else: return False elif allow_p: return False else: break elif from_p: if time.time() >= expr: if allow_p: break else: return False elif allow_p: return False else: break elif expr.lower() == str(user_info[field]).lower(): # Finally the easiest one!! if not_p: # ... break else: # ... return allow_p # ... if not_p and not next_expr_p: # Nothing has matched and we got not return allow_p # Then the whole rule matched! except Exception, msg: register_exception(alert_admin=True) raise InvenioWebAccessFireroleError, msg return default_allow_p # By default we allow ;-) it'an OpenAccess project def serialize(firerole_def_obj): """ Serialize and compress a definition.""" if firerole_def_obj == CFG_ACC_EMPTY_ROLE_DEFINITION_OBJ: return CFG_ACC_EMPTY_ROLE_DEFINITION_SER elif firerole_def_obj: return compress(cPickle.dumps(firerole_def_obj, -1)) else: return CFG_ACC_EMPTY_ROLE_DEFINITION_SER def deserialize(firerole_def_ser): """ Deserialize and decompress a definition.""" if firerole_def_ser: return cPickle.loads(decompress(blob_to_string(firerole_def_ser))) else: return CFG_ACC_EMPTY_ROLE_DEFINITION_OBJ # IMPLEMENTATION # Comment finder _no_comment_re = re.compile(r'[\s]*(?allow|deny)[\s]+(?:(?Pnot)[\s]+)?(?P[\w]+)[\s]+(?P(?([\s]*,[\s]*((?allow|deny)[\s]+(any|all)[\s]*', re.I) # Sub expression finder _expressions_re = re.compile(r'(? group member ? query_group_baskets = """ SELECT share_level FROM user_usergroup AS ug LEFT JOIN usergroup_bskBASKET AS ub ON ug.id_usergroup=ub.id_usergroup WHERE ug.id_user=%s AND ub.id_bskBASKET=%s AND NOT(ub.share_level='NO') AND ug.user_status!=%s """ params_group_baskets = (int(uid), int(bskid), CFG_WEBSESSION_USERGROUP_STATUS['PENDING']) res = run_sql(query_group_baskets, params_group_baskets) group_index = None if res: try: group_index = CFG_WEBBASKET_SHARE_LEVELS_ORDERED.index(res[0][0]) except: return None # public basket ? 
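# (id_usergroup=0 is the conventional "shared with everybody" group, so a usergroup_bskBASKET row
# with id_usergroup=0 marks the basket as public)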
query_public_baskets = """ SELECT share_level FROM usergroup_bskBASKET WHERE id_usergroup=0 AND id_bskBASKET=%s """ public_index = None res = run_sql(query_public_baskets, (int(bskid),)) if res: try: public_index = CFG_WEBBASKET_SHARE_LEVELS_ORDERED.index(res[0][0]) except: return None if group_index or public_index: if group_index > public_index: return CFG_WEBBASKET_SHARE_LEVELS_ORDERED[group_index] else: return CFG_WEBBASKET_SHARE_LEVELS_ORDERED[public_index] return None ########################### Personal baskets ################################## def get_personal_baskets_info_for_topic(uid, topic): """Return information about every basket that belongs to the given user and topic.""" query = """ SELECT bsk.id, bsk.name, DATE_FORMAT(bsk.date_modification, '%%Y-%%m-%%d %%H:%%i:%%s'), bsk.nb_views, count(rec.id_bibrec_or_bskEXTREC), DATE_FORMAT(max(rec.date_added), '%%Y-%%m-%%d %%H:%%i:%%s') FROM user_bskBASKET AS ubsk JOIN bskBASKET AS bsk ON bsk.id=ubsk.id_bskBASKET AND bsk.id_owner=%s LEFT JOIN bskREC AS rec ON rec.id_bskBASKET=bsk.id WHERE ubsk.id_user=%s AND ubsk.topic=%s GROUP BY bsk.id ORDER BY bsk.name""" params = (uid, uid, topic) res = run_sql(query, params) return res def get_all_user_personal_basket_ids_by_topic(uid): """For a given user return all their personal basket ids grouped by topic.""" query = """ SELECT ubsk.topic, GROUP_CONCAT(bsk.id) FROM user_bskBASKET AS ubsk JOIN bskBASKET AS bsk ON ubsk.id_bskBASKET=bsk.id AND ubsk.id_user=bsk.id_owner WHERE bsk.id_owner=%s GROUP BY ubsk.topic ORDER BY ubsk.topic""" params = (uid,) res = run_sql(query, params) return res def get_all_personal_baskets_names(uid): """ for a given user, returns every basket he is owner of returns list of tuples: (bskid, bsk_name, topic) """ query = """ SELECT bsk.id, bsk.name, ubsk.topic FROM user_bskBASKET ubsk JOIN bskBASKET bsk ON ubsk.id_bskBASKET=bsk.id AND ubsk.id_user=bsk.id_owner WHERE bsk.id_owner=%s ORDER BY ubsk.topic """ params = (int(uid),) return run_sql(query, params) def get_basket_name(bskid): """return the name of a given basket""" query = 'SELECT name FROM bskBASKET where id=%s' res = run_sql(query, (int(bskid), )) if res: return res[0][0] else: return '' def is_personal_basket_valid(uid, bskid): """Check if the basked (bskid) belongs to user (uid) and is valid.""" query = """ SELECT id FROM bskBASKET WHERE id=%s AND id_owner=%s""" params = (bskid, uid) res = run_sql(query, params) return res def is_topic_valid(uid, topic): """Check if the topic defined by user (uid) exists.""" query = """ SELECT distinct(topic) FROM user_bskBASKET WHERE topic=%s AND id_user=%s""" params = (topic, uid) res = run_sql(query, params) return res def get_basket_topic(uid, bskid): """Return the name of the topic this basket (bskid) belongs to.""" query = """ SELECT topic FROM user_bskBASKET WHERE id_bskBASKET=%s AND id_user=%s""" params = (bskid,uid) res = run_sql(query, params) return res def get_personal_topics_infos(uid): """ Get the list of every topic user has defined, and the number of baskets in each topic @param uid: user id (int) @return: a list of tuples (topic name, nb of baskets) """ query = """SELECT topic, count(b.id) FROM user_bskBASKET ub JOIN bskBASKET b ON ub.id_bskBASKET=b.id AND b.id_owner=ub.id_user WHERE ub.id_user=%s GROUP BY topic ORDER BY topic""" uid = int(uid) res = run_sql(query, (uid,)) return res def get_basket_ids_and_names(bskids, limit=0): """For the given basket ids, return their ids and names, ordered by basket name. 
If 'limit' is greater than 0, limit the number of results returned.""" if not((type(bskids) is list) or (type(bskids) is tuple)): bskids = [bskids] query = """ SELECT bsk.id, bsk.name FROM bskBASKET AS bsk WHERE %s ORDER BY bsk.name %s""" sep = ' OR ' query %= (sep.join(['id=%s'] * len(bskids)), limit and 'LIMIT %i' % limit or '') params = tuple(bskids) res = run_sql(query, params) return res def rename_basket(bskid, new_name): """Rename basket to new_name""" run_sql("UPDATE bskBASKET SET name=%s WHERE id=%s", (new_name, bskid)) def rename_topic(uid, old_topic, new_topic): """Rename topic to new_topic """ res = run_sql("UPDATE user_bskBASKET SET topic=%s WHERE id_user=%s AND topic=%s", (new_topic, uid, old_topic)) return res def move_baskets_to_topic(uid, bskids, new_topic): """Move given baskets to another topic""" if not((type(bskids) is list) or (type(bskids) is tuple)): bskids = [bskids] query = "UPDATE user_bskBASKET SET topic=%s WHERE id_user=%s AND (" query += ' OR '.join(['id_bskBASKET=%s'] * len(bskids)) query += ")" params = (new_topic, uid) + tuple(bskids) res = run_sql(query, params) return res def delete_basket(bskid): """Delete given basket.""" # TODO: check if any alerts are automaticly adding items to the given basket. bskid = int(bskid) query1 = "DELETE FROM bskBASKET WHERE id=%s" res = run_sql(query1, (bskid,)) query2A = "SELECT id_bibrec_or_bskEXTREC FROM bskREC WHERE id_bskBASKET=%s" local_and_external_ids = run_sql(query2A, (bskid,)) external_ids = [local_and_external_id[0] for local_and_external_id in \ local_and_external_ids if local_and_external_id[0]<0] for external_id in external_ids: delete_item(bskid=bskid, recid=external_id, update_date_modification=False) query2B = "DELETE FROM bskREC WHERE id_bskBASKET=%s" run_sql(query2B, (bskid,)) query3 = "DELETE FROM bskRECORDCOMMENT WHERE id_bskBASKET=%s" run_sql(query3, (bskid,)) query4 = "DELETE FROM user_bskBASKET WHERE id_bskBASKET=%s" run_sql(query4, (bskid,)) query5 = "DELETE FROM usergroup_bskBASKET WHERE id_bskBASKET=%s" run_sql(query5, (bskid,)) query6 = "DELETE FROM user_query_basket WHERE id_basket=%s" run_sql(query6, (bskid,)) return int(res) def create_basket(uid, basket_name, topic): """Create new basket for given user in given topic""" now = convert_datestruct_to_datetext(localtime()) id_bsk = run_sql("""INSERT INTO bskBASKET (id_owner, name, date_modification) VALUES (%s, %s, %s)""", (uid, basket_name, now)) run_sql("""INSERT INTO user_bskBASKET (id_user, id_bskBASKET, topic) VALUES (%s, %s, %s)""", (uid, id_bsk, topic)) return id_bsk def get_all_items_in_user_personal_baskets(uid, topic="", format='hb'): """For the specified user, return all the items in their personal baskets, grouped by basket if local or as a list if external. 
If topic is set, return only that topic's items.""" if topic: topic_clause = """AND ubsk.topic=%s""" params_local = (uid, uid, topic) params_external = (uid, uid, topic, format) else: topic_clause = "" params_local = (uid, uid) params_external = (uid, uid, format) query_local = """ SELECT rec.id_bskBASKET, bsk.name, ubsk.topic, GROUP_CONCAT(rec.id_bibrec_or_bskEXTREC) FROM bskREC AS rec JOIN bskBASKET AS bsk ON bsk.id=rec.id_bskBASKET AND bsk.id_owner=%%s JOIN user_bskBASKET AS ubsk ON ubsk.id_bskBASKET=rec.id_bskBASKET AND ubsk.id_user=%%s %s WHERE rec.id_bibrec_or_bskEXTREC > 0 GROUP BY rec.id_bskBASKET""" % (topic_clause,) res_local = run_sql(query_local, params_local) query_external = """ SELECT rec.id_bskBASKET, bsk.name, ubsk.topic, rec.id_bibrec_or_bskEXTREC, ext.value FROM bskREC AS rec JOIN bskBASKET AS bsk ON bsk.id=rec.id_bskBASKET AND bsk.id_owner=%%s JOIN user_bskBASKET AS ubsk ON ubsk.id_bskBASKET=rec.id_bskBASKET AND ubsk.id_user=%%s %s JOIN bskEXTFMT AS ext ON ext.id_bskEXTREC=-rec.id_bibrec_or_bskEXTREC AND ext.format=%%s WHERE rec.id_bibrec_or_bskEXTREC < 0 ORDER BY rec.id_bskBASKET""" % (topic_clause,) res_external = run_sql(query_external, params_external) return (res_local, res_external) def get_all_items_in_user_personal_baskets_by_matching_notes(uid, topic="", p=""): """For the specified user, return all the items in their personal baskets matching their notes' titles and bodies, grouped by basket. If topic is set, return only that topic's items.""" p = p and '%' + p + '%' or '%' if topic: topic_clause = """AND ubsk.topic=%s""" params = (uid, uid, topic, p, p) else: topic_clause = "" params = (uid, uid, p, p) query = """ SELECT notes.id_bskBASKET, bsk.name, ubsk.topic, GROUP_CONCAT(DISTINCT(notes.id_bibrec_or_bskEXTREC)) FROM bskRECORDCOMMENT AS notes JOIN bskBASKET AS bsk ON bsk.id=notes.id_bskBASKET AND bsk.id_owner=%%s JOIN user_bskBASKET AS ubsk ON ubsk.id_bskBASKET=notes.id_bskBASKET AND ubsk.id_user=%%s %s WHERE notes.title like %%s OR notes.body like %%s GROUP BY notes.id_bskBASKET""" % (topic_clause,) res = run_sql(query, params) return res def get_all_user_topics(uid): """Return a list of the user's topics.""" query = """ SELECT ubsk.topic FROM bskBASKET AS bsk JOIN user_bskBASKET AS ubsk ON ubsk.id_bskBASKET=bsk.id AND ubsk.id_user=bsk.id_owner WHERE bsk.id_owner=%s GROUP BY ubsk.topic""" params = (uid,) res = run_sql(query, params) return res ########################## Actions on baskets ################################# def get_basket_record(bskid, recid, format='hb'): """get record recid in basket bskid """ if recid < 0: rec_table = 'bskEXTREC' format_table = 'bskEXTFMT' id_field = 'id_bskEXTREC' sign = '-' else: rec_table = 'bibrec' format_table = 'bibfmt' id_field = 'id_bibrec' sign = '' query = """ SELECT DATE_FORMAT(record.creation_date, '%%%%Y-%%%%m-%%%%d %%%%H:%%%%i:%%%%s'), DATE_FORMAT(record.modification_date, '%%%%Y-%%%%m-%%%%d %%%%H:%%%%i:%%%%s'), DATE_FORMAT(bskREC.date_added, '%%%%Y-%%%%m-%%%%d %%%%H:%%%%i:%%%%s'), user.nickname, count(cmt.id_bibrec_or_bskEXTREC), DATE_FORMAT(max(cmt.date_creation), '%%%%Y-%%%%m-%%%%d %%%%H:%%%%i:%%%%s'), fmt.value FROM bskREC LEFT JOIN user ON bskREC.id_user_who_added_item=user.id LEFT JOIN bskRECORDCOMMENT cmt ON bskREC.id_bibrec_or_bskEXTREC=cmt.id_bibrec_or_bskEXTREC LEFT JOIN %(rec_table)s record ON (%(sign)sbskREC.id_bibrec_or_bskEXTREC=record.id) LEFT JOIN %(format_table)s fmt ON (record.id=fmt.%(id_field)s) WHERE bskREC.id_bskBASKET=%%s AND bskREC.id_bibrec_or_bskEXTREC=%%s AND fmt.format=%%s GROUP BY 
bskREC.id_bibrec_or_bskEXTREC """ % {'rec_table': rec_table, 'sign': sign, 'format_table': format_table, 'id_field':id_field} params = (int(bskid), int(recid), format) res = run_sql(query, params) if res: return __decompress_last(res[0]) return () def get_basket_content(bskid, format='hb'): """Get all records for a given basket.""" query = """ SELECT rec.id_bibrec_or_bskEXTREC, extrec.collection_id, count(cmt.id_bibrec_or_bskEXTREC), DATE_FORMAT(max(cmt.date_creation), '%%Y-%%m-%%d %%H:%%i:%%s'), extern.value as ext_val, intern.value as int_val, rec.score FROM bskREC AS rec LEFT JOIN bskRECORDCOMMENT AS cmt ON (rec.id_bibrec_or_bskEXTREC=cmt.id_bibrec_or_bskEXTREC AND rec.id_bskBASKET=cmt.id_bskBASKET) LEFT JOIN bskEXTFMT AS extern ON (-rec.id_bibrec_or_bskEXTREC=extern.id_bskEXTREC AND extern.format=%s) LEFT JOIN bibfmt AS intern ON (rec.id_bibrec_or_bskEXTREC=intern.id_bibrec AND intern.format=%s) LEFT JOIN bskEXTREC AS extrec ON extrec.id=-rec.id_bibrec_or_bskEXTREC WHERE rec.id_bskBASKET=%s GROUP BY rec.id_bibrec_or_bskEXTREC ORDER BY rec.score""" params = (format, format, int(bskid)) res = run_sql(query, params) if res: query2 = "UPDATE bskBASKET SET nb_views=nb_views+1 WHERE id=%s" run_sql(query2, (int(bskid),)) return res return () def get_basket_item(bskid, recid, format='hb'): """Get item (recid) for a given basket.""" query = """ SELECT rec.id_bibrec_or_bskEXTREC, extrec.collection_id, count(cmt.id_bibrec_or_bskEXTREC), DATE_FORMAT(max(cmt.date_creation), '%%Y-%%m-%%d %%H:%%i:%%s'), extern.value as ext_val, intern.value as int_val, rec.score FROM bskREC rec LEFT JOIN bskRECORDCOMMENT cmt ON (rec.id_bibrec_or_bskEXTREC=cmt.id_bibrec_or_bskEXTREC AND rec.id_bskBASKET=cmt.id_bskBASKET) LEFT JOIN bskEXTFMT extern ON (-rec.id_bibrec_or_bskEXTREC=extern.id_bskEXTREC AND extern.format=%s) LEFT JOIN bibfmt intern ON (rec.id_bibrec_or_bskEXTREC=intern.id_bibrec AND intern.format=%s) LEFT JOIN bskEXTREC AS extrec ON extrec.id=-rec.id_bibrec_or_bskEXTREC WHERE rec.id_bskBASKET=%s AND rec.id_bibrec_or_bskEXTREC=%s GROUP BY rec.id_bibrec_or_bskEXTREC ORDER BY rec.score""" params = (format, format, bskid, recid) res = run_sql(query, params) if res: queryU = """UPDATE bskBASKET SET nb_views=nb_views+1 WHERE id=%s""" paramsU = (bskid,) run_sql(queryU, paramsU) score = res[0][6] query_previous = """SELECT id_bibrec_or_bskEXTREC FROM bskREC WHERE id_bskBASKET=%s AND score<%s ORDER BY score DESC LIMIT 1""" params_previous = (bskid, score) res_previous = run_sql(query_previous, params_previous) query_next = """SELECT id_bibrec_or_bskEXTREC FROM bskREC WHERE id_bskBASKET=%s AND score>%s ORDER BY score ASC LIMIT 1""" params_next = (bskid, score) res_next = run_sql(query_next, params_next) query_index = """ SELECT COUNT(id_bibrec_or_bskEXTREC) FROM bskREC WHERE id_bskBASKET=%s AND score<=%s ORDER BY score""" params_index = (bskid, score) res_index = run_sql(query_index, params_index) res_index = __wash_sql_count(res_index) return (res[0], res_previous and res_previous[0][0] or 0, res_next and res_next[0][0] or 0, res_index) else: return () def get_basket_item_title_and_URL(recid): """ Retrieves the title and URL for the specified item in the specified basket. @param bskid: The basked id @type bskid: int @param recid: The record (item) id @type recid: int @return: A tuple containing the title as a sting and the URL as a string. """ if recid > 0: # This is a local record, we can easily retrieve the title using the # search engine's get_fieldvalues function and the MARC field and tag. 
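# MARC tag 245 holds the title statement; get_fieldvalues() returns a (possibly empty) list of
# values for that tag, so the first element, when present, is taken as the record title.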
title_list = get_fieldvalues(recid, '245___') # Check if the main title is always the first element in the list if title_list: title = title_list[0] else: title = "" url = '%s/record/%i' % (CFG_SITE_URL, recid) elif recid < 0: # This is an external record or item, use title = "This is an external record or item." url = '%s' % (CFG_SITE_URL,) query = """ SELECT rec.collection_id, rec.original_url, fmt.value FROM bskEXTREC as rec, bskEXTFMT as fmt WHERE rec.id=%s AND fmt.id_bskEXTREC=%s AND fmt.format='hb'""" params = (-recid, -recid) result = run_sql(query, params) if result: item = __decompress_last(result[0]) collection = item[0] url = item[1] hb = item[2] if collection == 0: # This is an external item title = hb.split('\n',1)[0] elif collection > 0: # This is an external record from a hosted collection title = hb.split('',1)[0].split('')[-1] return (title, url) def share_basket_with_group(bskid, group_id, share_level=CFG_WEBBASKET_SHARE_LEVELS['READITM']): """ Share basket bskid with group group_id with given share_level @param share_level: see CFG_WEBBASKET_SHARE_LEVELS in webbasket_config """ now = convert_datestruct_to_datetext(localtime()) run_sql("""REPLACE INTO usergroup_bskBASKET (id_usergroup, id_bskBASKET, date_shared, share_level) VALUES (%s,%s,%s,%s)""", (group_id, bskid, now, str(share_level))) def update_rights(bskid, group_rights): """update rights (permissions) for groups. @param bskid: basket id @param group_rights: dictionary of {group id: new rights} """ now = convert_datestruct_to_datetext(localtime()) query1 = """REPLACE INTO usergroup_bskBASKET (id_usergroup, id_bskBASKET, date_shared, share_level) VALUES """ + \ ', '.join(["(%s, %s, %s, %s)"] * len(group_rights.items())) params = () for (group_id, share_level) in group_rights.items(): params += (int(group_id), int(bskid), now, str(share_level)) run_sql(query1, params) query2 = """DELETE FROM usergroup_bskBASKET WHERE share_level='NO'""" run_sql(query2) def move_item(bskid, recid, direction): """Change score of an item in a basket""" bskid = int(bskid) query1 = """SELECT id_bibrec_or_bskEXTREC, score FROM bskREC WHERE id_bskBASKET=%s ORDER BY score, date_added""" items = run_sql(query1, (bskid,)) (recids, scores) = zip(*items) (recids, scores) = (list(recids), list(scores)) if len(recids) and recid in recids: current_index = recids.index(recid) if direction == CFG_WEBBASKET_ACTIONS['UP']: switch_index = 0 if current_index != 0: switch_index = current_index -1 else: switch_index = len(recids) - 1 if current_index != len(recids)-1: switch_index = current_index + 1 query2 = """UPDATE bskREC SET score=%s WHERE id_bskBASKET=%s AND id_bibrec_or_bskEXTREC=%s""" res1 = run_sql(query2, (scores[switch_index], bskid, recids[current_index])) res2 = run_sql(query2, (scores[current_index], bskid, recids[switch_index])) if res1 and res2: now = convert_datestruct_to_datetext(localtime()) query3 = "UPDATE bskBASKET SET date_modification=%s WHERE id=%s" params3 = (now, int(bskid)) run_sql(query3, params3) def delete_item(bskid, recid, update_date_modification=True): """Remove item recid from basket bskid""" if recid < 0: query0A = "select count(id_bskBASKET) from bskREC where id_bibrec_or_bskEXTREC=%s" % (int(recid)) ncopies = run_sql(query0A) if ncopies and ncopies[0][0]<=1: # uncomment the following 5 lines and comment the following 2 to delete cached records # only for external sources and not for external records #query0B = "SELECT collection_id FROM bskEXTREC WHERE id=%s" % (-int(recid)) #colid = run_sql(query0B) #if colid and 
colid[0][0]==0: #query0C = "DELETE from bskEXTFMT WHERE id_bskEXTREC=%s" % (-int(recid)) #run_sql(query0C) # the following two lines delete cached external records. We could keep them if we find # a way to reuse them in case the external records are added again in the future. query0D = "DELETE from bskEXTFMT WHERE id_bskEXTREC=%s" % (-int(recid)) run_sql(query0D) query0E = "DELETE from bskEXTREC WHERE id=%s" % (-int(recid)) run_sql(query0E) query_notes = "DELETE FROM bskRECORDCOMMENT WHERE id_bskBASKET=%s AND id_bibrec_or_bskEXTREC=%s" run_sql(query_notes, (bskid, recid,)) query1 = "DELETE from bskREC WHERE id_bskBASKET=%s AND id_bibrec_or_bskEXTREC=%s" params1 = (int(bskid), int(recid)) res = run_sql(query1, params1) if update_date_modification and res: now = convert_datestruct_to_datetext(localtime()) query2 = "UPDATE bskBASKET SET date_modification=%s WHERE id=%s" params2 = (now, int(bskid)) run_sql(query2, params2) return res def add_to_basket(uid, recids=[], colid=0, bskid=0, es_title="", es_desc="", es_url=""): """Add items (recids) basket (bskid).""" if (recids or (colid == -1 and es_title and es_desc and es_url)) and bskid > 0: query_max_score = """ SELECT MAX(score) FROM bskREC WHERE id_bskBASKET=%s""" params_max_score = (bskid,) res_max_score = run_sql(query_max_score, params_max_score) max_score = __wash_sql_count(res_max_score) if not max_score: # max_score == None actually means that the basket doesn't exist. # Maybe we should return 0 and inform the admin? max_score = 1 if colid > 0: query_existing = """ SELECT id, external_id FROM bskEXTREC WHERE %s AND collection_id=%s""" sep_or = ' OR ' query_existing %= (sep_or.join(['external_id=%s'] * len(recids)), colid) params_existing = tuple(recids) res_existing = run_sql(query_existing, params_existing) existing_recids = [int(external_ids_couple[1]) for external_ids_couple in res_existing] existing_ids = [int(ids[0]) for ids in res_existing] new_recids = [recid for recid in recids if int(recid) not in existing_recids] # sets approach #existing_recids = [ids[1] for ids in res_existing] #new_recids = list(set(recids)-set(existing_recids)) if new_recids: query_new = """ INSERT INTO bskEXTREC (external_id, collection_id, creation_date, modification_date) VALUES """ now = convert_datestruct_to_datetext(localtime()) records = ["(%s, %s, %s, %s)"] * len(new_recids) query_new += ', '.join(records) params_new = () for new_recid in new_recids: params_new += (int(new_recid), colid, now, now) res_new = run_sql(query_new, params_new) recids = [-int(recid) for recid in existing_ids] recids.extend(range(-res_new,-(res_new+len(new_recids)),-1)) else: recids = [-int(recid) for recid in existing_ids] elif colid < 0: query_external = """INSERT INTO bskEXTREC (collection_id, original_url, creation_date, modification_date) VALUES (%s, %s, %s, %s)""" now = convert_datestruct_to_datetext(localtime()) params_external = (colid, es_url, now, now) res_external = run_sql(query_external, params_external) recids = [-res_external] store_external_source(res_external, es_title, es_desc, es_url, 'xm') store_external_source(res_external, es_title, es_desc, es_url, 'hb') query_insert = """ INSERT IGNORE INTO bskREC (id_bibrec_or_bskEXTREC, id_bskBASKET, id_user_who_added_item, date_added, score) VALUES """ if colid == 0 or (colid > 0 and not new_recids): now = convert_datestruct_to_datetext(localtime()) records = ["(%s, %s, %s, %s, %s)"] * len(recids) query_insert += ', '.join(records) params_insert = () i = 1 for recid in recids: params_insert += (recid, bskid, uid, 
now, max_score + i) i += 1 run_sql(query_insert, params_insert) query_update = """ UPDATE bskBASKET SET date_modification=%s WHERE id=%s""" params_update = (now, bskid) run_sql(query_update, params_update) return recids return 0 def move_to_basket(uid, recids=None, old_bskid=0, new_bskid=0, update_date_modification=True): """ Move items (recids) from basket (old_bskid) to basket (new_bskid) """ if (recids is not None) and len(recids) > 0: moved_recids = [] for recid in recids: # Prevent duplication of items query = """ SELECT '1' FROM bskREC WHERE id_bskBASKET=%s AND id_bibrec_or_bskEXTREC=%s """ params = (int(new_bskid), int(recid)) res = run_sql(query, params) if len(res) == 0: # Change the item's pointer to basket query = """ UPDATE bskREC SET id_bskBASKET=%s, id_user_who_added_item=%s WHERE id_bskBASKET=%s AND id_bibrec_or_bskEXTREC=%s """ params = (int(new_bskid), int(uid), int(old_bskid), int(recid)) res = run_sql(query, params) moved_recids.append(int(recid)) # Update 'modification date' if len(moved_recids) > 0 and update_date_modification: now = convert_datestruct_to_datetext(localtime()) query = "UPDATE bskBASKET SET date_modification=%s WHERE id=%s" params = (now, int(old_bskid)) run_sql(query, params) params = (now, int(new_bskid)) run_sql(query, params) return moved_recids def add_to_many_baskets(uid, recids=[], colid=0, bskids=[], es_title="", es_desc="", es_url=""): """Add items recids to every basket in bskids list.""" if (len(recids) or colid == -1) and len(bskids): query1 = """SELECT id_bskBASKET, max(score) FROM bskREC WHERE %s GROUP BY id_bskBASKET""" bskids = [bskid for bskid in bskids if int(bskid) >= 0] sep_or = ' OR ' query1 %= sep_or.join(['id_bskBASKET=%s'] * len(bskids)) bsks = dict.fromkeys(bskids, 0) params = tuple(bskids) bsks.update(dict(run_sql(query1, params))) if colid > 0: query2A = """SELECT id, external_id FROM bskEXTREC WHERE %s AND collection_id=%s""" query2A %= (sep_or.join(['external_id=%s'] * len(recids)), colid) params2A = tuple(recids) res2A = run_sql(query2A, params2A) existing_recids = [int(external_ids_couple[1]) for external_ids_couple in res2A] existing_ids = [int(ids[0]) for ids in res2A] new_recids = [recid for recid in recids if int(recid) not in existing_recids] # sets approach #existing_recids = [ids[1] for ids in res2A] #new_recids = list(set(recids)-set(existing_recids)) if new_recids: query2B = """INSERT INTO bskEXTREC (external_id, collection_id, creation_date, modification_date) VALUES """ now = convert_datestruct_to_datetext(localtime()) records = ["(%s, %s, %s, %s)"] * len(new_recids) query2B += ', '.join(records) params2B = () for new_recid in new_recids: params2B += (int(new_recid), colid, now, now) res = run_sql(query2B, params2B) recids = [-int(recid) for recid in existing_ids] recids.extend(range(-res,-(res+len(new_recids)),-1)) else: recids = [-int(recid) for recid in existing_ids] elif colid < 0: query2C = """INSERT INTO bskEXTREC (collection_id, original_url, creation_date, modification_date) VALUES (%s, %s, %s, %s)""" now = convert_datestruct_to_datetext(localtime()) params = (colid, es_url, now, now) res = run_sql(query2C, params) recids = [-res] store_external_source(res, es_title, es_desc, es_url, 'xm') store_external_source(res, es_title, es_desc, es_url, 'hb') query2 = """INSERT IGNORE INTO bskREC (id_bibrec_or_bskEXTREC, id_bskBASKET, id_user_who_added_item, date_added, score) VALUES """ if colid == 0 or (colid > 0 and not new_recids): now = convert_datestruct_to_datetext(localtime()) records = ["(%s, %s, %s, %s, 
%s)"] * (len(recids) * len(bsks.items())) query2 += ', '.join(records) params = () for (bskid, max_score) in bsks.items(): i = 1 for recid in recids: params += (int(recid), int(bskid), int(uid), now, int(max_score) + i) i += 1 run_sql(query2, params) query3 = """UPDATE bskBASKET SET date_modification=%s WHERE """ query3 += sep_or.join(["id=%s"] * len(bskids)) params = (now,) + tuple(bskids) run_sql(query3, params) return len(bskids) return 0 def get_external_records_by_collection(recids): """Get the selected recids, both local and external, grouped by collection.""" if recids: query = """ SELECT GROUP_CONCAT(id), GROUP_CONCAT(external_id), collection_id FROM bskEXTREC WHERE %s GROUP BY collection_id""" recids = [-recid for recid in recids] sep_or = ' OR ' query %= sep_or.join(['id=%s'] * len(recids)) params = tuple(recids) res = run_sql(query,params) return res return 0 def get_external_records(recids, of="hb"): """Get formatted external records from the database.""" if recids: query = """ SELECT rec.collection_id, fmt.id_bskEXTREC, fmt.value FROM bskEXTFMT AS fmt JOIN bskEXTREC AS rec ON rec.id=fmt.id_bskEXTREC WHERE format=%%s AND ( %s )""" recids = [-recid for recid in recids] sep_or = ' OR ' query %= sep_or.join(['id_bskEXTREC=%s'] * len(recids)) params = [of] params.extend(recids) params = tuple(params) res = run_sql(query,params) return res return () def store_external_records(records, of="hb"): """Store formatted external records to the database.""" if records: query = """INSERT INTO bskEXTFMT (id_bskEXTREC, format, last_updated, value) VALUES """ now = convert_datestruct_to_datetext(localtime()) formatted_records = ["(%s, %s, %s, %s)"] * len(records) query += ', '.join(formatted_records) params = () for record in records: params += (record[0], of, now, compress(record[1])) run_sql(query,params) def store_external_urls(ids_urls): """Store original urls for external records to the database.""" #for id_url in ids_urls.iteritems(): for id_url in ids_urls: query = """UPDATE bskEXTREC SET original_url=%s WHERE id=%s""" params = (id_url[1], id_url[0]) run_sql(query,params) def store_external_source(es_id, es_title, es_desc, es_url, of="hb"): """Store formatted external sources to the database.""" if es_id and es_title and es_desc: query = """INSERT INTO bskEXTFMT (id_bskEXTREC, format, last_updated, value) VALUES (%s, %s, %s, %s)""" now = convert_datestruct_to_datetext(localtime()) value = create_pseudo_record(es_title, es_desc, es_url, of) params = (es_id, of, now, compress(value)) run_sql(query,params) def get_external_colid_and_url(recid): """Get the collection id and original url for an external record.""" if recid: query = """SELECT collection_id, original_url FROM bskEXTREC WHERE id=%s""" params = (-recid,) res = run_sql(query,params) if res: return res else: return 0 ############################ Group baskets #################################### def get_group_baskets_info_for_group(grpid): """Return information about every basket that belongs to the given group, provided the user is its manager or a member of it.""" if not grpid: return () query = """ SELECT bsk.id, bsk.name, DATE_FORMAT(bsk.date_modification, '%%Y-%%m-%%d %%H:%%i:%%s'), bsk.nb_views, COUNT(rec.id_bibrec_or_bskEXTREC), DATE_FORMAT(max(rec.date_added), '%%Y-%%m-%%d %%H:%%i:%%s'), ugbsk.share_level, bsk.id_owner FROM usergroup_bskBASKET AS ugbsk JOIN bskBASKET AS bsk ON bsk.id=ugbsk.id_bskBASKET LEFT JOIN bskREC AS rec ON rec.id_bskBASKET=bsk.id WHERE ugbsk.id_usergroup=%s AND ugbsk.share_level!='NO' GROUP BY bsk.id 
ORDER BY bsk.name""" params = (grpid,) res = run_sql(query, params) return res def get_group_name(gid): """Given its id return the group's name.""" query = """ SELECT name FROM usergroup WHERE id=%s""" params = (gid,) res = run_sql(query, params) return res def get_all_user_group_basket_ids_by_group(uid): """For a given user return all their group basket ids grouped by group.""" query = """ SELECT ug.id, ug.name, GROUP_CONCAT(ugbsk.id_bskBASKET) FROM usergroup AS ug JOIN usergroup_bskBASKET AS ugbsk ON ugbsk.id_usergroup=ug.id JOIN bskBASKET AS bsk ON ugbsk.id_bskBASKET=bsk.id JOIN user_usergroup AS uug ON ug.id=uug.id_usergroup AND uug.id_user=%s GROUP BY ug.name ORDER BY ug.name""" params = (uid,) res = run_sql(query, params) return res def get_all_user_group_basket_ids_by_group_with_add_rights(uid): """For a given user return all their group basket ids grouped by group. Return only the basket ids to which it is allowed to add records.""" query = """ SELECT ug.name, GROUP_CONCAT(ugbsk.id_bskBASKET) FROM usergroup AS ug JOIN usergroup_bskBASKET AS ugbsk ON ugbsk.id_usergroup=ug.id AND ugbsk.share_level!='NO' AND ugbsk.share_level!='RI' AND ugbsk.share_level!='RC' AND ugbsk.share_level!='AC' JOIN bskBASKET AS bsk ON ugbsk.id_bskBASKET=bsk.id JOIN user_usergroup AS uug ON ug.id=uug.id_usergroup AND uug.id_user=%s GROUP BY ug.name ORDER BY ug.name""" params = (uid,) res = run_sql(query, params) return res def get_all_group_baskets_names(uid, min_rights=CFG_WEBBASKET_SHARE_LEVELS['ADDCMT']): """For a given user returns every group baskets in which he can return a list of tuples: (bskid, bsk_name, group_name).""" # TODO: This function is no longer used. Delete if necessary. uid = int(uid) try: min_rights_num = CFG_WEBBASKET_SHARE_LEVELS_ORDERED.index(min_rights) except ValueError: return () groups = get_groups_user_member_of(uid) if groups: where_clause = '(' where_clause += " OR ".join(["ugbsk.id_usergroup=%s"] * len(groups)) where_clause += ') AND (' where_clause += " OR ".join(["ugbsk.share_level=%s"] * len(CFG_WEBBASKET_SHARE_LEVELS_ORDERED[min_rights_num:])) where_clause += ")" query = """ SELECT bsk.id, bsk.name, ug.name FROM usergroup ug JOIN usergroup_bskBASKET ugbsk ON ug.id=ugbsk.id_usergroup JOIN bskBASKET bsk ON bsk.id=ugbsk.id_bskBASKET WHERE %s AND NOT(ugbsk.share_level='NO') ORDER BY ug.name""" % where_clause params = tuple([group_id for (group_id, dummy) in groups]) params += tuple(CFG_WEBBASKET_SHARE_LEVELS_ORDERED[min_rights_num:]) return run_sql(query, params) return () def is_shared_to(bskids): """For each bskid in bskids get id of one of its group. Used to make distinction between private basket (no group), 'world' basket (0) or group basket (any int > 0) """ if not((type(bskids) == list) or (type(bskids) == tuple)): bskids = [bskids] query = """SELECT b.id, min(u.id_usergroup) FROM bskBASKET b LEFT JOIN usergroup_bskBASKET u ON (b.id=u.id_bskBASKET) """ if len(bskids) != 0: query += " WHERE " query += " OR ".join(['b.id=%s'] * len(bskids)) query += " GROUP BY b.id" params = tuple(bskids) res = run_sql(query, params) if res: return res return () def get_basket_share_level(bskid): """Get the minimum share level of the basket (bskid). 
Returns: None for personal baskets positive integet for group baskets 0 for public baskets Will return 0 if the basket is both group and publicly shared.""" query = """ SELECT MIN(ugbsk.id_usergroup) FROM bskBASKET AS bsk LEFT JOIN usergroup_bskBASKET AS ugbsk ON ugbsk.id_bskBASKET=bsk.id WHERE bsk.id=%s GROUP BY bsk.id""" params = (bskid,) res = run_sql(query, params) return res def get_all_items_in_user_group_baskets(uid, group=0, format='hb'): """For the specified user, return all the items in their group baskets, grouped by basket if local or as a list if external. If group is set, return only that group's items.""" if group: group_clause = """AND ugbsk.id_usergroup=%s""" params_local = (group, uid) params_external = (group, uid, format) else: group_clause = "" params_local = (uid,) params_external = (uid, format) query_local = """ SELECT rec.id_bskBASKET, bsk.name, uug.id_usergroup, ug.name, ugbsk.share_level, GROUP_CONCAT(rec.id_bibrec_or_bskEXTREC) FROM bskREC AS rec JOIN bskBASKET AS bsk ON bsk.id=rec.id_bskBASKET JOIN usergroup_bskBASKET AS ugbsk ON ugbsk.id_bskBASKET=rec.id_bskBASKET %s JOIN user_usergroup AS uug ON uug.id_usergroup=ugbsk.id_usergroup AND uug.id_user=%%s JOIN usergroup AS ug ON ug.id=uug.id_usergroup WHERE rec.id_bibrec_or_bskEXTREC > 0 GROUP BY rec.id_bskBASKET""" % (group_clause,) res_local = run_sql(query_local, params_local) query_external = """ SELECT rec.id_bskBASKET, bsk.name, uug.id_usergroup, ug.name, ugbsk.share_level, rec.id_bibrec_or_bskEXTREC, ext.value FROM bskREC AS rec JOIN bskBASKET AS bsk ON bsk.id=rec.id_bskBASKET JOIN usergroup_bskBASKET AS ugbsk ON ugbsk.id_bskBASKET=rec.id_bskBASKET %s JOIN user_usergroup AS uug ON uug.id_usergroup=ugbsk.id_usergroup AND uug.id_user=%%s JOIN usergroup AS ug ON ug.id=uug.id_usergroup JOIN bskEXTFMT AS ext ON ext.id_bskEXTREC=-rec.id_bibrec_or_bskEXTREC AND ext.format=%%s WHERE rec.id_bibrec_or_bskEXTREC < 0 ORDER BY rec.id_bskBASKET""" % (group_clause,) res_external = run_sql(query_external, params_external) return (res_local, res_external) def get_all_items_in_user_group_baskets_by_matching_notes(uid, group=0, p=""): """For the specified user, return all the items in group personal baskets matching their notes' titles and bodies, grouped by basket. 
If topic is set, return only that topic's items.""" p = p and '%' + p + '%' or '%' if group: group_clause = """AND ugbsk.id_usergroup=%s""" params = (group, uid, p, p) else: group_clause = "" params = (uid, p, p) query = """ SELECT notes.id_bskBASKET, bsk.name, uug.id_usergroup, ug.name, ugbsk.share_level, GROUP_CONCAT(DISTINCT(notes.id_bibrec_or_bskEXTREC)) FROM bskRECORDCOMMENT AS notes JOIN bskBASKET AS bsk ON bsk.id=notes.id_bskBASKET JOIN usergroup_bskBASKET AS ugbsk ON ugbsk.id_bskBASKET=notes.id_bskBASKET AND ugbsk.share_level IS NOT NULL AND ugbsk.share_level!='NO' AND ugbsk.share_level!='RI' %s JOIN user_usergroup AS uug ON uug.id_usergroup=ugbsk.id_usergroup AND uug.id_user=%%s JOIN usergroup AS ug ON ug.id=uug.id_usergroup WHERE notes.title like %%s OR notes.body like %%s GROUP BY notes.id_bskBASKET""" % (group_clause,) res = run_sql(query, params) return res def is_group_basket_valid(uid, bskid): """Check if the basked (bskid) belongs to one of the groups the user (uid) is a member of and is valid.""" query = """ SELECT id FROM bskBASKET AS bsk JOIN usergroup_bskBASKET AS ugbsk ON ugbsk.id_bskBASKET=bsk.id JOIN user_usergroup AS uug ON uug.id_usergroup=ugbsk.id_usergroup AND uug.id_user=%s WHERE id=%s""" params = (uid, bskid) res = run_sql(query, params) return res def is_group_valid(uid, group): """Check if the group exists and the user is a member or manager.""" query = """ SELECT id_usergroup FROM user_usergroup WHERE id_usergroup=%s AND id_user=%s""" params = (group, uid) res = run_sql(query, params) return res def get_all_user_groups(uid): """Return a list of the groups the user is a member of or manages.""" query = """ SELECT ug.id, ug.name FROM user_usergroup AS uug JOIN usergroup AS ug ON ug.id=uug.id_usergroup JOIN usergroup_bskBASKET AS ugbsk ON ugbsk.id_usergroup=uug.id_usergroup WHERE uug.id_user=%s GROUP BY uug.id_usergroup""" params = (uid,) res = run_sql(query, params) return res ########################## External baskets ################################### def get_external_baskets_infos(uid): """Get general informations about every external basket user uid has subscribed to.""" query = """ SELECT bsk.id, bsk.name, DATE_FORMAT(bsk.date_modification, '%%Y-%%m-%%d %%H:%%i:%%s'), bsk.nb_views, count(rec.id_bibrec_or_bskEXTREC), DATE_FORMAT(max(rec.date_added), '%%Y-%%m-%%d %%H:%%i:%%s'), ugbsk.share_level FROM bskBASKET bsk JOIN user_bskBASKET ubsk ON (bsk.id=ubsk.id_bskBASKET AND ubsk.id_user=%s) LEFT JOIN bskREC rec ON (bsk.id=rec.id_bskBASKET) LEFT JOIN usergroup_bskBASKET ugbsk ON (ugbsk.id_bskBASKET=bsk.id AND ugbsk.id_usergroup=0) WHERE bsk.id_owner!=%s GROUP BY bsk.id """ uid = int(uid) params = (uid, uid) res = run_sql(query, params) if res: return res return () def get_external_basket_info(bskid): """""" query = """ SELECT bsk.id, bsk.name, DATE_FORMAT(bsk.date_modification, '%%Y-%%m-%%d %%H:%%i:%%s'), bsk.nb_views, count(rec.id_bibrec_or_bskEXTREC), DATE_FORMAT(max(rec.date_added), '%%Y-%%m-%%d %%H:%%i:%%s'), ugbsk.share_level FROM bskBASKET AS bsk LEFT JOIN bskREC AS rec ON bsk.id=rec.id_bskBASKET JOIN usergroup_bskBASKET AS ugbsk ON bsk.id=ugbsk.id_bskBASKET AND ugbsk.id_usergroup=0 WHERE id=%s""" params = (bskid,) res = run_sql(query, params) return res def get_all_external_basket_ids_and_names(uid): """For a given user return all their external baskets (in tuples: (id, name, number_of_records)).""" query = """ SELECT bsk.id, bsk.name, count(rec.id_bibrec_or_bskEXTREC), ugbsk.id_usergroup FROM user_bskBASKET AS ubsk JOIN bskBASKET AS bsk ON 
ubsk.id_bskBASKET=bsk.id AND ubsk.id_user!=bsk.id_owner LEFT JOIN bskREC AS rec ON ubsk.id_bskBASKET=rec.id_bskBASKET LEFT JOIN usergroup_bskBASKET AS ugbsk ON ugbsk.id_usergroup=0 AND ugbsk.id_bskBASKET=bsk.id WHERE ubsk.id_user=%s GROUP BY bsk.id ORDER BY bsk.name""" params = (uid,) res = run_sql(query, params) return res def count_external_baskets(uid): """Returns the number of external baskets the user is subscribed to.""" query = """ SELECT COUNT(ubsk.id_bskBASKET) FROM user_bskBASKET ubsk LEFT JOIN bskBASKET bsk ON (bsk.id=ubsk.id_bskBASKET AND ubsk.id_user=%s) WHERE bsk.id_owner!=%s""" params = (int(uid), int(uid)) res = run_sql(query, params) return __wash_sql_count(res) def get_all_external_baskets_names(uid, min_rights=CFG_WEBBASKET_SHARE_LEVELS['ADDCMT']): """ for a given user returns every basket which he has subscribed to and in which he can return a list of tuples: (bskid, bsk_name) """ uid = int(uid) try: min_rights_num = CFG_WEBBASKET_SHARE_LEVELS_ORDERED.index(min_rights) except ValueError: return () where_clause = ' AND (' for right in CFG_WEBBASKET_SHARE_LEVELS_ORDERED[min_rights_num:-1]: where_clause += "ugbsk.share_level = '%s' OR " % right where_clause += "ugbsk.share_level = '%s')" % CFG_WEBBASKET_SHARE_LEVELS_ORDERED[-1] query = """ SELECT bsk.id, bsk.name FROM bskBASKET bsk JOIN usergroup_bskBASKET ugbsk ON bsk.id=ugbsk.id_bskBASKET JOIN user_bskBASKET ubsk ON ubsk.id_bskBASKET=bsk.id WHERE ugbsk.id_usergroup=0 AND ubsk.id_user=%s AND NOT(bsk.id_owner=%s) AND NOT(ugbsk.share_level='NO') """ + where_clause params = (uid, uid) return run_sql(query, params) def get_all_items_in_user_public_baskets(uid, format='hb'): """For the specified user, return all the items in the public baskets they are subscribed to, grouped by basket if local or as a list if external.""" query_local = """ SELECT rec.id_bskBASKET, bsk.name, ugbsk.share_level, GROUP_CONCAT(rec.id_bibrec_or_bskEXTREC) FROM bskREC AS rec JOIN bskBASKET AS bsk ON bsk.id=rec.id_bskBASKET AND bsk.id_owner!=%s JOIN user_bskBASKET AS ubsk ON ubsk.id_bskBASKET=rec.id_bskBASKET AND ubsk.id_user=%s JOIN usergroup_bskBASKET AS ugbsk ON ugbsk.id_bskBASKET=rec.id_bskBASKET AND ugbsk.id_usergroup=0 WHERE rec.id_bibrec_or_bskEXTREC > 0 GROUP BY rec.id_bskBASKET""" params_local = (uid, uid) res_local = run_sql(query_local, params_local) query_external = """ SELECT rec.id_bskBASKET, bsk.name, ugbsk.share_level, rec.id_bibrec_or_bskEXTREC, ext.value FROM bskREC AS rec JOIN bskBASKET AS bsk ON bsk.id=rec.id_bskBASKET AND bsk.id_owner!=%s JOIN user_bskBASKET AS ubsk ON ubsk.id_bskBASKET=rec.id_bskBASKET AND ubsk.id_user=%s JOIN usergroup_bskBASKET AS ugbsk ON ugbsk.id_bskBASKET=rec.id_bskBASKET AND ugbsk.id_usergroup=0 JOIN bskEXTFMT AS ext ON ext.id_bskEXTREC=-rec.id_bibrec_or_bskEXTREC AND ext.format=%s WHERE rec.id_bibrec_or_bskEXTREC < 0 ORDER BY rec.id_bskBASKET""" params_external = (uid, uid, format) res_external = run_sql(query_external, params_external) return (res_local, res_external) def get_all_items_in_user_public_baskets_by_matching_notes(uid, p=""): """For the specified user, return all the items in the public baskets they are subscribed to, matching their notes' titles and bodies, grouped by basket""" p = p and '%' + p + '%' or '%' query = """ SELECT notes.id_bskBASKET, bsk.name, ugbsk.share_level, GROUP_CONCAT(DISTINCT(notes.id_bibrec_or_bskEXTREC)) FROM bskRECORDCOMMENT AS notes JOIN bskBASKET AS bsk ON bsk.id=notes.id_bskBASKET AND bsk.id_owner!=%s JOIN user_bskBASKET AS ubsk ON 
ubsk.id_bskBASKET=notes.id_bskBASKET AND ubsk.id_user=%s JOIN usergroup_bskBASKET AS ugbsk ON ugbsk.id_bskBASKET=notes.id_bskBASKET AND ugbsk.id_usergroup=0 AND ugbsk.share_level IS NOT NULL AND ugbsk.share_level!='NO' AND ugbsk.share_level!='RI' WHERE notes.title like %s OR notes.body like %s GROUP BY notes.id_bskBASKET""" params = (uid, uid, p, p) res = run_sql(query, params) return res def get_all_items_in_all_public_baskets(format='hb'): """Return all the items in all the public baskets, grouped by basket if local or as a list if external.""" query_local = """ SELECT rec.id_bskBASKET, bsk.name, ugbsk.share_level, GROUP_CONCAT(rec.id_bibrec_or_bskEXTREC) FROM bskREC AS rec JOIN bskBASKET AS bsk ON bsk.id=rec.id_bskBASKET JOIN usergroup_bskBASKET AS ugbsk ON ugbsk.id_bskBASKET=rec.id_bskBASKET AND ugbsk.id_usergroup=0 WHERE rec.id_bibrec_or_bskEXTREC > 0 GROUP BY rec.id_bskBASKET""" res_local = run_sql(query_local) query_external = """ SELECT rec.id_bskBASKET, bsk.name, ugbsk.share_level, rec.id_bibrec_or_bskEXTREC, ext.value FROM bskREC AS rec JOIN bskBASKET AS bsk ON bsk.id=rec.id_bskBASKET JOIN usergroup_bskBASKET AS ugbsk ON ugbsk.id_bskBASKET=rec.id_bskBASKET AND ugbsk.id_usergroup=0 JOIN bskEXTFMT AS ext ON ext.id_bskEXTREC=-rec.id_bibrec_or_bskEXTREC AND ext.format=%s WHERE rec.id_bibrec_or_bskEXTREC < 0 ORDER BY rec.id_bskBASKET""" params_external = (format,) res_external = run_sql(query_external, params_external) return (res_local, res_external) def get_all_items_in_all_public_baskets_by_matching_notes(p=""): """Return all the items in all the public baskets matching their notes' titles and bodies, grouped by basket""" p = p and '%' + p + '%' or '%' query = """ SELECT notes.id_bskBASKET, bsk.name, ugbsk.share_level, GROUP_CONCAT(DISTINCT(notes.id_bibrec_or_bskEXTREC)) FROM bskRECORDCOMMENT AS notes JOIN bskBASKET AS bsk ON bsk.id=notes.id_bskBASKET JOIN usergroup_bskBASKET AS ugbsk ON ugbsk.id_bskBASKET=notes.id_bskBASKET AND ugbsk.id_usergroup=0 AND ugbsk.share_level IS NOT NULL AND ugbsk.share_level!='NO' AND ugbsk.share_level!='RI' WHERE notes.title like %s OR notes.body like %s GROUP BY notes.id_bskBASKET""" params = (p, p) res = run_sql(query, params) return res ############################ Public access #################################### def get_public_basket_infos(bskid): """return (id, name, date modification, nb of views, id of owner, nickname of owner, rights for public access) for a given basket""" basket = [] query1 = """SELECT bsk.id, bsk.name, DATE_FORMAT(bsk.date_modification, '%%Y-%%m-%%d %%H:%%i:%%s'), bsk.nb_views, bsk.id_owner, user.nickname FROM bskBASKET bsk LEFT JOIN user ON bsk.id_owner=user.id WHERE bsk.id=%s""" res1 = run_sql(query1, (int(bskid),)) if len(res1): basket = list(res1[0]) query2 = """SELECT share_level FROM usergroup_bskBASKET WHERE id_usergroup=0 and id_bskBASKET=%s""" res2 = run_sql(query2, (int(bskid),)) if res2: basket.append(res2[0][0]) else: basket.append(None) return basket def get_public_basket_info(bskid): """Return information about a given public basket.""" query = """ SELECT bsk.id, bsk.name, bsk.id_owner, DATE_FORMAT(bsk.date_modification, '%%Y-%%m-%%d %%H:%%i:%%s'), bsk.nb_views, COUNT(rec.id_bibrec_or_bskEXTREC), GROUP_CONCAT(rec.id_bibrec_or_bskEXTREC), ubsk.share_level FROM bskBASKET AS bsk LEFT JOIN bskREC AS rec ON rec.id_bskBASKET=bsk.id JOIN usergroup_bskBASKET AS ubsk ON ubsk.id_bskBASKET=bsk.id AND ubsk.id_usergroup=0 WHERE bsk.id=%s GROUP BY bsk.id;""" params = (bskid,) res = run_sql(query, params) return res def 
get_basket_general_infos(bskid): """return information about a basket, suited for public access. @return: a (id, name, date of modification, nb of views, nb of records, id of owner) tuple """ query = """SELECT bsk.id, bsk.name, DATE_FORMAT(bsk.date_modification, '%%Y-%%m-%%d %%H:%%i:%%s'), bsk.nb_views, count(rec.id_bibrec_or_bskEXTREC), bsk.id_owner FROM bskBASKET bsk LEFT JOIN bskREC rec ON bsk.id=rec.id_bskBASKET WHERE bsk.id=%s GROUP BY bsk.id""" res = run_sql(query, (int(bskid),)) if res: query2 = "UPDATE bskBASKET SET nb_views=nb_views+1 WHERE id=%s" run_sql(query2, (int(bskid),)) return res[0] return () def get_basket_owner_id(bskid): """Return the uid of the owner.""" query = """SELECT id_owner FROM bskBASKET WHERE id=%s""" res = run_sql(query, (bskid, )) if res: return res[0][0] return -1 def count_public_baskets(): """Returns the number of public baskets.""" query = """ SELECT COUNT(id_bskBASKET) FROM usergroup_bskBASKET WHERE id_usergroup=0""" res = run_sql(query) return __wash_sql_count(res) def get_public_baskets_list(inf_limit, max_number, order=1, asc=1): """Return list of public baskets @param inf_limit: limit to baskets from number x @param max_number: number of baskets to return @order: 1: order by name of basket, 2: number of views, 3: owner @return: [(basket id, basket name, nb of views, uid of owner, nickname of owner)]""" query = """SELECT bsk.id, bsk.name, bsk.nb_views, u.id, u.nickname FROM bskBASKET bsk LEFT JOIN usergroup_bskBASKET ugbsk on bsk.id=ugbsk.id_bskBASKET LEFT JOIN user u on bsk.id_owner=u.id WHERE ugbsk.id_usergroup=0 """ if order == 2: query += 'ORDER BY bsk.nb_views' elif order == 3: query += 'ORDER BY u.nickname' if asc: query += ' ASC' else: query += ' DESC' query += ', u.id' else: query += 'ORDER BY bsk.name' if asc: query += ' ASC ' else: query += ' DESC ' query += "LIMIT %s,%s" return run_sql(query, (inf_limit, max_number)) def count_all_public_baskets(): """Return the number of all the public baskets.""" query = """ SELECT count(id_bskBASKET) FROM usergroup_bskBASKET WHERE id_usergroup=0""" res = run_sql(query) return __wash_sql_count(res) def get_list_public_baskets(page, max_number, sort='name', asc=1): """Return list of public baskets @param page: limit to baskets from number x @param max_number: maximum number of baskets to return @sort: 1: order by name of basket, 2: number of views, 3: owner @return: [(basket id, basket name, nb of views, uid of owner, nickname of owner)]""" query = """ SELECT bsk.id, bsk.name, bsk.id_owner, u.nickname, DATE_FORMAT(bsk.date_modification, '%%Y-%%m-%%d %%H:%%i:%%s'), COUNT(rec.id_bibrec_or_bskEXTREC) AS items, bsk.nb_views FROM usergroup_bskBASKET AS ugbsk JOIN bskBASKET AS bsk ON bsk.id=ugbsk.id_bskBASKET LEFT JOIN bskREC AS rec ON rec.id_bskBASKET=bsk.id LEFT JOIN user AS u ON u.id=bsk.id_owner WHERE ugbsk.id_usergroup=0 GROUP BY bsk.id""" if sort == 'name': query += """ ORDER BY bsk.name""" elif sort == 'owner': query += """ ORDER BY u.nickname""" elif sort == 'views': query += """ ORDER BY bsk.nb_views""" elif sort == 'date': query += """ ORDER BY bsk.date_modification""" elif sort == 'items': query += """ ORDER BY items""" else: query += """ ORDER BY bsk.name""" if asc: query += """ ASC""" if sort == """owner""": query += """, u.id""" else: query += """ DESC""" if sort == """owner""": query += """, u.id""" query += """ LIMIT %s, %s""" page = max(0, page) res = run_sql(query, (page, max_number)) return res def is_basket_public(bskid): """Check if the given basket is public. 
Returns ((0,),) if False, ((1,),) if True.""" query = """ SELECT COUNT(*) FROM usergroup_bskBASKET WHERE id_usergroup=0 AND id_bskBASKET=%s""" params = (bskid,) res = run_sql(query, params) return __wash_sql_count(res) def subscribe(uid, bskid): """Subscribe the given user to the given basket.""" query1 = """SELECT COUNT(*) FROM user_bskBASKET WHERE id_user=%s AND id_bskBASKET=%s""" params1 = (uid, bskid) res1 = run_sql(query1, params1) if res1[0][0]: # The user is either the owner of the basket or is already subscribed. return False else: query2 = """INSERT INTO user_bskBASKET (id_user, id_bskBASKET) VALUES (%s, %s)""" params2 = (uid, bskid) run_sql(query2, params2) return True def unsubscribe(uid, bskid): """Unsubscribe the given user from the given basket.""" query1 = """SELECT COUNT(*) FROM bskBASKET WHERE id_owner=%s AND id=%s""" params1 = (uid, bskid) res1 = run_sql(query1, params1) if res1[0][0]: # The user is the owner of the basket. return False else: query2 = """DELETE FROM user_bskBASKET WHERE id_user=%s AND id_bskBASKET=%s""" params2 = (uid, bskid) res2 = run_sql(query2, params2) if res2: return True else: return False def is_user_subscribed_to_basket(uid, bskid): """Return ((1,),) if the user is subscribed to the given basket or ((0,),) if the user is not subscribed or is the owner of the basket.""" query = """ SELECT COUNT(ubsk.id_bskBASKET) FROM user_bskBASKET AS ubsk JOIN bskBASKET AS bsk ON bsk.id=ubsk.id_bskBASKET AND bsk.id_owner!=ubsk.id_user WHERE ubsk.id_user=%s AND ubsk.id_bskBASKET=%s""" params = (uid, bskid) res = run_sql(query, params) return __wash_sql_count(res) def count_subscribers(uid, bskid): """Returns a (number of users, number of groups, number of alerts) tuple for the given user (uid) and basket (bskid).""" uid = int(uid) bskid = int(bskid) query_groups = """ SELECT count(id_usergroup) FROM usergroup_bskBASKET WHERE id_bskBASKET=%s AND NOT(share_level='NO') GROUP BY id_bskBASKET""" params_groups = (bskid,) res_groups = run_sql(query_groups, params_groups) nb_groups = __wash_sql_count(res_groups) query_users = """ SELECT count(id_user) FROM user_bskBASKET WHERE id_bskBASKET=%s AND id_user!=%s GROUP BY id_bskBASKET""" params_users = (bskid, uid) res_users = run_sql(query_users, params_users) nb_users = __wash_sql_count(res_users) query_alerts = """ SELECT count(id_query) FROM user_query_basket WHERE id_basket=%s GROUP BY id_basket""" params_alerts = (bskid,) res_alerts = run_sql(query_alerts, params_alerts) nb_alerts = __wash_sql_count(res_alerts) return (nb_users, nb_groups, nb_alerts) def get_groups_subscribing_to_basket(bskid): """ get list of (group id, group name, rights) tuples for a given basket Please note that group 0 is used to mean everybody. 
""" query = """SELECT ugb.id_usergroup, ug.name, ugb.share_level FROM usergroup_bskBASKET ugb LEFT JOIN usergroup ug ON ugb.id_usergroup=ug.id WHERE ugb.id_bskBASKET=%s ORDER BY ugb.id_usergroup""" return run_sql(query, (int(bskid),)) def get_rights_on_public_basket(bskid): """""" query = """ SELECT share_level FROM usergroup_bskBASKET WHERE id_usergroup=0 AND id_bskBASKET=%s""" params = (bskid,) res = run_sql(query, params) return res def count_public_basket_subscribers(bskid): """Return the number of users subscribed to the given public basket.""" query = """ SELECT COUNT(ubsk.id_user) FROM user_bskBASKET AS ubsk JOIN bskBASKET AS bsk ON bsk.id=ubsk.id_bskBASKET AND bsk.id_owner!=ubsk.id_user WHERE ubsk.id_bskBASKET=%s""" params = (bskid,) res = run_sql(query, params) return __wash_sql_count(res) ################################ Notes ######################################## def get_notes(bskid, recid): """Return all comments for record recid in basket bskid.""" query = """ SELECT user.id, user.nickname, bskcmt.title, bskcmt.body, DATE_FORMAT(bskcmt.date_creation, '%%Y-%%m-%%d %%H:%%i:%%s'), bskcmt.priority, bskcmt.id, bskcmt.in_reply_to_id_bskRECORDCOMMENT FROM bskRECORDCOMMENT bskcmt LEFT JOIN user ON (bskcmt.id_user=user.id) WHERE bskcmt.id_bskBASKET=%s AND bskcmt.id_bibrec_or_bskEXTREC=%s ORDER BY bskcmt.reply_order_cached_data """ bskid = int(bskid) recid = int(recid) res = run_sql(query, (bskid, recid)) if res: return res else: return () def get_note(cmtid): """Return comment cmtid as a (author's nickname, author's uid, title, body, date of creation, priority) tuple""" out = () query = """ SELECT user.nickname, user.id, bskcmt.title, bskcmt.body, DATE_FORMAT(bskcmt.date_creation, '%%Y-%%m-%%d %%H:%%i:%%s'), bskcmt.priority FROM bskRECORDCOMMENT bskcmt LEFT JOIN user ON (bskcmt.id_user=user.id) WHERE bskcmt.id=%s """ cmtid = int(cmtid) res = run_sql(query, (cmtid,)) if res: return res[0] return out def save_note(uid, bskid, recid, title, body, date_creation=None, reply_to=None): """Save then given note (title, body) on the given item in the given basket. 
@param date_creation: date on which the note was created
    @type date_creation: None or String, e.g.: '2011-07-04 14:20:57'
        Note: convert_datestruct_to_datetext((2005, 11, 16, 15, 11, 44, 2, 320, 0))
              -> '2005-11-16 15:11:44'
    """
    if reply_to and CFG_WEBBASKET_MAX_COMMENT_THREAD_DEPTH >= 0:
        # Check that we have not reached max depth
        note_ancestors = get_note_ancestors(reply_to)
        if len(note_ancestors) >= CFG_WEBBASKET_MAX_COMMENT_THREAD_DEPTH:
            if CFG_WEBBASKET_MAX_COMMENT_THREAD_DEPTH == 0:
                reply_to = None
            else:
                reply_to = note_ancestors[CFG_WEBBASKET_MAX_COMMENT_THREAD_DEPTH - 1]

    if not date_creation:
        date = convert_datestruct_to_datetext(localtime())
    else:
        # the date comes with the proper format
        date = date_creation

    res = run_sql("""INSERT INTO bskRECORDCOMMENT (id_user, id_bskBASKET,
                       id_bibrec_or_bskEXTREC, title, body, date_creation,
                       in_reply_to_id_bskRECORDCOMMENT)
                     VALUES (%s, %s, %s, %s, %s, %s, %s)""",
                  (int(uid), int(bskid), int(recid), title, body, date, reply_to or 0))
    if res:
        new_comid = int(res)
        parent_reply_order = run_sql("""SELECT reply_order_cached_data from bskRECORDCOMMENT where id=%s""", (reply_to,))
        if not parent_reply_order or parent_reply_order[0][0] is None:
            parent_reply_order = ''
        else:
            parent_reply_order = parent_reply_order[0][0]
-        run_sql("""UPDATE bskRECORDCOMMENT SET reply_order_cached_data=%s WHERE id=%s""",
+        run_sql("""UPDATE bskRECORDCOMMENT SET reply_order_cached_data=_binary %s WHERE id=%s""",
                (parent_reply_order + get_reply_order_cache_data(new_comid), new_comid))
        return int(res)
    return 0

def delete_note(bskid, recid, cmtid):
    """Delete a comment on an item of a basket"""
    query = """ DELETE FROM bskRECORDCOMMENT
                WHERE id_bskBASKET=%s
                AND id_bibrec_or_bskEXTREC=%s
                AND id=%s"""
    params = (int(bskid), int(recid), int(cmtid))
    run_sql(query, params)

def get_note_ancestors(cmtid, depth=None):
    """
    Returns the list of ancestors of the given note, ordered from oldest to
    newest ("top-down": the direct parent of cmtid is at the last position),
    up to the given depth.

    @param cmtid: the ID of the note for which we want to retrieve ancestors
    @type cmtid: int
    @param depth: the maximum number of levels up from the given note for
                  which we want to retrieve ancestors. None for no limit,
                  1 for direct parent only, etc.
    @type depth: int
    @return: the list of ancestors
    @rtype: list
    """
    if depth == 0:
        return []

    res = run_sql("SELECT in_reply_to_id_bskRECORDCOMMENT FROM bskRECORDCOMMENT WHERE id=%s", (cmtid,))
    if res:
        parent_cmtid = res[0][0]
        if parent_cmtid == 0:
            return []
        parent_ancestors = []
        if depth:
            depth -= 1
        parent_ancestors = get_note_ancestors(parent_cmtid, depth)
        parent_ancestors.append(parent_cmtid)
        return parent_ancestors
    else:
        return []

def note_belongs_to_item_in_basket_p(cmtid, recid, bskid):
    """Returns 1 (True) if the given note (cmtid) belongs to the given item
    (recid) and the given basket (bskid), or 0 (False)."""
    query = """ SELECT COUNT(*)
                FROM bskRECORDCOMMENT
                WHERE id=%s
                AND id_bibrec_or_bskEXTREC=%s
                AND id_bskBASKET=%s"""
    params = (cmtid, recid, bskid)
    res = run_sql(query, params)
    return __wash_sql_count(res)

def get_number_of_notes_per_record_in_basket(bskid, recids):
    """Returns the number of comments per record for all the given records
    in the given basket"""
    # We need to convert the list of recids into a string of comma separated
    # numbers (recids), instead of a tuple, to cover the case where we have
    # single element lists of recids. Example:
    # [1] --> '1' instead of [1] --> (1,)
    # Single element tuples would cause the query to fail due to the syntax.
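    # Note on the `_binary` change in save_note() above (illustrative comment,
    # not part of the patch): the reply_order_cached_data column holds the raw
    # 4-byte value produced by get_reply_order_cache_data(), e.g. for id 200:
    #
    #     packed = "%s%s%s%s" % (chr(0), chr(0), chr(0), chr(200))   # '\x00\x00\x00\xc8'
    #
    # '\xc8' is not valid UTF-8, so on MySQL setups that interpret bound string
    # parameters in the connection character set the bare %s placeholder can be
    # rejected or silently mangled; the `_binary %s` introducer tells MySQL to
    # treat the bound value as raw bytes instead. The same change is applied to
    # cmtRECORDCOMMENT in webcomment.py further down in this patch.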
query = """ SELECT rec.id_bibrec_or_bskEXTREC, COUNT(cmt.id_bibrec_or_bskEXTREC) FROM bskREC as rec LEFT JOIN bskRECORDCOMMENT as cmt ON cmt.id_bibrec_or_bskEXTREC = rec.id_bibrec_or_bskEXTREC WHERE rec.id_bskBASKET=%%s AND rec.id_bibrec_or_bskEXTREC IN (%s) GROUP BY id_bibrec_or_bskEXTREC ORDER BY rec.score""" % (str(map(int, recids))[1:-1],) params = (bskid,) result = run_sql(query, params) return result ########################## Usergroup functions ################################ def get_group_infos(uid): """For each group the user with uid is a member of return the id, name and number of baskets.""" query = """SELECT g.id, g.name, count(ugb.id_bskBASKET) FROM usergroup g LEFT JOIN (user_usergroup ug, usergroup_bskBASKET ugb) ON (g.id=ug.id_usergroup AND g.id=ugb.id_usergroup) WHERE ug.id_user=%s AND NOT(ugb.share_level='NO') AND ug.user_status!=%s GROUP BY g.id ORDER BY g.name""" params = (int(uid), CFG_WEBSESSION_USERGROUP_STATUS['PENDING']) res = run_sql(query, params) return res def count_groups_user_member_of(uid): """Returns the number of groups the user has joined.""" query = """ SELECT COUNT(id_usergroup) FROM user_usergroup WHERE id_user=%s AND user_status!=%s""" params = (int(uid), CFG_WEBSESSION_USERGROUP_STATUS['PENDING']) res = run_sql(query, params) return __wash_sql_count(res) def get_groups_user_member_of(uid): """ Get uids and names of groups user is member of. @param uid: user id (int) @return: a tuple of (group_id, group_name) tuples """ query = """ SELECT g.id, g.name FROM usergroup g JOIN user_usergroup ug ON (g.id=ug.id_usergroup) WHERE ug.id_user=%s and ug.user_status!=%s ORDER BY g.name """ params = (int(uid), CFG_WEBSESSION_USERGROUP_STATUS['PENDING']) res = run_sql(query, params) if res: return res return () ########################## auxilliary functions ############################### def __wash_sql_count(res): """Wash the result of SQL COUNT function and return only an integer.""" if res: return res[0][0] return 0 def __decompress_last(item): """private function, used to shorten code""" item = list(item) item[-1] = decompress(item[-1]) return item def create_pseudo_record(es_title, es_desc, es_url, of="hb"): """Return a pseudo record representation given a title and a description.""" if of == 'hb': record = '\n'.join([es_title, es_desc, es_url]) if of == 'xm': # In case we want to use the controlfield, # the -es_id must be used. #%s record = """ %s %s %s """ % (encode_for_xml(es_title), encode_for_xml(es_desc), encode_for_xml(es_url)) return record def prettify_url(url, char_limit=50, nb_dots=3): """If the url has more characters than char_limit return a shortened version of it keeping the beginning and ending and replacing the rest with dots.""" if len(url) > char_limit: # let's set a minimum character limit if char_limit < 5: char_limit = 5 # let's set a maximum number of dots in relation to the character limit if nb_dots > char_limit/4: nb_dots = char_limit/5 nb_char_url = char_limit - nb_dots nb_char_end = nb_char_url/4 nb_char_beg = nb_char_url - nb_char_end return url[:nb_char_beg] + '.'*nb_dots + url[-nb_char_end:] else: return url diff --git a/modules/webcomment/lib/webcomment.py b/modules/webcomment/lib/webcomment.py index b3f1fabbd..374f186c6 100644 --- a/modules/webcomment/lib/webcomment.py +++ b/modules/webcomment/lib/webcomment.py @@ -1,2039 +1,2039 @@ # -*- coding: utf-8 -*- # This file is part of Invenio. -# Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2012 CERN. 
+# Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2012, 2016 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ Comments and reviews for records """ __revision__ = "$Id$" # non Invenio imports: import time import math import os import shutil import cgi import re from datetime import datetime, timedelta # Invenio imports: from invenio.dbquery import run_sql from invenio.config import CFG_PREFIX, \ CFG_SITE_LANG, \ CFG_WEBALERT_ALERT_ENGINE_EMAIL,\ CFG_SITE_SUPPORT_EMAIL,\ CFG_WEBCOMMENT_ALERT_ENGINE_EMAIL,\ CFG_SITE_URL,\ CFG_SITE_NAME,\ CFG_WEBCOMMENT_ALLOW_REVIEWS,\ CFG_WEBCOMMENT_ALLOW_SHORT_REVIEWS,\ CFG_WEBCOMMENT_ALLOW_COMMENTS,\ CFG_WEBCOMMENT_ADMIN_NOTIFICATION_LEVEL,\ CFG_WEBCOMMENT_NB_REPORTS_BEFORE_SEND_EMAIL_TO_ADMIN,\ CFG_WEBCOMMENT_TIMELIMIT_PROCESSING_COMMENTS_IN_SECONDS,\ CFG_WEBCOMMENT_DEFAULT_MODERATOR, \ CFG_SITE_RECORD, \ CFG_WEBCOMMENT_EMAIL_REPLIES_TO, \ CFG_WEBCOMMENT_ROUND_DATAFIELD, \ CFG_WEBCOMMENT_RESTRICTION_DATAFIELD, \ CFG_WEBCOMMENT_MAX_COMMENT_THREAD_DEPTH from invenio.webmessage_mailutils import \ email_quote_txt, \ email_quoted_txt2html from invenio.htmlutils import tidy_html from invenio.webuser import get_user_info, get_email, collect_user_info from invenio.dateutils import convert_datetext_to_dategui, \ datetext_default, \ convert_datestruct_to_datetext from invenio.mailutils import send_email from invenio.errorlib import register_exception from invenio.messages import wash_language, gettext_set_language from invenio.urlutils import wash_url_argument from invenio.webcomment_config import CFG_WEBCOMMENT_ACTION_CODE from invenio.access_control_engine import acc_authorize_action from invenio.search_engine import \ guess_primary_collection_of_a_record, \ check_user_can_view_record, \ get_collection_reclist, \ get_colID from invenio.search_engine_utils import get_fieldvalues from invenio.webcomment_washer import EmailWasher try: import invenio.template webcomment_templates = invenio.template.load('webcomment') except: pass def perform_request_display_comments_or_remarks(req, recID, display_order='od', display_since='all', nb_per_page=100, page=1, ln=CFG_SITE_LANG, voted=-1, reported=-1, subscribed=0, reviews=0, uid=-1, can_send_comments=False, can_attach_files=False, user_is_subscribed_to_discussion=False, user_can_unsubscribe_from_discussion=False, display_comment_rounds=None): """ Returns all the comments (reviews) of a specific internal record or external basket record. 
@param recID: record id where (internal record IDs > 0) or (external basket record IDs < -100) @param display_order: hh = highest helpful score, review only lh = lowest helpful score, review only hs = highest star score, review only ls = lowest star score, review only od = oldest date nd = newest date @param display_since: all= no filtering by date nd = n days ago nw = n weeks ago nm = n months ago ny = n years ago where n is a single digit integer between 0 and 9 @param nb_per_page: number of results per page @param page: results page @param voted: boolean, active if user voted for a review, see perform_request_vote function @param reported: boolean, active if user reported a certain comment/review, perform_request_report function @param subscribed: int, 1 if user just subscribed to discussion, -1 if unsubscribed @param reviews: boolean, enabled if reviews, disabled for comments @param uid: the id of the user who is reading comments @param can_send_comments: if user can send comment or not @param can_attach_files: if user can attach file to comment or not @param user_is_subscribed_to_discussion: True if user already receives new comments by email @param user_can_unsubscribe_from_discussion: True is user is allowed to unsubscribe from discussion @return html body. """ _ = gettext_set_language(ln) warnings = [] nb_reviews = 0 nb_comments = 0 # wash arguments recID = wash_url_argument(recID, 'int') ln = wash_language(ln) display_order = wash_url_argument(display_order, 'str') display_since = wash_url_argument(display_since, 'str') nb_per_page = wash_url_argument(nb_per_page, 'int') page = wash_url_argument(page, 'int') voted = wash_url_argument(voted, 'int') reported = wash_url_argument(reported, 'int') reviews = wash_url_argument(reviews, 'int') # vital argument check (valid, error_body) = check_recID_is_in_range(recID, warnings, ln) if not(valid): return error_body # CERN hack begins: filter out ATLAS comments from invenio.config import CFG_CERN_SITE if CFG_CERN_SITE: restricted_comments_p = False for report_number in get_fieldvalues(recID, '088__a'): if report_number.startswith("ATL-"): restricted_comments_p = True break if restricted_comments_p: err_code, err_msg = acc_authorize_action(uid, 'viewrestrcoll', collection='ATLAS Communications') if err_code: return err_msg # CERN hack ends # Query the database and filter results user_info = collect_user_info(uid) res = query_retrieve_comments_or_remarks(recID, display_order, display_since, reviews, user_info=user_info) # res2 = query_retrieve_comments_or_remarks(recID, display_order, display_since, not reviews, user_info=user_info) nb_res = len(res) from invenio.webcommentadminlib import get_nb_reviews, get_nb_comments nb_reviews = get_nb_reviews(recID, count_deleted=False) nb_comments = get_nb_comments(recID, count_deleted=False) # checking non vital arguemnts - will be set to default if wrong #if page <= 0 or page.lower() != 'all': if page < 0: page = 1 if nb_per_page < 0: nb_per_page = 100 if CFG_WEBCOMMENT_ALLOW_REVIEWS and reviews: if display_order not in ['od', 'nd', 'hh', 'lh', 'hs', 'ls']: display_order = 'hh' else: if display_order not in ['od', 'nd']: display_order = 'od' if not display_comment_rounds: display_comment_rounds = [] # filter results according to page and number of reults per page if nb_per_page > 0: if nb_res > 0: last_page = int(math.ceil(nb_res / float(nb_per_page))) else: last_page = 1 if page > last_page: page = 1 if nb_res > nb_per_page: # if more than one page of results if page < last_page: res = 
res[(page-1)*(nb_per_page) : (page*nb_per_page)] else: res = res[(page-1)*(nb_per_page) : ] else: # one page of results pass else: last_page = 1 # Add information regarding visibility of comment for user user_collapsed_comments = get_user_collapsed_comments_for_record(uid, recID) if reviews: res = [row[:] + (row[10] in user_collapsed_comments,) for row in res] else: res = [row[:] + (row[6] in user_collapsed_comments,) for row in res] # Send to template avg_score = 0.0 # comments not allowed by admin if not CFG_WEBCOMMENT_ALLOW_COMMENTS and not CFG_WEBCOMMENT_ALLOW_REVIEWS: body = webcomment_templates.tmpl_error( _('Comments on records have been disallowed by the' ' administrator.'), ln) return body if reported > 0: warnings.append((_('Your feedback has been recorded, many thanks.'), 'green')) elif reported == 0: warnings.append((_('You have already reported an abuse for this' ' comment.'), '')) elif reported == -2: warnings.append((_('The comment you have reported no longer ' 'exists.'), '')) if CFG_WEBCOMMENT_ALLOW_REVIEWS and reviews: avg_score = calculate_avg_score(res) if voted > 0: warnings.append((_('Your feedback has been recorded, many' ' thanks.'), 'green')) elif voted == 0: warnings.append((_('Sorry, you have already voted. This vote has ' 'not been recorded.'), '')) if subscribed == 1: warnings.append( (_('You have been subscribed to this discussion. From now on, you' ' will receive an email whenever a new comment is posted.'), 'green') ) elif subscribed == -1: warnings.append((_('You have been unsubscribed from this discussion.'), 'green')) grouped_comments = group_comments_by_round(res, reviews) # Clean list of comments round names if not display_comment_rounds: display_comment_rounds = [] elif 'all' in display_comment_rounds: display_comment_rounds = [cmtgrp[0] for cmtgrp in grouped_comments] elif 'latest' in display_comment_rounds: if grouped_comments: display_comment_rounds.append(grouped_comments[-1][0]) display_comment_rounds.remove('latest') body = webcomment_templates.tmpl_get_comments(req, recID, ln, nb_per_page, page, last_page, display_order, display_since, CFG_WEBCOMMENT_ALLOW_REVIEWS, grouped_comments, nb_comments, avg_score, warnings, border=0, reviews=reviews, total_nb_reviews=nb_reviews, uid=uid, can_send_comments=can_send_comments, can_attach_files=can_attach_files, user_is_subscribed_to_discussion=\ user_is_subscribed_to_discussion, user_can_unsubscribe_from_discussion=\ user_can_unsubscribe_from_discussion, display_comment_rounds=display_comment_rounds) return body def perform_request_vote(cmt_id, client_ip_address, value, uid=-1): """ Vote positively or negatively for a comment/review @param cmt_id: review id @param value: +1 for voting positively -1 for voting negatively @return: integer 1 if successful, integer 0 if not """ cmt_id = wash_url_argument(cmt_id, 'int') client_ip_address = wash_url_argument(client_ip_address, 'str') value = wash_url_argument(value, 'int') uid = wash_url_argument(uid, 'int') if cmt_id > 0 and value in [-1, 1] and check_user_can_vote(cmt_id, client_ip_address, uid): action_date = convert_datestruct_to_datetext(time.localtime()) action_code = CFG_WEBCOMMENT_ACTION_CODE['VOTE'] query = """INSERT INTO cmtACTIONHISTORY (id_cmtRECORDCOMMENT, id_bibrec, id_user, client_host, action_time, action_code) VALUES (%s, NULL ,%s, inet_aton(%s), %s, %s)""" params = (cmt_id, uid, client_ip_address, action_date, action_code) run_sql(query, params) return query_record_useful_review(cmt_id, value) else: return 0 def check_user_can_comment(recID, 
client_ip_address, uid=-1): """ Check if a user hasn't already commented within the last seconds time limit: CFG_WEBCOMMENT_TIMELIMIT_PROCESSING_COMMENTS_IN_SECONDS @param recID: record id @param client_ip_address: IP => use: str(req.remote_ip) @param uid: user id, as given by invenio.webuser.getUid(req) """ recID = wash_url_argument(recID, 'int') client_ip_address = wash_url_argument(client_ip_address, 'str') uid = wash_url_argument(uid, 'int') max_action_time = time.time() - CFG_WEBCOMMENT_TIMELIMIT_PROCESSING_COMMENTS_IN_SECONDS max_action_time = convert_datestruct_to_datetext(time.localtime(max_action_time)) action_code = CFG_WEBCOMMENT_ACTION_CODE['ADD_COMMENT'] query = """SELECT id_bibrec FROM cmtACTIONHISTORY WHERE id_bibrec=%s AND action_code=%s AND action_time>%s """ params = (recID, action_code, max_action_time) if uid < 0: query += " AND client_host=inet_aton(%s)" params += (client_ip_address,) else: query += " AND id_user=%s" params += (uid,) res = run_sql(query, params) return len(res) == 0 def check_user_can_review(recID, client_ip_address, uid=-1): """ Check if a user hasn't already reviewed within the last seconds time limit: CFG_WEBCOMMENT_TIMELIMIT_PROCESSING_REVIEWS_IN_SECONDS @param recID: record ID @param client_ip_address: IP => use: str(req.remote_ip) @param uid: user id, as given by invenio.webuser.getUid(req) """ action_code = CFG_WEBCOMMENT_ACTION_CODE['ADD_REVIEW'] query = """SELECT id_bibrec FROM cmtACTIONHISTORY WHERE id_bibrec=%s AND action_code=%s """ params = (recID, action_code) if uid < 0: query += " AND client_host=inet_aton(%s)" params += (client_ip_address,) else: query += " AND id_user=%s" params += (uid,) res = run_sql(query, params) return len(res) == 0 def check_user_can_vote(cmt_id, client_ip_address, uid=-1): """ Checks if a user hasn't already voted @param cmt_id: comment id @param client_ip_address: IP => use: str(req.remote_ip) @param uid: user id, as given by invenio.webuser.getUid(req) """ cmt_id = wash_url_argument(cmt_id, 'int') client_ip_address = wash_url_argument(client_ip_address, 'str') uid = wash_url_argument(uid, 'int') query = """SELECT id_cmtRECORDCOMMENT FROM cmtACTIONHISTORY WHERE id_cmtRECORDCOMMENT=%s""" params = (cmt_id,) if uid < 0: query += " AND client_host=inet_aton(%s)" params += (client_ip_address,) else: query += " AND id_user=%s" params += (uid, ) res = run_sql(query, params) return (len(res) == 0) def get_comment_collection(cmt_id): """ Extract the collection where the comment is written """ query = "SELECT id_bibrec FROM cmtRECORDCOMMENT WHERE id=%s" recid = run_sql(query, (cmt_id,)) record_primary_collection = guess_primary_collection_of_a_record(recid[0][0]) return record_primary_collection def get_collection_moderators(collection): """ Return the list of comment moderators for the given collection. """ from invenio.access_control_engine import acc_get_authorized_emails res = list(acc_get_authorized_emails('moderatecomments', collection=collection)) if not res: return [CFG_WEBCOMMENT_DEFAULT_MODERATOR,] return res def perform_request_report(cmt_id, client_ip_address, uid=-1): """ Report a comment/review for inappropriate content. Will send an email to the administrator if number of reports is a multiple of CFG_WEBCOMMENT_NB_REPORTS_BEFORE_SEND_EMAIL_TO_ADMIN @param cmt_id: comment id @return: integer 1 if successful, integer 0 if not. 
-2 if comment does not exist """ cmt_id = wash_url_argument(cmt_id, 'int') if cmt_id <= 0: return 0 (query_res, nb_abuse_reports) = query_record_report_this(cmt_id) if query_res == 0: return 0 elif query_res == -2: return -2 if not(check_user_can_report(cmt_id, client_ip_address, uid)): return 0 action_date = convert_datestruct_to_datetext(time.localtime()) action_code = CFG_WEBCOMMENT_ACTION_CODE['REPORT_ABUSE'] query = """INSERT INTO cmtACTIONHISTORY (id_cmtRECORDCOMMENT, id_bibrec, id_user, client_host, action_time, action_code) VALUES (%s, NULL, %s, inet_aton(%s), %s, %s)""" params = (cmt_id, uid, client_ip_address, action_date, action_code) run_sql(query, params) if nb_abuse_reports % CFG_WEBCOMMENT_NB_REPORTS_BEFORE_SEND_EMAIL_TO_ADMIN == 0: (cmt_id2, id_bibrec, id_user, cmt_body, cmt_date, cmt_star, cmt_vote, cmt_nb_votes_total, cmt_title, cmt_reported, round_name, restriction) = query_get_comment(cmt_id) (user_nb_abuse_reports, user_votes, user_nb_votes_total) = query_get_user_reports_and_votes(int(id_user)) (nickname, user_email, last_login) = query_get_user_contact_info(id_user) from_addr = '%s Alert Engine <%s>' % (CFG_SITE_NAME, CFG_WEBALERT_ALERT_ENGINE_EMAIL) comment_collection = get_comment_collection(cmt_id) to_addrs = get_collection_moderators(comment_collection) subject = "A comment has been reported as inappropriate by a user" body = ''' The following comment has been reported a total of %(cmt_reported)s times. Author: nickname = %(nickname)s email = %(user_email)s user_id = %(uid)s This user has: total number of reports = %(user_nb_abuse_reports)s %(votes)s Comment: comment_id = %(cmt_id)s record_id = %(id_bibrec)s date written = %(cmt_date)s nb reports = %(cmt_reported)s %(review_stuff)s body = ---start body--- %(cmt_body)s ---end body--- Please go to the record page %(comment_admin_link)s to delete this message if necessary. A warning will be sent to the user in question.''' % \ { 'cfg-report_max' : CFG_WEBCOMMENT_NB_REPORTS_BEFORE_SEND_EMAIL_TO_ADMIN, 'nickname' : nickname, 'user_email' : user_email, 'uid' : id_user, 'user_nb_abuse_reports' : user_nb_abuse_reports, 'user_votes' : user_votes, 'votes' : CFG_WEBCOMMENT_ALLOW_REVIEWS and \ "total number of positive votes\t= %s\n\t\ttotal number of negative votes\t= %s" % \ (user_votes, (user_nb_votes_total - user_votes)) or "\n", 'cmt_id' : cmt_id, 'id_bibrec' : id_bibrec, 'cmt_date' : cmt_date, 'cmt_reported' : cmt_reported, 'review_stuff' : CFG_WEBCOMMENT_ALLOW_REVIEWS and \ "star score\t= %s\n\treview title\t= %s" % (cmt_star, cmt_title) or "", 'cmt_body' : cmt_body, 'comment_admin_link' : CFG_SITE_URL + "/"+ CFG_SITE_RECORD +"/" + str(id_bibrec) + '/comments#' + str(cmt_id), 'user_admin_link' : "user_admin_link" #! FIXME } #FIXME to be added to email when websession module is over: #If you wish to ban the user, you can do so via the User Admin Panel %(user_admin_link)s. 
send_email(from_addr, to_addrs, subject, body) return 1 def check_user_can_report(cmt_id, client_ip_address, uid=-1): """ Checks if a user hasn't already reported a comment @param cmt_id: comment id @param client_ip_address: IP => use: str(req.remote_ip) @param uid: user id, as given by invenio.webuser.getUid(req) """ cmt_id = wash_url_argument(cmt_id, 'int') client_ip_address = wash_url_argument(client_ip_address, 'str') uid = wash_url_argument(uid, 'int') query = """SELECT id_cmtRECORDCOMMENT FROM cmtACTIONHISTORY WHERE id_cmtRECORDCOMMENT=%s""" params = (uid,) if uid < 0: query += " AND client_host=inet_aton(%s)" params += (client_ip_address,) else: query += " AND id_user=%s" params += (uid,) res = run_sql(query, params) return (len(res) == 0) def query_get_user_contact_info(uid): """ Get the user contact information @return: tuple (nickname, email, last_login), if none found return () Note: for the moment, if no nickname, will return email address up to the '@' """ query1 = """SELECT nickname, email, DATE_FORMAT(last_login, '%%Y-%%m-%%d %%H:%%i:%%s') FROM user WHERE id=%s""" params1 = (uid,) res1 = run_sql(query1, params1) if res1: return res1[0] else: return () def query_get_user_reports_and_votes(uid): """ Retrieve total number of reports and votes of a particular user @param uid: user id @return: tuple (total_nb_reports, total_nb_votes_yes, total_nb_votes_total) if none found return () """ query1 = """SELECT nb_votes_yes, nb_votes_total, nb_abuse_reports FROM cmtRECORDCOMMENT WHERE id_user=%s""" params1 = (uid,) res1 = run_sql(query1, params1) if len(res1) == 0: return () nb_votes_yes = nb_votes_total = nb_abuse_reports = 0 for cmt_tuple in res1: nb_votes_yes += int(cmt_tuple[0]) nb_votes_total += int(cmt_tuple[1]) nb_abuse_reports += int(cmt_tuple[2]) return (nb_abuse_reports, nb_votes_yes, nb_votes_total) def query_get_comment(comID): """ Get all fields of a comment @param comID: comment id @return: tuple (comID, id_bibrec, id_user, body, date_creation, star_score, nb_votes_yes, nb_votes_total, title, nb_abuse_reports, round_name, restriction) if none found return () """ query1 = """SELECT id, id_bibrec, id_user, body, DATE_FORMAT(date_creation, '%%Y-%%m-%%d %%H:%%i:%%s'), star_score, nb_votes_yes, nb_votes_total, title, nb_abuse_reports, round_name, restriction FROM cmtRECORDCOMMENT WHERE id=%s""" params1 = (comID,) res1 = run_sql(query1, params1) if len(res1)>0: return res1[0] else: return () def query_record_report_this(comID): """ Increment the number of reports for a comment @param comID: comment id @return: tuple (success, new_total_nb_reports_for_this_comment) where success is integer 1 if success, integer 0 if not, -2 if comment does not exist """ #retrieve nb_abuse_reports query1 = "SELECT nb_abuse_reports FROM cmtRECORDCOMMENT WHERE id=%s" params1 = (comID,) res1 = run_sql(query1, params1) if len(res1) == 0: return (-2, 0) #increment and update nb_abuse_reports = int(res1[0][0]) + 1 query2 = "UPDATE cmtRECORDCOMMENT SET nb_abuse_reports=%s WHERE id=%s" params2 = (nb_abuse_reports, comID) res2 = run_sql(query2, params2) return (int(res2), nb_abuse_reports) def query_record_useful_review(comID, value): """ private funciton Adjust the number of useful votes and number of total votes for a comment. 
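perform_request_report() above mails the collection moderators only when the updated abuse count is an exact multiple of CFG_WEBCOMMENT_NB_REPORTS_BEFORE_SEND_EMAIL_TO_ADMIN. With an illustrative threshold of 5 (a stand-in, not the shipped default), the notification goes out on the 5th, 10th, 15th... report and stays silent in between:

    # Illustrative only; 5 stands in for the configured threshold.
    threshold = 5
    for nb_abuse_reports in (1, 4, 5, 9, 10):
        moderators_notified = (nb_abuse_reports % threshold == 0)
        # -> False, False, True, False, True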
@param comID: comment id @param value: +1 or -1 @return: integer 1 if successful, integer 0 if not """ # retrieve nb_useful votes query1 = "SELECT nb_votes_total, nb_votes_yes FROM cmtRECORDCOMMENT WHERE id=%s" params1 = (comID,) res1 = run_sql(query1, params1) if len(res1)==0: return 0 # modify and insert new nb_useful votes nb_votes_yes = int(res1[0][1]) if value >= 1: nb_votes_yes = int(res1[0][1]) + 1 nb_votes_total = int(res1[0][0]) + 1 query2 = "UPDATE cmtRECORDCOMMENT SET nb_votes_total=%s, nb_votes_yes=%s WHERE id=%s" params2 = (nb_votes_total, nb_votes_yes, comID) res2 = run_sql(query2, params2) return int(res2) def query_retrieve_comments_or_remarks(recID, display_order='od', display_since='0000-00-00 00:00:00', ranking=0, limit='all', user_info=None): """ Private function Retrieve tuple of comments or remarks from the database @param recID: record id @param display_order: hh = highest helpful score lh = lowest helpful score hs = highest star score ls = lowest star score od = oldest date nd = newest date @param display_since: datetime, e.g. 0000-00-00 00:00:00 @param ranking: boolean, enabled if reviews, disabled for comments @param limit: number of comments/review to return @return: tuple of comment where comment is tuple (nickname, uid, date_creation, body, status, id) if ranking disabled or tuple (nickname, uid, date_creation, body, status, nb_votes_yes, nb_votes_total, star_score, title, id) Note: for the moment, if no nickname, will return email address up to '@' """ display_since = calculate_start_date(display_since) order_dict = { 'hh' : "cmt.nb_votes_yes/(cmt.nb_votes_total+1) DESC, cmt.date_creation DESC ", 'lh' : "cmt.nb_votes_yes/(cmt.nb_votes_total+1) ASC, cmt.date_creation ASC ", 'ls' : "cmt.star_score ASC, cmt.date_creation DESC ", 'hs' : "cmt.star_score DESC, cmt.date_creation DESC ", 'nd' : "cmt.reply_order_cached_data DESC ", 'od' : "cmt.reply_order_cached_data ASC " } # Ranking only done for comments and when allowed if ranking and recID > 0: try: display_order = order_dict[display_order] except: display_order = order_dict['od'] else: # in case of recID > 0 => external record => no ranking! 
ranking = 0 try: if display_order[-1] == 'd': display_order = order_dict[display_order] else: display_order = order_dict['od'] except: display_order = order_dict['od'] #display_order = order_dict['nd'] query = """SELECT user.nickname, cmt.id_user, DATE_FORMAT(cmt.date_creation, '%%%%Y-%%%%m-%%%%d %%%%H:%%%%i:%%%%s'), cmt.body, cmt.status, cmt.nb_abuse_reports, %(ranking)s cmt.id, cmt.round_name, cmt.restriction, %(reply_to_column)s FROM cmtRECORDCOMMENT cmt LEFT JOIN user ON user.id=cmt.id_user WHERE cmt.id_bibrec=%%s %(ranking_only)s %(display_since)s ORDER BY %(display_order)s """ % {'ranking' : ranking and ' cmt.nb_votes_yes, cmt.nb_votes_total, cmt.star_score, cmt.title, ' or '', 'ranking_only' : ranking and ' AND cmt.star_score>0 ' or ' AND cmt.star_score=0 ', # 'id_bibrec' : recID > 0 and 'cmt.id_bibrec' or 'cmt.id_bibrec_or_bskEXTREC', # 'table' : recID > 0 and 'cmtRECORDCOMMENT' or 'bskRECORDCOMMENT', 'display_since' : display_since == '0000-00-00 00:00:00' and ' ' or 'AND cmt.date_creation>=\'%s\' ' % display_since, 'display_order': display_order, 'reply_to_column': recID > 0 and 'cmt.in_reply_to_id_cmtRECORDCOMMENT' or 'cmt.in_reply_to_id_bskRECORDCOMMENT'} params = (recID,) res = run_sql(query, params) # return res new_limit = limit comments_list = [] for row in res: if ranking: # when dealing with reviews, row[12] holds restriction info: restriction = row[12] else: # when dealing with comments, row[8] holds restriction info: restriction = row[8] if user_info and check_user_can_view_comment(user_info, None, restriction)[0] != 0: # User cannot view comment. Look further continue comments_list.append(row) if limit.isdigit(): new_limit -= 1 if limit < 1: break if comments_list: if limit.isdigit(): return comments_list[:limit] else: return comments_list return () # def get_comment_children(comID): # """ # Returns the list of children (i.e. direct descendants) ordered by time of addition. # @param comID: the ID of the comment for which we want to retrieve children # @type comID: int # @return the list of children # @rtype: list # """ # res = run_sql("SELECT id FROM cmtRECORDCOMMENT WHERE in_reply_to_id_cmtRECORDCOMMENT=%s", (comID,)) # return [row[0] for row in res] # def get_comment_descendants(comID, depth=None): # """ # Returns the list of descendants of the given comment, orderd from # oldest to newest ("top-down"), down to depth specified as parameter. # @param comID: the ID of the comment for which we want to retrieve descendant # @type comID: int # @param depth: the max depth down to which we want to retrieve # descendants. Specify None for no limit, 1 for direct # children only, etc. 
# @return the list of ancestors # @rtype: list(tuple(comment ID, descendants comments IDs)) # """ # if depth == 0: # return (comID, []) # res = run_sql("SELECT id FROM cmtRECORDCOMMENT WHERE in_reply_to_id_cmtRECORDCOMMENT=%s", (comID,)) # if res: # children_comID = [row[0] for row in res] # children_descendants = [] # if depth: # depth -= 1 # children_descendants = [get_comment_descendants(child_comID, depth) for child_comID in children_comID] # return (comID, children_descendants) # else: # return (comID, []) def get_comment_ancestors(comID, depth=None): """ Returns the list of ancestors of the given comment, ordered from oldest to newest ("top-down": direct parent of comID is at last position), up to given depth @param comID: the ID of the comment for which we want to retrieve ancestors @type comID: int @param depth: the maximum of levels up from the given comment we want to retrieve ancestors. None for no limit, 1 for direct parent only, etc. @type depth: int @return the list of ancestors @rtype: list """ if depth == 0: return [] res = run_sql("SELECT in_reply_to_id_cmtRECORDCOMMENT FROM cmtRECORDCOMMENT WHERE id=%s", (comID,)) if res: parent_comID = res[0][0] if parent_comID == 0: return [] parent_ancestors = [] if depth: depth -= 1 parent_ancestors = get_comment_ancestors(parent_comID, depth) parent_ancestors.append(parent_comID) return parent_ancestors else: return [] def get_reply_order_cache_data(comid): """ Prepare a representation of the comment ID given as parameter so that it is suitable for byte ordering in MySQL. """ return "%s%s%s%s" % (chr((comid >> 24) % 256), chr((comid >> 16) % 256), chr((comid >> 8) % 256), chr(comid % 256)) def query_add_comment_or_remark(reviews=0, recID=0, uid=-1, msg="", note="", score=0, priority=0, client_ip_address='', editor_type='textarea', req=None, reply_to=None, attached_files=None): """ Private function Insert a comment/review or remarkinto the database @param recID: record id @param uid: user id @param msg: comment body @param note: comment title @param score: review star score @param priority: remark priority #!FIXME @param editor_type: the kind of editor used to submit the comment: 'textarea', 'ckeditor' @param req: request object. If provided, email notification are sent after we reply to user request. @param reply_to: the id of the comment we are replying to with this inserted comment. 
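get_reply_order_cache_data() above is what makes the single ORDER BY cmt.reply_order_cached_data in query_retrieve_comments_or_remarks() return whole threads depth-first: each comment stores its parent's cached path with its own id packed as four big-endian bytes appended, so plain byte-wise sorting places every reply right after its parent's subtree and keeps siblings in insertion (id) order. A small standalone demonstration with toy ids, not part of the module:

    # Toy model of the cached reply-order path.
    def pack(comid):
        return "%s%s%s%s" % (chr((comid >> 24) % 256), chr((comid >> 16) % 256),
                             chr((comid >> 8) % 256), chr(comid % 256))

    # Comment 1 and 3 are top level; comment 5 replies to comment 1.
    paths = {1: pack(1), 3: pack(3)}
    paths[5] = paths[1] + pack(5)           # parent path + own packed id
    order = sorted(paths, key=paths.get)    # -> [1, 5, 3]: the reply follows 1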
@return: integer >0 representing id if successful, integer 0 if not """ current_date = calculate_start_date('0d') #change utf-8 message into general unicode msg = msg.decode('utf-8') note = note.decode('utf-8') #change general unicode back to utf-8 msg = msg.encode('utf-8') note = note.encode('utf-8') msg_original = msg (restriction, round_name) = get_record_status(recID) if attached_files is None: attached_files = {} if reply_to and CFG_WEBCOMMENT_MAX_COMMENT_THREAD_DEPTH >= 0: # Check that we have not reached max depth comment_ancestors = get_comment_ancestors(reply_to) if len(comment_ancestors) >= CFG_WEBCOMMENT_MAX_COMMENT_THREAD_DEPTH: if CFG_WEBCOMMENT_MAX_COMMENT_THREAD_DEPTH == 0: reply_to = None else: reply_to = comment_ancestors[CFG_WEBCOMMENT_MAX_COMMENT_THREAD_DEPTH - 1] # Inherit restriction and group/round of 'parent' comment = query_get_comment(reply_to) if comment: (round_name, restriction) = comment[10:12] if editor_type == 'ckeditor': # Here we remove the line feeds introduced by CKEditor (they # have no meaning for the user) and replace the HTML line # breaks by linefeeds, so that we are close to an input that # would be done without the CKEditor. That's much better if a # reply to a comment is made with a browser that does not # support CKEditor. msg = msg.replace('\n', '').replace('\r', '') # We clean the quotes that could have been introduced by # CKEditor when clicking the 'quote' button, as well as those # that we have introduced when quoting the original message. # We can however not use directly '>>' chars to quote, as it # will be washed/fixed when calling tidy_html(): double-escape # all > first, and use >> msg = msg.replace('>', '&gt;') msg = re.sub('^\s* \s*<(p|div).*?>', '>>', msg) msg = re.sub('\s*', '', msg) # Then definitely remove any blockquote, whatever it is msg = re.sub('', '
', msg) msg = re.sub('', '
', msg) # Tidy up the HTML msg = tidy_html(msg) # We remove EOL that might have been introduced when tidying msg = msg.replace('\n', '').replace('\r', '') # Now that HTML has been cleaned, unescape > msg = msg.replace('>', '>') msg = msg.replace('&gt;', '>') msg = re.sub('
)', '\n', msg) msg = msg.replace(' ', ' ') # In case additional

or

got inserted, interpret # these as new lines (with a sad trick to do it only once) # (note that it has been deactivated, as it is messing up # indentation with >>) #msg = msg.replace('
<', '\n<') #msg = msg.replace('

<', '

\n<') query = """INSERT INTO cmtRECORDCOMMENT (id_bibrec, id_user, body, date_creation, star_score, nb_votes_total, title, round_name, restriction, in_reply_to_id_cmtRECORDCOMMENT) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""" params = (recID, uid, msg, current_date, score, 0, note, round_name, restriction, reply_to or 0) res = run_sql(query, params) if res: new_comid = int(res) move_attached_files_to_storage(attached_files, recID, new_comid) parent_reply_order = run_sql("""SELECT reply_order_cached_data from cmtRECORDCOMMENT where id=%s""", (reply_to,)) if not parent_reply_order or parent_reply_order[0][0] is None: # This is not a reply, but a first 0-level comment parent_reply_order = '' else: parent_reply_order = parent_reply_order[0][0] - run_sql("""UPDATE cmtRECORDCOMMENT SET reply_order_cached_data=%s WHERE id=%s""", + run_sql("""UPDATE cmtRECORDCOMMENT SET reply_order_cached_data=_binary %s WHERE id=%s""", (parent_reply_order + get_reply_order_cache_data(new_comid), new_comid)) action_code = CFG_WEBCOMMENT_ACTION_CODE[reviews and 'ADD_REVIEW' or 'ADD_COMMENT'] action_time = convert_datestruct_to_datetext(time.localtime()) query2 = """INSERT INTO cmtACTIONHISTORY (id_cmtRECORDCOMMENT, id_bibrec, id_user, client_host, action_time, action_code) VALUES (%s, %s, %s, inet_aton(%s), %s, %s)""" params2 = (res, recID, uid, client_ip_address, action_time, action_code) run_sql(query2, params2) def notify_subscribers_callback(data): """ Define a callback that retrieves subscribed users, and notify them by email. @param data: contains the necessary parameters in a tuple: (recid, uid, comid, msg, note, score, editor_type, reviews) """ recid, uid, comid, msg, note, score, editor_type, reviews = data # Email this comment to 'subscribers' (subscribers_emails1, subscribers_emails2) = \ get_users_subscribed_to_discussion(recid) email_subscribers_about_new_comment(recid, reviews=reviews, emails1=subscribers_emails1, emails2=subscribers_emails2, comID=comid, msg=msg, note=note, score=score, editor_type=editor_type, uid=uid) # Register our callback to notify subscribed people after # having replied to our current user. data = (recID, uid, res, msg, note, score, editor_type, reviews) if req: req.register_cleanup(notify_subscribers_callback, data) else: notify_subscribers_callback(data) return int(res) def move_attached_files_to_storage(attached_files, recID, comid): """ Move the files that were just attached to a new comment to their final location. 
@param attached_files: the mappings of desired filename to attach and path where to find the original file @type attached_files: dict {filename, filepath} @param recID: the record ID to which we attach the files @param comid: the comment ID to which we attach the files """ for filename, filepath in attached_files.iteritems(): dest_dir = os.path.join(CFG_PREFIX, 'var', 'data', 'comments', str(recID), str(comid)) try: os.makedirs(dest_dir) except: # Dir most probably already existed pass shutil.move(filepath, os.path.join(dest_dir, filename)) def get_attached_files(recid, comid): """ Returns a list with tuples (filename, filepath, fileurl) @param recid: the recid to which the comment belong @param comid: the commment id for which we want to retrieve files """ base_dir = os.path.join(CFG_PREFIX, 'var', 'data', 'comments', str(recid), str(comid)) if os.path.isdir(base_dir): filenames = os.listdir(base_dir) return [(filename, os.path.join(CFG_PREFIX, 'var', 'data', 'comments', str(recid), str(comid), filename), CFG_SITE_URL + '/'+ CFG_SITE_RECORD +'/' + str(recid) + '/comments/attachments/get/' + str(comid) + '/' + filename) \ for filename in filenames] else: return [] def subscribe_user_to_discussion(recID, uid): """ Subscribe a user to a discussion, so the she receives by emails all new new comments for this record. @param recID: record ID corresponding to the discussion we want to subscribe the user @param uid: user id """ query = """INSERT INTO cmtSUBSCRIPTION (id_bibrec, id_user, creation_time) VALUES (%s, %s, %s)""" params = (recID, uid, convert_datestruct_to_datetext(time.localtime())) try: run_sql(query, params) except: return 0 return 1 def unsubscribe_user_from_discussion(recID, uid): """ Unsubscribe users from a discussion. @param recID: record ID corresponding to the discussion we want to unsubscribe the user @param uid: user id @return 1 if successful, 0 if not """ query = """DELETE FROM cmtSUBSCRIPTION WHERE id_bibrec=%s AND id_user=%s""" params = (recID, uid) try: res = run_sql(query, params) except: return 0 if res > 0: return 1 return 0 def get_user_subscription_to_discussion(recID, uid): """ Returns the type of subscription for the given user to this discussion. This does not check authorizations (for eg. if user was subscribed, but is suddenly no longer authorized). @param recID: record ID @param uid: user id @return: - 0 if user is not subscribed to discussion - 1 if user is subscribed, and is allowed to unsubscribe - 2 if user is subscribed, but cannot unsubscribe """ user_email = get_email(uid) (emails1, emails2) = get_users_subscribed_to_discussion(recID, check_authorizations=False) if user_email in emails1: return 1 elif user_email in emails2: return 2 else: return 0 def get_users_subscribed_to_discussion(recID, check_authorizations=True): """ Returns the lists of users subscribed to a given discussion. Two lists are returned: the first one is the list of emails for users who can unsubscribe from the discussion, the second list contains the emails of users who cannot unsubscribe (for eg. author of the document, etc). Users appear in only one list. If a user has manually subscribed to a discussion AND is an automatic recipients for updates, it will only appear in the second list. 
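move_attached_files_to_storage() and get_attached_files() above agree on a simple per-comment directory layout under CFG_PREFIX and a matching download URL. With made-up values (recid 123, comid 456, CFG_PREFIX '/opt/invenio', CFG_SITE_URL 'http://example.org', CFG_SITE_RECORD 'record'), an attachment named report.pdf would be:

    #   stored at:  /opt/invenio/var/data/comments/123/456/report.pdf
    #   served at:  http://example.org/record/123/comments/attachments/get/456/report.pdf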
@param recID: record ID for which we want to retrieve subscribed users @param check_authorizations: if True, check again if users are authorized to view comment @return tuple (emails1, emails2) """ subscribers_emails = {} # Get users that have subscribed to this discussion query = """SELECT id_user FROM cmtSUBSCRIPTION WHERE id_bibrec=%s""" params = (recID,) res = run_sql(query, params) for row in res: uid = row[0] if check_authorizations: user_info = collect_user_info(uid) (auth_code, auth_msg) = check_user_can_view_comments(user_info, recID) else: # Don't check and grant access auth_code = False if auth_code: # User is no longer authorized to view comments. # Delete subscription unsubscribe_user_from_discussion(recID, uid) else: email = get_email(uid) if '@' in email: subscribers_emails[email] = True # Get users automatically subscribed, based on the record metadata collections_with_auto_replies = CFG_WEBCOMMENT_EMAIL_REPLIES_TO.keys() for collection in collections_with_auto_replies: if (get_colID(collection) is not None) and \ (recID in get_collection_reclist(collection)): fields = CFG_WEBCOMMENT_EMAIL_REPLIES_TO[collection] for field in fields: emails = get_fieldvalues(recID, field) for email in emails: if not '@' in email: # Is a group: add domain name subscribers_emails[email + '@' + \ CFG_SITE_SUPPORT_EMAIL.split('@')[1]] = False else: subscribers_emails[email] = False return ([email for email, can_unsubscribe_p \ in subscribers_emails.iteritems() if can_unsubscribe_p], [email for email, can_unsubscribe_p \ in subscribers_emails.iteritems() if not can_unsubscribe_p] ) def email_subscribers_about_new_comment(recID, reviews, emails1, emails2, comID, msg="", note="", score=0, editor_type='textarea', ln=CFG_SITE_LANG, uid=-1): """ Notify subscribers that a new comment was posted. FIXME: consider recipient preference to send email in correct language. @param recID: record id @param emails1: list of emails for users who can unsubscribe from discussion @param emails2: list of emails for users who cannot unsubscribe from discussion @param comID: the comment id @param msg: comment body @param note: comment title @param score: review star score @param editor_type: the kind of editor used to submit the comment: 'textarea', 'ckeditor' @rtype: bool @return: True if email was sent okay, False if it was not. 
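In get_users_subscribed_to_discussion() above, a metadata value without an '@' is taken to be a group or list name and gets the domain of CFG_SITE_SUPPORT_EMAIL appended before it is added to the "cannot unsubscribe" list. A sketch with purely illustrative values (the field content and support address are invented):

    # Illustrative only; mirrors the branch in get_users_subscribed_to_discussion().
    CFG_SITE_SUPPORT_EMAIL = 'support@example.org'   # made-up value
    email = 'atlas-editors'                          # value read from the record
    if not '@' in email:
        email = email + '@' + CFG_SITE_SUPPORT_EMAIL.split('@')[1]
    # email == 'atlas-editors@example.org', and it ends up in the second
    # element of the returned tuple (recipients who cannot unsubscribe).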
""" _ = gettext_set_language(ln) if not emails1 and not emails2: return 0 # Get title titles = get_fieldvalues(recID, "245__a") if not titles: # usual title not found, try conference title: titles = get_fieldvalues(recID, "111__a") title = '' if titles: title = titles[0] else: title = _("Record %i") % recID # Get report number report_numbers = get_fieldvalues(recID, "037__a") if not report_numbers: report_numbers = get_fieldvalues(recID, "088__a") if not report_numbers: report_numbers = get_fieldvalues(recID, "021__a") # Prepare email subject and body if reviews: email_subject = _('%(report_number)s"%(title)s" has been reviewed') % \ {'report_number': report_numbers and ('[' + report_numbers[0] + '] ') or '', 'title': title} else: email_subject = _('%(report_number)s"%(title)s" has been commented') % \ {'report_number': report_numbers and ('[' + report_numbers[0] + '] ') or '', 'title': title} washer = EmailWasher() msg = washer.wash(msg) msg = msg.replace('>>', '>') email_content = msg if note: email_content = note + email_content # Send emails to people who can unsubscribe email_header = webcomment_templates.tmpl_email_new_comment_header(recID, title, reviews, comID, report_numbers, can_unsubscribe=True, ln=ln, uid=uid) email_footer = webcomment_templates.tmpl_email_new_comment_footer(recID, title, reviews, comID, report_numbers, can_unsubscribe=True, ln=ln) res1 = True if emails1: res1 = send_email(fromaddr=CFG_WEBCOMMENT_ALERT_ENGINE_EMAIL, toaddr=emails1, subject=email_subject, content=email_content, header=email_header, footer=email_footer, ln=ln) # Then send email to people who have been automatically # subscribed to the discussion (they cannot unsubscribe) email_header = webcomment_templates.tmpl_email_new_comment_header(recID, title, reviews, comID, report_numbers, can_unsubscribe=False, ln=ln, uid=uid) email_footer = webcomment_templates.tmpl_email_new_comment_footer(recID, title, reviews, comID, report_numbers, can_unsubscribe=False, ln=ln) res2 = True if emails2: res2 = send_email(fromaddr=CFG_WEBCOMMENT_ALERT_ENGINE_EMAIL, toaddr=emails2, subject=email_subject, content=email_content, header=email_header, footer=email_footer, ln=ln) return res1 and res2 def get_record_status(recid): """ Returns the current status of the record, i.e. current restriction to apply for newly submitted comments, and current commenting round. The restriction to apply can be found in the record metadata, in field(s) defined by config CFG_WEBCOMMENT_RESTRICTION_DATAFIELD. The restriction is empty string "" in cases where the restriction has not explicitely been set, even if the record itself is restricted. 
@param recid: the record id @type recid: int @return tuple(restriction, round_name), where 'restriction' is empty string when no restriction applies @rtype (string, int) """ collections_with_rounds = CFG_WEBCOMMENT_ROUND_DATAFIELD.keys() commenting_round = "" for collection in collections_with_rounds: # Find the first collection defines rounds field for this # record if get_colID(collection) is not None and \ (recid in get_collection_reclist(collection)): commenting_rounds = get_fieldvalues(recid, CFG_WEBCOMMENT_ROUND_DATAFIELD.get(collection, "")) if commenting_rounds: commenting_round = commenting_rounds[0] break collections_with_restrictions = CFG_WEBCOMMENT_RESTRICTION_DATAFIELD.keys() restriction = "" for collection in collections_with_restrictions: # Find the first collection that defines restriction field for # this record if get_colID(collection) is not None and \ recid in get_collection_reclist(collection): restrictions = get_fieldvalues(recid, CFG_WEBCOMMENT_RESTRICTION_DATAFIELD.get(collection, "")) if restrictions: restriction = restrictions[0] break return (restriction, commenting_round) def calculate_start_date(display_since): """ Private function Returns the datetime of display_since argument in MYSQL datetime format calculated according to the local time. @param display_since: = all= no filtering nd = n days ago nw = n weeks ago nm = n months ago ny = n years ago where n is a single digit number @return: string of wanted datetime. If 'all' given as argument, will return datetext_default datetext_default is defined in miscutils/lib/dateutils and equals 0000-00-00 00:00:00 => MySQL format If bad arguement given, will return datetext_default If library 'dateutil' is not found return datetext_default and register exception. """ time_types = {'d':0, 'w':0, 'm':0, 'y':0} today = datetime.today() try: nb = int(display_since[:-1]) except: return datetext_default if display_since in [None, 'all']: return datetext_default if str(display_since[-1]) in time_types: time_type = str(display_since[-1]) else: return datetext_default # year if time_type == 'y': if (int(display_since[:-1]) > today.year - 1) or (int(display_since[:-1]) < 1): # 1 < nb years < 2008 return datetext_default else: final_nb_year = today.year - nb yesterday = today.replace(year=final_nb_year) # month elif time_type == 'm': try: from dateutil.relativedelta import relativedelta except ImportError: # The dateutil library is only recommended: if not # available, then send warning about this. register_exception(alert_admin=True) return datetext_default # obtain only the date: yyyy-mm-dd date_today = datetime.now().date() final_date = date_today - relativedelta(months=nb) yesterday = today.replace(year=final_date.year, month=final_date.month, day=final_date.day) # week elif time_type == 'w': delta = timedelta(weeks=nb) yesterday = today - delta # day elif time_type == 'd': delta = timedelta(days=nb) yesterday = today - delta return yesterday.strftime("%Y-%m-%d %H:%M:%S") def get_first_comments_or_remarks(recID=-1, ln=CFG_SITE_LANG, nb_comments='all', nb_reviews='all', voted=-1, reported=-1, user_info=None, show_reviews=False): """ Gets nb number comments/reviews or remarks. 
In the case of comments, will get both comments and reviews Comments and remarks sorted by most recent date, reviews sorted by highest helpful score @param recID: record id @param ln: language @param nb_comments: number of comment or remarks to get @param nb_reviews: number of reviews or remarks to get @param voted: 1 if user has voted for a remark @param reported: 1 if user has reported a comment or review @return: if comment, tuple (comments, reviews) both being html of first nb comments/reviews if remark, tuple (remakrs, None) """ _ = gettext_set_language(ln) warnings = [] voted = wash_url_argument(voted, 'int') reported = wash_url_argument(reported, 'int') ## check recID argument if type(recID) is not int: return () if recID >= 1: #comment or review. NB: suppressed reference to basket (handled in webbasket) if CFG_WEBCOMMENT_ALLOW_REVIEWS: res_reviews = query_retrieve_comments_or_remarks(recID=recID, display_order="hh", ranking=1, limit=nb_comments, user_info=user_info) nb_res_reviews = len(res_reviews) ## check nb argument if type(nb_reviews) is int and nb_reviews < len(res_reviews): first_res_reviews = res_reviews[:nb_reviews] else: first_res_reviews = res_reviews if CFG_WEBCOMMENT_ALLOW_COMMENTS: res_comments = query_retrieve_comments_or_remarks(recID=recID, display_order="od", ranking=0, limit=nb_reviews, user_info=user_info) nb_res_comments = len(res_comments) ## check nb argument if type(nb_comments) is int and nb_comments < len(res_comments): first_res_comments = res_comments[:nb_comments] else: first_res_comments = res_comments else: body = webcomment_templates.tmpl_error( _('%s is an invalid record ID') % recID, ln) return body # comment if recID >= 1: comments = reviews = "" if reported > 0: warnings.append((_('Your feedback has been recorded, many ' 'thanks.'), 'green')) elif reported == 0: warnings.append((_('Your feedback could not be recorded, please' ' try again.'), '')) # normal comments if CFG_WEBCOMMENT_ALLOW_COMMENTS: grouped_comments = group_comments_by_round(first_res_comments, ranking=0) comments = webcomment_templates.tmpl_get_first_comments_without_ranking(recID, ln, grouped_comments, nb_res_comments, warnings) if show_reviews: # ranked comments if CFG_WEBCOMMENT_ALLOW_REVIEWS: # calculate average score avg_score = calculate_avg_score(res_reviews) if voted > 0: warnings.append((_('Your feedback has been recorded, ' 'many thanks.'), 'green')) elif voted == 0: warnings.append((_('Your feedback could not be recorded, ' 'please try again.'), '')) grouped_reviews = group_comments_by_round(first_res_reviews, ranking=0) reviews = webcomment_templates.tmpl_get_first_comments_with_ranking(recID, ln, grouped_reviews, nb_res_reviews, avg_score, warnings) return (comments, reviews) # remark else: return(webcomment_templates.tmpl_get_first_remarks(first_res_comments, ln, nb_res_comments), None) def group_comments_by_round(comments, ranking=0): """ Group comments by the round to which they belong """ comment_rounds = {} ordered_comment_round_names = [] for comment in comments: comment_round_name = ranking and comment[11] or comment[7] if not comment_rounds.has_key(comment_round_name): comment_rounds[comment_round_name] = [] ordered_comment_round_names.append(comment_round_name) comment_rounds[comment_round_name].append(comment) return [(comment_round_name, comment_rounds[comment_round_name]) \ for comment_round_name in ordered_comment_round_names] def calculate_avg_score(res): """ private function Calculate the avg score of reviews present in res @param res: tuple of tuple 
returned from query_retrieve_comments_or_remarks @return: a float of the average score rounded to the closest 0.5 """ c_star_score = 6 avg_score = 0.0 nb_reviews = 0 for comment in res: if comment[c_star_score] > 0: avg_score += comment[c_star_score] nb_reviews += 1 if nb_reviews == 0: return 0.0 avg_score = avg_score / nb_reviews avg_score_unit = avg_score - math.floor(avg_score) if avg_score_unit < 0.25: avg_score = math.floor(avg_score) elif avg_score_unit > 0.75: avg_score = math.floor(avg_score) + 1 else: avg_score = math.floor(avg_score) + 0.5 if avg_score > 5: avg_score = 5.0 return avg_score def perform_request_add_comment_or_remark(recID=0, uid=-1, action='DISPLAY', ln=CFG_SITE_LANG, msg=None, score=None, note=None, priority=None, reviews=0, comID=0, client_ip_address=None, editor_type='textarea', can_attach_files=False, subscribe=False, req=None, attached_files=None, warnings=None): """ Add a comment/review or remark @param recID: record id @param uid: user id @param action: 'DISPLAY' to display add form 'SUBMIT' to submit comment once form is filled 'REPLY' to reply to an existing comment @param ln: language @param msg: the body of the comment/review or remark @param score: star score of the review @param note: title of the review @param priority: priority of remark (int) @param reviews: boolean, if enabled will add a review, if disabled will add a comment @param comID: if replying, this is the comment id of the comment we are replying to @param editor_type: the kind of editor/input used for the comment: 'textarea', 'ckeditor' @param can_attach_files: if user can attach files to comments or not @param subscribe: if True, subscribe user to receive new comments by email @param req: request object. Used to register callback to send email notification @param attached_files: newly attached files to this comment, mapping filename to filepath @type attached_files: dict @param warnings: list of warning tuples (warning_text, warning_color) that should be considered @return: - html add form if action is display or reply - html successful added form if action is submit """ _ = gettext_set_language(ln) if warnings is None: warnings = [] actions = ['DISPLAY', 'REPLY', 'SUBMIT'] _ = gettext_set_language(ln) ## check arguments check_recID_is_in_range(recID, warnings, ln) if uid <= 0: body = webcomment_templates.tmpl_error( _('%s is an invalid user ID.') % uid, ln) return body if attached_files is None: attached_files = {} user_contact_info = query_get_user_contact_info(uid) nickname = '' if user_contact_info: if user_contact_info[0]: nickname = user_contact_info[0] # show the form if action == 'DISPLAY': if reviews and CFG_WEBCOMMENT_ALLOW_REVIEWS: return webcomment_templates.tmpl_add_comment_form_with_ranking(recID, uid, nickname, ln, msg, score, note, warnings, can_attach_files=can_attach_files) elif not reviews and CFG_WEBCOMMENT_ALLOW_COMMENTS: return webcomment_templates.tmpl_add_comment_form(recID, uid, nickname, ln, msg, warnings, can_attach_files=can_attach_files) else: body = webcomment_templates.tmpl_error( _('Comments on records have been disallowed by the ' 'administrator.'), ln) return body elif action == 'REPLY': if reviews and CFG_WEBCOMMENT_ALLOW_REVIEWS: body = webcomment_templates.tmpl_error( _('Cannot reply to a review.'), ln) return body elif not reviews and CFG_WEBCOMMENT_ALLOW_COMMENTS: textual_msg = msg if comID > 0: comment = query_get_comment(comID) if comment: user_info = get_user_info(comment[2]) if user_info: date_creation = 
convert_datetext_to_dategui(str(comment[4]))
# Build two msg: one mostly textual, the other one with HTML markup, for the CkEditor.
msg = _("%(x_name)s wrote on %(x_date)s:")% {'x_name': user_info[2], 'x_date': date_creation}
textual_msg = msg
# 1 For CkEditor input
msg += '\n\n'
msg += comment[3]
msg = email_quote_txt(text=msg)
# Now that we have a text-quoted version, transform into
# something that CkEditor likes, using <blockquote> that
# do still enable users to insert comments inline
msg = email_quoted_txt2html(text=msg, indent_html=('<blockquote><div>', '&nbsp;&nbsp;</div></blockquote>'), linebreak_html="&nbsp;<br/>", indent_block=False)
# Add some space for users to easily add text
# around the quoted message
msg = '<br/>' + msg + '<br/>
' # Due to how things are done, we need to # escape the whole msg again for the editor msg = cgi.escape(msg) # 2 For textarea input textual_msg += "\n\n" textual_msg += comment[3] textual_msg = email_quote_txt(text=textual_msg) return webcomment_templates.tmpl_add_comment_form(recID, uid, nickname, ln, msg, warnings, textual_msg, can_attach_files=can_attach_files, reply_to=comID) else: body = webcomment_templates.tmpl_error( _('Comments on records have been disallowed by the ' 'administrator.'), ln) return body # check before submitting form elif action == 'SUBMIT': if reviews and CFG_WEBCOMMENT_ALLOW_REVIEWS: if note.strip() in ["", "None"] and not CFG_WEBCOMMENT_ALLOW_SHORT_REVIEWS: warnings.append((_('You must enter a title.'), '')) if score == 0 or score > 5: warnings.append((_('You must choose a score.'), '')) if msg.strip() in ["", "None"] and not CFG_WEBCOMMENT_ALLOW_SHORT_REVIEWS: warnings.append((_('You must enter a text.'), '')) # if no warnings, submit if len(warnings) == 0: if reviews: if check_user_can_review(recID, client_ip_address, uid): success = query_add_comment_or_remark(reviews, recID=recID, uid=uid, msg=msg, note=note, score=score, priority=0, client_ip_address=client_ip_address, editor_type=editor_type, req=req, reply_to=comID) else: warnings.append((_('You already wrote a review for ' 'this record.'), '')) success = 1 else: if check_user_can_comment(recID, client_ip_address, uid): success = query_add_comment_or_remark(reviews, recID=recID, uid=uid, msg=msg, note=note, score=score, priority=0, client_ip_address=client_ip_address, editor_type=editor_type, req=req, reply_to=comID, attached_files=attached_files) if success > 0 and subscribe: subscribe_user_to_discussion(recID, uid) else: warnings.append((_('You already posted a comment ' 'short ago. Please retry later.'), '')) success = 1 if success > 0: if CFG_WEBCOMMENT_ADMIN_NOTIFICATION_LEVEL > 0: notify_admin_of_new_comment(comID=success) return webcomment_templates.tmpl_add_comment_successful(recID, ln, reviews, warnings, success) else: register_exception(req=req) body = webcomment_templates.tmpl_error( _('Failed to insert your comment to the database.' 
' Please try again.'), ln) return body # if are warnings or if inserting comment failed, show user where warnings are if reviews and CFG_WEBCOMMENT_ALLOW_REVIEWS: return webcomment_templates.tmpl_add_comment_form_with_ranking(recID, uid, nickname, ln, msg, score, note, warnings, can_attach_files=can_attach_files) else: return webcomment_templates.tmpl_add_comment_form(recID, uid, nickname, ln, msg, warnings, can_attach_files=can_attach_files) # unknown action send to display else: warnings.append((_('Unknown action --> showing you the default ' 'add comment form.'), '')) if reviews and CFG_WEBCOMMENT_ALLOW_REVIEWS: return webcomment_templates.tmpl_add_comment_form_with_ranking(recID, uid, ln, msg, score, note, warnings, can_attach_files=can_attach_files) else: return webcomment_templates.tmpl_add_comment_form(recID, uid, ln, msg, warnings, can_attach_files=can_attach_files) return '' def notify_admin_of_new_comment(comID): """ Sends an email to the admin with details regarding comment with ID = comID """ comment = query_get_comment(comID) if len(comment) > 0: (comID2, id_bibrec, id_user, body, date_creation, star_score, nb_votes_yes, nb_votes_total, title, nb_abuse_reports, round_name, restriction) = comment else: return user_info = query_get_user_contact_info(id_user) if len(user_info) > 0: (nickname, email, last_login) = user_info if not len(nickname) > 0: nickname = email.split('@')[0] else: nickname = email = last_login = "ERROR: Could not retrieve" review_stuff = ''' Star score = %s Title = %s''' % (star_score, title) washer = EmailWasher() try: body = washer.wash(body) except: body = cgi.escape(body) record_info = webcomment_templates.tmpl_email_new_comment_admin(id_bibrec) out = ''' The following %(comment_or_review)s has just been posted (%(date)s). AUTHOR: Nickname = %(nickname)s Email = %(email)s User ID = %(uid)s RECORD CONCERNED: Record ID = %(recID)s URL = <%(siteurl)s/%(CFG_SITE_RECORD)s/%(recID)s/%(comments_or_reviews)s/> %(record_details)s %(comment_or_review_caps)s: %(comment_or_review)s ID = %(comID)s %(review_stuff)s Body = <---------------> %(body)s <---------------> ADMIN OPTIONS: To moderate the %(comment_or_review)s go to %(siteurl)s/%(CFG_SITE_RECORD)s/%(recID)s/%(comments_or_reviews)s/display?%(arguments)s ''' % \ { 'comment_or_review' : star_score > 0 and 'review' or 'comment', 'comment_or_review_caps': star_score > 0 and 'REVIEW' or 'COMMENT', 'comments_or_reviews' : star_score > 0 and 'reviews' or 'comments', 'date' : date_creation, 'nickname' : nickname, 'email' : email, 'uid' : id_user, 'recID' : id_bibrec, 'record_details' : record_info, 'comID' : comID2, 'review_stuff' : star_score > 0 and review_stuff or "", 'body' : body.replace('
','\n'), 'siteurl' : CFG_SITE_URL, 'CFG_SITE_RECORD' : CFG_SITE_RECORD, 'arguments' : 'ln=en&do=od#%s' % comID } from_addr = '%s WebComment <%s>' % (CFG_SITE_NAME, CFG_WEBALERT_ALERT_ENGINE_EMAIL) comment_collection = get_comment_collection(comID) to_addrs = get_collection_moderators(comment_collection) rec_collection = guess_primary_collection_of_a_record(id_bibrec) report_nums = get_fieldvalues(id_bibrec, "037__a") report_nums += get_fieldvalues(id_bibrec, "088__a") report_nums = ', '.join(report_nums) subject = "A new comment/review has just been posted [%s|%s]" % (rec_collection, report_nums) send_email(from_addr, to_addrs, subject, out) def check_recID_is_in_range(recID, warnings=[], ln=CFG_SITE_LANG): """ Check that recID is >= 0 @param recID: record id @param warnings: list of warning tuples (warning_text, warning_color) @return: tuple (boolean, html) where boolean (1=true, 0=false) and html is the body of the page to display if there was a problem """ _ = gettext_set_language(ln) try: recID = int(recID) except: pass if type(recID) is int: if recID > 0: from invenio.search_engine import record_exists success = record_exists(recID) if success == 1: return (1, "") else: if success == -1: status = 'deleted' warning_message = _('The record has been deleted.') else: status = 'inexistant' warning_message = _( 'Record ID %s does not exist in the database.') % recID warnings.append((warning_message, '')) return (0, webcomment_templates.tmpl_record_not_found( status=status, recID=recID, ln=ln)) elif recID == 0: warnings.append((_('No record ID was given.'), '')) return (0, webcomment_templates.tmpl_record_not_found( status='missing', recID=recID, ln=ln)) else: warnings.append((_('Record ID %s is an invalid ID.') % recID, '')) return (0, webcomment_templates.tmpl_record_not_found( status='invalid', recID=recID, ln=ln)) else: warnings.append((_('Record ID %s is not a number.') % recID, '')) return (0, webcomment_templates.tmpl_record_not_found( status='nan', recID=recID, ln=ln)) def check_int_arg_is_in_range(value, name, gte_value, lte_value=None): """ Check that variable with name 'name' >= gte_value & optionally <= lte_value @param value: variable value @param name: variable name @param errors: list of error tuples (error_id, value) @param gte_value: greater than or equal to value @param lte_value: less than or equal to value @return: boolean (1=true, 0=false) """ if type(value) is not int: body = webcomment_templates.tmpl_error('%s is not a number.' % value) return body if value < gte_value: body = webcomment_templates.tmpl_error('%s invalid argument.' % value) return body if lte_value: if value > lte_value: body = webcomment_templates.tmpl_error( '%s invalid argument.' % value) return body return 1 def get_mini_reviews(recid, ln=CFG_SITE_LANG): """ Returns the web controls to add reviews to a record from the detailed record pages mini-panel. @param recid: the id of the displayed record @param ln: the user's language """ if CFG_WEBCOMMENT_ALLOW_SHORT_REVIEWS: action = 'SUBMIT' else: action = 'DISPLAY' reviews = query_retrieve_comments_or_remarks(recid, ranking=1) return webcomment_templates.tmpl_mini_review(recid, ln, action=action, avg_score=calculate_avg_score(reviews), nb_comments_total=len(reviews)) def check_user_can_view_comments(user_info, recid): """Check if the user is authorized to view comments for given recid. 
Returns the same type as acc_authorize_action """ # Check user can view the record itself first (auth_code, auth_msg) = check_user_can_view_record(user_info, recid) if auth_code: return (auth_code, auth_msg) # Check if user can view the comments ## But first can we find an authorization for this case action, ## for this collection? record_primary_collection = guess_primary_collection_of_a_record(recid) return acc_authorize_action(user_info, 'viewcomment', authorized_if_no_roles=True, collection=record_primary_collection) def check_user_can_view_comment(user_info, comid, restriction=None): """Check if the user is authorized to view a particular comment, given the comment restriction. Note that this function does not check if the record itself is restricted to the user, which would mean that the user should not see the comment. You can omit 'comid' if you already know the 'restriction' @param user_info: the user info object @param comid: the comment id of that we want to check @param restriction: the restriction applied to given comment (if known. Otherwise retrieved automatically) @return: the same type as acc_authorize_action """ if restriction is None: comment = query_get_comment(comid) if comment: restriction = comment[11] else: return (1, 'Comment %i does not exist' % comid) if restriction == "": return (0, '') return acc_authorize_action(user_info, 'viewrestrcomment', status=restriction) def check_user_can_send_comments(user_info, recid): """Check if the user is authorized to comment the given recid. This function does not check that user can view the record or view the comments Returns the same type as acc_authorize_action """ ## First can we find an authorization for this case, action + collection record_primary_collection = guess_primary_collection_of_a_record(recid) return acc_authorize_action(user_info, 'sendcomment', authorized_if_no_roles=True, collection=record_primary_collection) def check_comment_belongs_to_record(comid, recid): """ Return True if the comment is indeed part of given record (even if comment or/and record have been "deleted"). Else return False. @param comid: the id of the comment to check membership @param recid: the recid of the record we want to check if comment belongs to """ query = """SELECT id_bibrec from cmtRECORDCOMMENT WHERE id=%s""" params = (comid,) res = run_sql(query, params) if res and res[0][0] == recid: return True return False def check_user_can_attach_file_to_comments(user_info, recid): """Check if the user is authorized to attach a file to comments for given recid. This function does not check that user can view the comments or send comments. Returns the same type as acc_authorize_action """ ## First can we find an authorization for this case action, for ## this collection? record_primary_collection = guess_primary_collection_of_a_record(recid) return acc_authorize_action(user_info, 'attachcommentfile', authorized_if_no_roles=False, collection=record_primary_collection) def toggle_comment_visibility(uid, comid, collapse, recid): """ Toggle the visibility of the given comment (collapse) for the given user. 
Return the new visibility @param uid: the user id for which the change applies @param comid: the comment id to close/open @param collapse: if the comment is to be closed (1) or opened (0) @param recid: the record id to which the comment belongs @return: if the comment is visible or not after the update """ # We rely on the client to tell if comment should be collapsed or # developed, to ensure consistency between our internal state and # client state. Even if not strictly necessary, we store the # record ID for quicker retrieval of the collapsed comments of a # given discussion page. To prevent unnecessary population of the # table, only one distinct tuple (record ID, comment ID, user ID) # can be inserted (due to table definition). For the same purpose # we also check that comment to collapse exists, and corresponds # to an existing record: we cannot rely on the recid found as part # of the URL, as no former check is done. This rule is not applied # when deleting an entry, as in the worst case no line would be # removed. For optimized retrieval of row to delete, the id_bibrec # column is used, though not strictly necessary. if collapse: query = """SELECT id_bibrec from cmtRECORDCOMMENT WHERE id=%s""" params = (comid,) res = run_sql(query, params) if res: query = """INSERT IGNORE INTO cmtCOLLAPSED (id_bibrec, id_cmtRECORDCOMMENT, id_user) VALUES (%s, %s, %s)""" params = (res[0][0], comid, uid) run_sql(query, params) return True else: query = """DELETE FROM cmtCOLLAPSED WHERE id_cmtRECORDCOMMENT=%s and id_user=%s and id_bibrec=%s""" params = (comid, uid, recid) run_sql(query, params) return False def get_user_collapsed_comments_for_record(uid, recid): """ Get the comments collapsed for given user on given recid page """ # Collapsed state is not an attribute of cmtRECORDCOMMENT table # (vary per user) so it cannot be found when querying for the # comment. We must therefore provide a efficient way to retrieve # the collapsed state for a given discussion page and user. query = """SELECT id_cmtRECORDCOMMENT from cmtCOLLAPSED WHERE id_user=%s and id_bibrec=%s""" params = (uid, recid) return [res[0] for res in run_sql(query, params)] def is_comment_deleted(comid): """ Return True of the comment is deleted. Else False @param comid: ID of comment to check """ query = "SELECT status from cmtRECORDCOMMENT WHERE id=%s" params = (comid,) res = run_sql(query, params) if res and res[0][0] != 'ok': return True return False def perform_display_your_comments(user_info, page_number=1, selected_order_by_option="lcf", selected_display_number_option="all", selected_display_format_option="rc", ln=CFG_SITE_LANG): """ Display all comments submitted by the user. @TODO: support reviews too @param user_info: standard user info object. @param comments: ordered list of tuples (id_bibrec, comid, date_creation, body, status, in_reply_to_id_cmtRECORDCOMMENT) @param page_number: page on which the user is. @type page_number: integer @param selected_order_by_option: seleccted ordering option. Can be one of: - ocf: Oldest comment first - lcf: Latest comment first - grof: Group by record, oldest commented first - grlf: Group by record, latest commented first @type selected_order_by_option: string @param selected_display_number_option: number of results to show per page. Can be a string-digit or 'all'. @type selected_display_number_option: string @param selected_display_format_option: how to show records. 
Can be one of: - rc: Records and comments - ro: Records only - co: Comments only @type selected_display_format_option: string @ln: language @type ln: string """ query_params = "" nb_total_pages = 0 if selected_display_format_option in ('rc', 'co'): nb_total_results = run_sql("SELECT count(id) from cmtRECORDCOMMENT WHERE id_user=%s AND star_score = 0", \ (user_info['uid'], ))[0][0] else: if selected_order_by_option in ('grlf', 'grof'): nb_total_results = run_sql("SELECT count(distinct(id_bibrec)) from cmtRECORDCOMMENT WHERE id_user=%s AND star_score = 0", \ (user_info['uid'], ))[0][0] else: nb_total_results = run_sql("SELECT count(id_bibrec) from cmtRECORDCOMMENT WHERE id_user=%s AND star_score = 0", \ (user_info['uid'], ))[0][0] if page_number < 1: page_number = 1 if selected_display_number_option != 'all' and \ not selected_display_number_option.isdigit(): # must be some garbage selected_display_number_option = 'all' query = '' if selected_order_by_option == "lcf": query_params += " ORDER BY date_creation DESC" elif selected_order_by_option == "ocf": query_params += " ORDER BY date_creation ASC" elif selected_order_by_option == "grlf": query = "SELECT cmt.id_bibrec, cmt.id, cmt.date_creation, cmt.body, cmt.status, cmt.in_reply_to_id_cmtRECORDCOMMENT FROM cmtRECORDCOMMENT as cmt left join (SELECT max(date_creation) as maxdatecreation, id_bibrec FROM cmtRECORDCOMMENT WHERE id_user=%s AND star_score = 0 GROUP BY id_bibrec) as grp on cmt.id_bibrec = grp.id_bibrec WHERE id_user=%s AND star_score = 0 ORDER BY grp.maxdatecreation DESC, cmt.date_creation DESC" elif selected_order_by_option == "grof": query = "SELECT cmt.id_bibrec, cmt.id, cmt.date_creation, cmt.body, cmt.status, cmt.in_reply_to_id_cmtRECORDCOMMENT FROM cmtRECORDCOMMENT as cmt left join (SELECT min(date_creation) as mindatecreation, id_bibrec FROM cmtRECORDCOMMENT WHERE id_user=%s AND star_score = 0 GROUP BY id_bibrec) as grp on cmt.id_bibrec = grp.id_bibrec WHERE id_user=%s AND star_score = 0 ORDER BY grp.mindatecreation ASC" if selected_display_number_option.isdigit(): selected_display_number_option_as_int = int(selected_display_number_option) if selected_display_number_option_as_int < 5: selected_display_number_option_as_int = 5 selected_display_number_option = str(selected_display_number_option_as_int) from_index = (page_number - 1) * int(selected_display_number_option) query_params += ' LIMIT ' + \ str(from_index) + \ ',' + \ str(int(selected_display_number_option)) nb_total_pages = int(math.ceil(float(nb_total_results) / selected_display_number_option_as_int)) if selected_order_by_option in ("grlf", "grof"): res = run_sql(query + query_params, (user_info['uid'], user_info['uid'])) else: res = run_sql("SELECT id_bibrec, id, date_creation, body, status, in_reply_to_id_cmtRECORDCOMMENT FROM cmtRECORDCOMMENT WHERE id_user=%s AND star_score = 0" + query_params, (user_info['uid'], )) return webcomment_templates.tmpl_your_comments(user_info, res, page_number=page_number, selected_order_by_option=selected_order_by_option, selected_display_number_option=selected_display_number_option, selected_display_format_option=selected_display_format_option, nb_total_results=nb_total_results, nb_total_pages=nb_total_pages, ln=ln) diff --git a/modules/webcomment/lib/webcommentadminlib.py b/modules/webcomment/lib/webcommentadminlib.py index fac14b4b6..b586d7ccd 100644 --- a/modules/webcomment/lib/webcommentadminlib.py +++ b/modules/webcomment/lib/webcommentadminlib.py @@ -1,690 +1,690 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. 
-# Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011 CERN. +# Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2016 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. __revision__ = "$Id$" from invenio.config import CFG_SITE_LANG, CFG_SITE_URL, \ CFG_WEBCOMMENT_NB_REPORTS_BEFORE_SEND_EMAIL_TO_ADMIN from invenio.webcomment_config import InvenioWebCommentWarning from invenio.webcomment import query_get_comment, \ get_reply_order_cache_data from invenio.urlutils import wash_url_argument from invenio.dbquery import run_sql from invenio.messages import gettext_set_language, wash_language from invenio.errorlib import register_exception from invenio.webuser import get_user_info, collect_user_info, \ isUserAdmin from invenio.access_control_engine import acc_authorize_action, \ acc_get_authorized_emails from invenio.search_engine import perform_request_search import invenio.template webcomment_templates = invenio.template.load('webcomment') def getnavtrail(previous = '', ln=CFG_SITE_LANG): """Get the navtrail""" previous = wash_url_argument(previous, 'str') ln = wash_language(ln) _ = gettext_set_language(ln) navtrail = """%s """ % (CFG_SITE_URL, _("Admin Area")) navtrail = navtrail + previous return navtrail def get_nb_reviews(recID, count_deleted=True): """ Return number of reviews for the record recID if count_deleted is True, deleted reviews are also counted """ query = """SELECT count(*) FROM cmtRECORDCOMMENT c WHERE c.id_bibrec = %s and c.star_score > 0 """ if not count_deleted: query += "and c.status != 'dm' and c.status != 'da'" res = run_sql(query, (recID,)) return res[0][0] def get_nb_comments(recID, count_deleted=True): """ Return number of comments for the record recID if count_deleted is True, deleted comments are also counted """ query = """SELECT count(*) FROM cmtRECORDCOMMENT c WHERE c.id_bibrec = %s and c.star_score = 0 """ if not count_deleted: query += "and c.status != 'dm' and c.status != 'da'" res = run_sql(query, (recID,)) return res[0][0] def get_user_collections(req): """ Return collections for which the user is moderator """ user_info = collect_user_info(req) res = [] collections = run_sql('SELECT name FROM collection') for collection in collections: collection_emails = acc_get_authorized_emails('moderatecomments', collection=collection[0]) if user_info['email'] in collection_emails or isUserAdmin(user_info): res.append(collection[0]) return res def perform_request_index(ln=CFG_SITE_LANG): """ """ return webcomment_templates.tmpl_admin_index(ln=ln) def perform_request_delete(comID=-1, recID=-1, uid=-1, reviews="", ln=CFG_SITE_LANG): """ """ _ = gettext_set_language(ln) from invenio.search_engine import record_exists warnings = [] ln = wash_language(ln) comID = wash_url_argument(comID, 'int') recID = wash_url_argument(recID, 'int') uid = wash_url_argument(uid, 'int') # parameter reviews is deduced from comID when needed if comID is not 
None and recID is not None and uid is not None: if comID <= 0 and recID <= 0 and uid <= 0: if comID != -1: try: raise InvenioWebCommentWarning(_('Invalid comment ID.')) except InvenioWebCommentWarning, exc: register_exception(stream='warning') warnings.append((exc.message, '')) #warnings.append(("WRN_WEBCOMMENT_ADMIN_INVALID_COMID",)) return webcomment_templates.tmpl_admin_delete_form(ln, warnings) if comID > 0 and not recID > 0: comment = query_get_comment(comID) if comment: # Figure out if this is a review or a comment c_star_score = 5 if comment[c_star_score] > 0: reviews = 1 else: reviews = 0 return (perform_request_comments(ln=ln, comID=comID, recID=recID, reviews=reviews), None, warnings) else: try: raise InvenioWebCommentWarning(_('Comment ID %s does not exist.') % comID) except InvenioWebCommentWarning, exc: register_exception(stream='warning') warnings.append((exc.message, '')) #warnings.append(('WRN_WEBCOMMENT_ADMIN_COMID_INEXISTANT', comID)) return webcomment_templates.tmpl_admin_delete_form(ln, warnings) elif recID > 0: if record_exists(recID): comID = '' reviews = wash_url_argument(reviews, 'int') return (perform_request_comments(ln=ln, comID=comID, recID=recID, reviews=reviews), None, warnings) else: try: raise InvenioWebCommentWarning(_('Record ID %s does not exist.') % comID) except InvenioWebCommentWarning, exc: register_exception(stream='warning') warnings.append((exc.message, '')) #warnings.append(('WRN_WEBCOMMENT_ADMIN_RECID_INEXISTANT', comID)) return webcomment_templates.tmpl_admin_delete_form(ln, warnings) else: return webcomment_templates.tmpl_admin_delete_form(ln, warnings) else: return webcomment_templates.tmpl_admin_delete_form(ln, warnings) def perform_request_users(ln=CFG_SITE_LANG): """ """ ln = wash_language(ln) users_data = query_get_users_reported() return webcomment_templates.tmpl_admin_users(ln=ln, users_data=users_data) def query_get_users_reported(): """ Get the users who have been reported at least once. @return: tuple of ct, i.e. (ct, ct, ...) where ct is a tuple (total_number_reported, total_comments_reported, total_reviews_reported, total_nb_votes_yes_of_reported, total_nb_votes_total_of_reported, user_id, user_email, user_nickname) sorted by order of ct having highest total_number_reported """ query1 = "SELECT c.nb_abuse_reports, c.nb_votes_yes, c.nb_votes_total, u.id, u.email, u.nickname, c.star_score " \ "FROM user AS u, cmtRECORDCOMMENT AS c " \ "WHERE c.id_user=u.id AND c.nb_abuse_reports > 0 " \ "ORDER BY u.id " res1 = run_sql(query1) if type(res1) is None: return () users = {} for cmt in res1: uid = int(cmt[3]) if users.has_key(uid): users[uid] = (users[uid][0]+int(cmt[0]), int(cmt[6])>0 and users[uid][1] or users[uid][1]+1, int(cmt[6])>0 and users[uid][2]+1 or users[uid][2], users[uid][3]+int(cmt[1]), users[uid][4]+int(cmt[2]), int(cmt[3]), cmt[4], cmt[5]) else: users[uid] = (int(cmt[0]), int(cmt[6])==0 and 1 or 0, int(cmt[6])>0 and 1 or 0, int(cmt[1]), int(cmt[2]), int(cmt[3]), cmt[4], cmt[5]) users = users.values() users.sort() users.reverse() users = tuple(users) return users def perform_request_comments(req=None, ln=CFG_SITE_LANG, uid="", comID="", recID="", reviews=0, abuse=False, collection=""): """ Display the list of comments/reviews along with information about the comment. Display the comment given by its ID, or the list of comments for the given record ID. If abuse == True, only list records reported as abuse. 
If comID and recID are not provided, list all comments, or all abused comments (check parameter 'abuse') """ ln = wash_language(ln) uid = wash_url_argument(uid, 'int') comID = wash_url_argument(comID, 'int') recID = wash_url_argument(recID, 'int') reviews = wash_url_argument(reviews, 'int') collection = wash_url_argument(collection, 'str') user_info = collect_user_info(req) user_collections = ['Show all'] user_collections.extend(get_user_collections(req)) if collection and collection != 'Show all': (auth_code, auth_msg) = acc_authorize_action(req, 'moderatecomments', collection=collection) if auth_code: return webcomment_templates.tmpl_admin_comments(ln=ln, uid=uid, comID=comID, recID=recID, comment_data=None, reviews=reviews, error=1, user_collections=user_collections, collection=collection) if collection: if recID or uid: comments = query_get_comments(uid, comID, recID, reviews, ln, abuse=abuse, user_collections=user_collections, collection=collection) else: comments = query_get_comments('', comID, '', reviews, ln, abuse=abuse, user_collections=user_collections, collection=collection) else: if recID or uid: comments = query_get_comments(uid, comID, recID, reviews, ln, abuse=abuse, user_collections=user_collections, collection=user_collections[0]) else: comments = query_get_comments('', comID, '', reviews, ln, abuse=abuse, user_collections=user_collections, collection=user_collections[0]) if comments: return webcomment_templates.tmpl_admin_comments(ln=ln, uid=uid, comID=comID, recID=recID, comment_data=comments, reviews=reviews, error=0, user_collections=user_collections, collection=collection) else: return webcomment_templates.tmpl_admin_comments(ln=ln, uid=uid, comID=comID, recID=recID, comment_data=comments, reviews=reviews, error=2, user_collections=user_collections, collection=collection) def perform_request_hot(req=None, ln=CFG_SITE_LANG, comments=1, top=10, collection="Show all"): """ Display the list of hottest comments/reviews along with information about the comment. 
@param req: request object for obtaining user information @param ln: language @param comments: boolean activated if using comments, deactivated for reviews @param top: specify number of results to be shown @param collection: filter by collection """ ln = wash_language(ln) comments = wash_url_argument(comments, 'int') top = wash_url_argument(top, 'int') collection = wash_url_argument(collection, 'str') user_info = collect_user_info(req) user_collections = ['Show all'] user_collections.extend(get_user_collections(req)) if collection and collection != 'Show all': (auth_code, auth_msg) = acc_authorize_action(req, 'moderatecomments', collection=collection) if auth_code: return webcomment_templates.tmpl_admin_hot(ln=ln, comment_data = None, comments=comments, error=1, user_collections=user_collections, collection=collection) if collection: comments_retrieved = query_get_hot(comments, ln, top, user_collections, collection) else: comments_retrieved = query_get_hot(comments, ln, top, user_collections, user_collections[0]) if comments_retrieved: return webcomment_templates.tmpl_admin_hot(ln=ln, comment_data=comments_retrieved, comments=comments, error=0, user_collections=user_collections, collection=collection) else: return webcomment_templates.tmpl_admin_hot(ln=ln, comment_data=comments_retrieved, comments=comments, error=2, user_collections=user_collections, collection=collection) def perform_request_latest(req=None, ln=CFG_SITE_LANG, comments=1, top=10, collection=""): """ Display the list of latest comments/reviews along with information about the comment. @param req: request object for obtaining user information @param ln: language @param comments: boolean activated if using comments, deactivated for reviews @param top: Specify number of results to be shown @param collection: filter by collection """ ln = wash_language(ln) comments = wash_url_argument(comments, 'int') top = wash_url_argument(top, 'int') collection = wash_url_argument(collection, 'str') user_info = collect_user_info(req) user_collections = ['Show all'] user_collections.extend(get_user_collections(req)) if collection and collection != 'Show all': (auth_code, auth_msg) = acc_authorize_action(req, 'moderatecomments', collection=collection) if auth_code: return webcomment_templates.tmpl_admin_latest(ln=ln, comment_data=None, comments=comments, error=1, user_collections=user_collections, collection=collection) if collection: comments_retrieved = query_get_latest(comments, ln, top, user_collections, collection) else: comments_retrieved = query_get_latest(comments, ln, top, user_collections, user_collections[0]) if comments_retrieved: return webcomment_templates.tmpl_admin_latest(ln=ln, comment_data=comments_retrieved, comments=comments, error=0, user_collections=user_collections, collection=collection) else: return webcomment_templates.tmpl_admin_latest(ln=ln, comment_data=comments_retrieved, comments=comments, error=2, user_collections=user_collections, collection=collection) def perform_request_undel_single_com(ln=CFG_SITE_LANG, id=id): """ Mark comment referenced by id as active """ ln = wash_language(ln) id = wash_url_argument(id, 'int') return query_undel_single_comment(id) def query_get_comments(uid, cmtID, recID, reviews, ln, abuse=False, user_collections='', collection=''): """ private function @param user_collections: allowed collections for the user @param collection: collection to display @return tuple of comment where comment is tuple (nickname, uid, date_creation, body, id, status) if ranking disabled or tuple (nickname, 
uid, date_creation, body, nb_votes_yes, nb_votes_total, star_score, title, id, status) """ qdict = {'id': 0, 'id_bibrec': 1, 'uid': 2, 'date_creation': 3, 'body': 4, 'status': 5, 'nb_abuse_reports': 6, 'nb_votes_yes': 7, 'nb_votes_total': 8, 'star_score': 9, 'title': 10, 'email': -2, 'nickname': -1} query = """SELECT c.id, c.id_bibrec, c.id_user, DATE_FORMAT(c.date_creation, '%%Y-%%m-%%d %%H:%%i:%%S'), c.body, c.status, c.nb_abuse_reports, %s u.email, u.nickname FROM cmtRECORDCOMMENT c LEFT JOIN user u ON c.id_user = u.id %s ORDER BY c.nb_abuse_reports DESC, c.nb_votes_yes DESC, c.date_creation """ select_fields = reviews and 'c.nb_votes_yes, c.nb_votes_total, c.star_score, c.title,' or '' where_clause = "WHERE " + (reviews and 'c.star_score>0' or 'c.star_score=0') if uid: where_clause += ' AND c.id_user=%i' % uid if recID: where_clause += ' AND c.id_bibrec=%i' % recID if cmtID: where_clause += ' AND c.id=%i' % cmtID if abuse: where_clause += ' AND c.nb_abuse_reports>0' res = run_sql(query % (select_fields, where_clause)) collection_records = [] if collection == 'Show all': for collection_name in user_collections: collection_records.extend(perform_request_search(cc=collection_name)) else: collection_records.extend(perform_request_search(cc=collection)) output = [] for qtuple in res: if qtuple[qdict['id_bibrec']] in collection_records: nickname = qtuple[qdict['nickname']] or get_user_info(qtuple[qdict['uid']], ln)[2] if reviews: comment_tuple = (nickname, qtuple[qdict['uid']], qtuple[qdict['date_creation']], qtuple[qdict['body']], qtuple[qdict['nb_votes_yes']], qtuple[qdict['nb_votes_total']], qtuple[qdict['star_score']], qtuple[qdict['title']], qtuple[qdict['id']], qtuple[qdict['status']]) else: comment_tuple = (nickname, qtuple[qdict['uid']], qtuple[qdict['date_creation']], qtuple[qdict['body']], qtuple[qdict['id']], qtuple[qdict['status']]) general_infos_tuple = (nickname, qtuple[qdict['uid']], qtuple[qdict['email']], qtuple[qdict['id']], qtuple[qdict['id_bibrec']], qtuple[qdict['nb_abuse_reports']]) out_tuple = (comment_tuple, general_infos_tuple) output.append(out_tuple) return tuple(output) def query_get_hot(comments, ln, top, user_collections, collection): """ private function @param comments: boolean indicating if we want to retrieve comments or reviews @param ln: language @param top: number of results to display @param user_collections: allowed collections for the user @param collection: collection to display @return: tuple (id_bibrec, date_last_comment, users, count) """ qdict = {'id_bibrec': 0, 'date_last_comment': 1, 'users': 2, 'total_count': 3} query = """SELECT c.id_bibrec, DATE_FORMAT(max(c.date_creation), '%%Y-%%m-%%d %%H:%%i:%%S') as date_last_comment, count(distinct c.id_user) as users, count(*) as count FROM cmtRECORDCOMMENT c %s GROUP BY c.id_bibrec ORDER BY count(*) DESC LIMIT %s """ where_clause = "WHERE " + (comments and 'c.star_score=0' or 'c.star_score>0') + ' AND c.status="ok" AND c.nb_abuse_reports < %s' % CFG_WEBCOMMENT_NB_REPORTS_BEFORE_SEND_EMAIL_TO_ADMIN res = run_sql(query % (where_clause, top)) collection_records = [] if collection == 'Show all': for collection_name in user_collections: collection_records.extend(perform_request_search(cc=collection_name)) else: collection_records.extend(perform_request_search(cc=collection)) output = [] for qtuple in res: if qtuple[qdict['id_bibrec']] in collection_records: general_infos_tuple = (qtuple[qdict['id_bibrec']], qtuple[qdict['date_last_comment']], qtuple[qdict['users']], qtuple[qdict['total_count']]) 
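# Rows reach this point only when their id_bibrec appears in collection_records,
# i.e. in the set of records returned by perform_request_search() for the
# moderator's allowed collections, so the "hot" list silently covers only the
# records the current user may moderate.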
output.append(general_infos_tuple) return tuple(output) def query_get_latest(comments, ln, top, user_collections, collection): """ private function @param comments: boolean indicating if we want to retrieve comments or reviews @param ln: language @param top: number of results to display @param user_collections: allowed collections for the user @param collection: collection to display @return tuple of comment where comment is tuple (nickname, uid, date_creation, body, id) if latest comments or tuple (nickname, uid, date_creation, body, star_score, id) if latest reviews """ qdict = {'id': 0, 'id_bibrec': 1, 'uid': 2, 'date_creation': 3, 'body': 4, 'nb_abuse_reports': 5, 'star_score': 6, 'nickname': -1} query = """SELECT c.id, c.id_bibrec, c.id_user, DATE_FORMAT(c.date_creation, '%%Y-%%m-%%d %%H:%%i:%%S'), c.body, c.nb_abuse_reports, %s u.nickname FROM cmtRECORDCOMMENT c LEFT JOIN user u ON c.id_user = u.id %s ORDER BY c.date_creation DESC LIMIT %s """ select_fields = not comments and 'c.star_score, ' or '' where_clause = "WHERE " + (comments and 'c.star_score=0' or 'c.star_score>0') + ' AND c.status="ok" AND c.nb_abuse_reports < %s' % CFG_WEBCOMMENT_NB_REPORTS_BEFORE_SEND_EMAIL_TO_ADMIN res = run_sql(query % (select_fields, where_clause, top)) collection_records = [] if collection == 'Show all': for collection_name in user_collections: collection_records.extend(perform_request_search(cc=collection_name)) else: collection_records.extend(perform_request_search(cc=collection)) output = [] for qtuple in res: if qtuple[qdict['id_bibrec']] in collection_records: nickname = qtuple[qdict['nickname']] or get_user_info(qtuple[qdict['uid']], ln)[2] if not comments: comment_tuple = (nickname, qtuple[qdict['uid']], qtuple[qdict['date_creation']], qtuple[qdict['body']], qtuple[qdict['star_score']], qtuple[qdict['id']]) else: comment_tuple = (nickname, qtuple[qdict['uid']], qtuple[qdict['date_creation']], qtuple[qdict['body']], qtuple[qdict['id']]) general_infos_tuple = (nickname, qtuple[qdict['uid']], qtuple[qdict['id']], qtuple[qdict['id_bibrec']], qtuple[qdict['nb_abuse_reports']]) out_tuple = (comment_tuple, general_infos_tuple) output.append(out_tuple) return tuple(output) def perform_request_del_com(ln=CFG_SITE_LANG, comIDs=[]): """ private function Delete the comments and say whether successful or not @param ln: language @param comIDs: list of comment ids """ ln = wash_language(ln) comIDs = wash_url_argument(comIDs, 'list') # map ( fct, list, arguments of function) comIDs = map(wash_url_argument, comIDs, ('int '*len(comIDs)).split(' ')[:-1]) if not comIDs: comIDs = map(coerce, comIDs, ('0 '*len(comIDs)).split(' ')[:-1]) return webcomment_templates.tmpl_admin_del_com(del_res=comIDs, ln=ln) del_res = [] for comID in comIDs: del_res.append((comID, query_delete_comment_mod(comID))) return webcomment_templates.tmpl_admin_del_com(del_res=del_res, ln=ln) def perform_request_undel_com(ln=CFG_SITE_LANG, comIDs=[]): """ private function Undelete the comments and say whether successful or not @param ln: language @param comIDs: list of comment ids """ ln = wash_language(ln) comIDs = wash_url_argument(comIDs, 'list') # map ( fct, list, arguments of function) comIDs = map(wash_url_argument, comIDs, ('int '*len(comIDs)).split(' ')[:-1]) if not comIDs: comIDs = map(coerce, comIDs, ('0 '*len(comIDs)).split(' ')[:-1]) return webcomment_templates.tmpl_admin_undel_com(del_res=comIDs, ln=ln) del_res = [] for comID in comIDs: del_res.append((comID, query_undel_single_comment(comID))) return 
webcomment_templates.tmpl_admin_undel_com(del_res=del_res, ln=ln) def perform_request_del_single_com_mod(ln=CFG_SITE_LANG, id=id): """ private function Delete a single comment requested by a moderator @param ln: language @param id: comment id to be deleted """ ln = wash_language(ln) id = wash_url_argument(id, 'int') return query_delete_comment_mod(id) def perform_request_del_single_com_auth(ln=CFG_SITE_LANG, id=id): """ private function Delete a single comment requested by the author @param ln: language @param id: comment id to be deleted """ ln = wash_language(ln) id = wash_url_argument(id, 'int') return query_delete_comment_auth(id) def perform_request_unreport_single_com(ln=CFG_SITE_LANG, id=""): """ private function Unreport a single comment @param ln: language @param id: comment id to be deleted """ ln = wash_language(ln) id = wash_url_argument(id, 'int') return query_suppress_abuse_report(id) def suppress_abuse_report(ln=CFG_SITE_LANG, comIDs=[]): """ private function suppress the abuse reports for the given comIDs. @param ln: language @param comIDs: list of ids to suppress attached reports. """ ln = wash_language(ln) comIDs = wash_url_argument(comIDs, 'list') # map ( fct, list, arguments of function) comIDs = map(wash_url_argument, comIDs, ('int '*len(comIDs)).split(' ')[:-1]) if not comIDs: comIDs = map(coerce, comIDs, ('0 '*len(comIDs)).split(' ')[:-1]) return webcomment_templates.tmpl_admin_del_com(del_res=comIDs, ln=ln) del_res = [] for comID in comIDs: del_res.append((comID, query_suppress_abuse_report(comID))) return webcomment_templates.tmpl_admin_suppress_abuse_report(del_res=del_res, ln=ln) def query_suppress_abuse_report(comID): """ suppress abuse report for a given comment @return: integer 1 if successful, integer 0 if not """ query = "UPDATE cmtRECORDCOMMENT SET nb_abuse_reports=0, status='ap' WHERE id=%s" params = (comID,) res = run_sql(query, params) return int(res) def query_delete_comment_mod(comID): """ delete comment with id comID @return: integer 1 if successful, integer 0 if not """ query1 = "UPDATE cmtRECORDCOMMENT SET status='dm' WHERE id=%s" params1 = (comID,) res1 = run_sql(query1, params1) return int(res1) def query_delete_comment_auth(comID): """ delete comment with id comID @return: integer 1 if successful, integer 0 if not """ query1 = "UPDATE cmtRECORDCOMMENT SET status='da' WHERE id=%s" params1 = (comID,) res1 = run_sql(query1, params1) return int(res1) def query_undel_single_comment(comID): """ undelete comment with id comID @return: integer 1 if successful, integer 0 if not """ query = "UPDATE cmtRECORDCOMMENT SET status='ok' WHERE id=%s" params = (comID,) res = run_sql(query, params) return int(res) def check_user_is_author(user_id, com_id): """ Check if the user is the author of the given comment """ res = run_sql("SELECT id, id_user FROM cmtRECORDCOMMENT WHERE id=%s and id_user=%s", (str(com_id), str(user_id))) if res: return 1 return 0 def migrate_comments_populate_threads_index(): """ Fill in the `reply_order_cached_data' columns in cmtRECORDCOMMENT and bskRECORDCOMMENT tables with adequate values so that thread are displayed correctly. 
""" # Update WebComment comments res = run_sql("SELECT id FROM cmtRECORDCOMMENT WHERE reply_order_cached_data is NULL") for row in res: reply_order_cached_data = get_reply_order_cache_data(row[0]) - run_sql("UPDATE cmtRECORDCOMMENT set reply_order_cached_data=%s WHERE id=%s", + run_sql("UPDATE cmtRECORDCOMMENT set reply_order_cached_data=_binary %s WHERE id=%s", (reply_order_cached_data, row[0])) # Update WebBasket comments res = run_sql("SELECT id FROM bskRECORDCOMMENT WHERE reply_order_cached_data is NULL") for row in res: reply_order_cached_data = get_reply_order_cache_data(row[0]) - run_sql("UPDATE cmtRECORDCOMMENT set reply_order_cached_data=%s WHERE id=%s", + run_sql("UPDATE cmtRECORDCOMMENT set reply_order_cached_data=_binary %s WHERE id=%s", (reply_order_cached_data, row[0])) diff --git a/modules/websearch/lib/websearch_webcoll.py b/modules/websearch/lib/websearch_webcoll.py index 16662eb90..259a3d7b9 100644 --- a/modules/websearch/lib/websearch_webcoll.py +++ b/modules/websearch/lib/websearch_webcoll.py @@ -1,1224 +1,1224 @@ # This file is part of Invenio. -# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 CERN. +# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
"""Create Invenio collection cache.""" __revision__ = "$Id$" import calendar import copy import sys import cgi import re import os import string import time import cPickle from invenio.config import \ CFG_CERN_SITE, \ CFG_WEBSEARCH_INSTANT_BROWSE, \ CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS, \ CFG_WEBSEARCH_I18N_LATEST_ADDITIONS, \ CFG_CACHEDIR, \ CFG_SITE_LANG, \ CFG_SITE_NAME, \ CFG_SITE_LANGS, \ CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, \ CFG_WEBSEARCH_DEF_RECORDS_IN_GROUPS, \ CFG_SCOAP3_SITE from invenio.messages import gettext_set_language from invenio.search_engine import search_pattern_parenthesised, get_creation_date, get_field_i18nname, collection_restricted_p, sort_records, EM_REPOSITORY from invenio.search_engine_config import CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES from invenio.dbquery import run_sql, Error, get_table_update_time from invenio.bibrank_record_sorter import get_bibrank_methods from invenio.dateutils import convert_datestruct_to_dategui, strftime from invenio.bibformat import format_record from invenio.shellutils import mymkdir from invenio.intbitset import intbitset from invenio.websearch_external_collections import \ external_collection_load_states, \ dico_collection_external_searches, \ external_collection_sort_engine_by_name from invenio.bibtask import task_init, task_get_option, task_set_option, \ write_message, task_has_option, task_update_progress, \ task_sleep_now_if_required import invenio.template websearch_templates = invenio.template.load('websearch') from invenio.websearch_external_collections_searcher import external_collections_dictionary from invenio.websearch_external_collections_config import CFG_EXTERNAL_COLLECTION_TIMEOUT from invenio.websearch_external_collections_config import CFG_HOSTED_COLLECTION_TIMEOUT_NBRECS # global vars COLLECTION_HOUSE = {} # will hold collections we treat in this run of the program; a dict of {collname2, collobject1}, ... # CFG_CACHE_LAST_UPDATED_TIMESTAMP_TOLERANCE -- cache timestamp # tolerance (in seconds), to account for the fact that an admin might # accidentally happen to edit the collection definitions at exactly # the same second when some webcoll process was about to be started. # In order to be safe, let's put an exaggerated timestamp tolerance # value such as 20 seconds: CFG_CACHE_LAST_UPDATED_TIMESTAMP_TOLERANCE = 20 # CFG_CACHE_LAST_UPDATED_TIMESTAMP_FILE -- location of the cache # timestamp file: CFG_CACHE_LAST_UPDATED_TIMESTAMP_FILE = "%s/collections/last_updated" % CFG_CACHEDIR # CFG_CACHE_LAST_FAST_UPDATED_TIMESTAMP_FILE -- location of the cache # timestamp file usef when running webcoll in the fast-mode. CFG_CACHE_LAST_FAST_UPDATED_TIMESTAMP_FILE = "%s/collections/last_fast_updated" % CFG_CACHEDIR def get_collection(colname): """Return collection object from the collection house for given colname. If does not exist, then create it.""" if not COLLECTION_HOUSE.has_key(colname): colobject = Collection(colname) COLLECTION_HOUSE[colname] = colobject return COLLECTION_HOUSE[colname] # auxiliary functions: def is_selected(var, fld): "Checks if the two are equal, and if yes, returns ' selected'. Useful for select boxes." if var == fld: return ' selected="selected"' else: return "" def get_field(recID, tag): "Gets list of field 'tag' for the record with 'recID' system number." 
out = [] digit = tag[0:2] bx = "bib%sx" % digit bibx = "bibrec_bib%sx" % digit query = "SELECT bx.value FROM %s AS bx, %s AS bibx WHERE bibx.id_bibrec='%s' AND bx.id=bibx.id_bibxxx AND bx.tag='%s'" \ % (bx, bibx, recID, tag) res = run_sql(query) for row in res: out.append(row[0]) return out def check_nbrecs_for_all_external_collections(): """Check if any of the external collections have changed their total number of records, aka nbrecs. Return True if any of the total numbers of records have changed and False if they're all the same.""" res = run_sql("SELECT name FROM collection WHERE dbquery LIKE 'hostedcollection:%';") for row in res: coll_name = row[0] if (get_collection(coll_name)).check_nbrecs_for_external_collection(): return True return False class Collection: "Holds the information on collections (id,name,dbquery)." def __init__(self, name=""): "Creates collection instance by querying the DB configuration database about 'name'." self.calculate_reclist_run_already = 0 # to speed things up without much refactoring self.update_reclist_run_already = 0 # to speed things up without much refactoring self.reclist_updated_since_start = 0 # to check if webpage cache need rebuilding self.reclist_with_nonpublic_subcolls = intbitset() # temporary counters for the number of records in hosted collections self.nbrecs_tmp = None # number of records in a hosted collection self.nbrecs_from_hosted_collections = 0 # total number of records from # descendant hosted collections if not name: self.name = CFG_SITE_NAME # by default we are working on the home page self.id = 1 self.dbquery = None self.nbrecs = None self.reclist = intbitset() self.old_reclist = intbitset() self.reclist_updated_since_start = 1 else: self.name = name try: res = run_sql("""SELECT id,name,dbquery,nbrecs,reclist FROM collection WHERE name=%s""", (name,)) if res: self.id = res[0][0] self.name = res[0][1] self.dbquery = res[0][2] self.nbrecs = res[0][3] try: self.reclist = intbitset(res[0][4]) except: self.reclist = intbitset() self.reclist_updated_since_start = 1 else: # collection does not exist! self.id = None self.dbquery = None self.nbrecs = None self.reclist = intbitset() self.reclist_updated_since_start = 1 self.old_reclist = intbitset(self.reclist) except Error, e: print "Error %d: %s" % (e.args[0], e.args[1]) sys.exit(1) def get_example_search_queries(self): """Returns list of sample search queries for this collection. """ res = run_sql("""SELECT example.body FROM example LEFT JOIN collection_example on example.id=collection_example.id_example WHERE collection_example.id_collection=%s ORDER BY collection_example.score""", (self.id,)) return [query[0] for query in res] def get_name(self, ln=CFG_SITE_LANG, name_type="ln", prolog="", epilog="", prolog_suffix=" ", epilog_suffix=""): """Return nicely formatted collection name for language LN. The NAME_TYPE may be 'ln' (=long name), 'sn' (=short name), etc.""" out = prolog i18name = "" res = run_sql("SELECT value FROM collectionname WHERE id_collection=%s AND ln=%s AND type=%s", (self.id, ln, name_type)) try: i18name += res[0][0] except IndexError: pass if i18name: out += i18name else: out += self.name out += epilog return out def get_collectionbox_name(self, ln=CFG_SITE_LANG, box_type="r"): """ Return collection-specific labelling of 'Focus on' (regular collection), 'Narrow by' (virtual collection) and 'Latest addition' boxes. If translation for given language does not exist, use label for CFG_SITE_LANG. 
If no custom label is defined for CFG_SITE_LANG, return default label for the box. @param ln: the language of the label @param box_type: can be 'r' (=Narrow by), 'v' (=Focus on), 'l' (=Latest additions) """ i18name = "" res = run_sql("SELECT value FROM collectionboxname WHERE id_collection=%s AND ln=%s AND type=%s", (self.id, ln, box_type)) try: i18name = res[0][0] except IndexError: res = run_sql("SELECT value FROM collectionboxname WHERE id_collection=%s AND ln=%s AND type=%s", (self.id, CFG_SITE_LANG, box_type)) try: i18name = res[0][0] except IndexError: pass if not i18name: # load the right message language _ = gettext_set_language(ln) if box_type == "v": i18name = _('Focus on:') elif box_type == "r": if CFG_SCOAP3_SITE: i18name = _('Narrow by publisher/journal:') else: i18name = _('Narrow by collection:') elif box_type == "l": i18name = _('Latest additions:') return i18name def get_ancestors(self): "Returns list of ancestors of the current collection." ancestors = [] ancestors_ids = intbitset() id_son = self.id while 1: query = "SELECT cc.id_dad,c.name FROM collection_collection AS cc, collection AS c "\ "WHERE cc.id_son=%d AND c.id=cc.id_dad" % int(id_son) res = run_sql(query, None, 1) if res: col_ancestor = get_collection(res[0][1]) # looking for loops if self.id in ancestors_ids: write_message("Loop found in collection %s" % self.name, stream=sys.stderr) raise OverflowError("Loop found in collection %s" % self.name) else: ancestors.append(col_ancestor) ancestors_ids.add(col_ancestor.id) id_son = res[0][0] else: break ancestors.reverse() return ancestors def restricted_p(self): """Predicate to test if the collection is restricted or not. Return the contect of the `restrited' column of the collection table (typically Apache group). Otherwise return None if the collection is public.""" if collection_restricted_p(self.name): return 1 return None def get_sons(self, type='r'): "Returns list of direct sons of type 'type' for the current collection." sons = [] id_dad = self.id query = "SELECT cc.id_son,c.name FROM collection_collection AS cc, collection AS c "\ "WHERE cc.id_dad=%d AND cc.type='%s' AND c.id=cc.id_son ORDER BY score ASC, c.name ASC" % (int(id_dad), type) res = run_sql(query) for row in res: sons.append(get_collection(row[1])) return sons def get_descendants(self, type='r'): "Returns list of all descendants of type 'type' for the current collection." descendants = [] descendant_ids = intbitset() id_dad = self.id query = "SELECT cc.id_son,c.name FROM collection_collection AS cc, collection AS c "\ "WHERE cc.id_dad=%d AND cc.type='%s' AND c.id=cc.id_son ORDER BY score ASC" % (int(id_dad), type) res = run_sql(query) for row in res: col_desc = get_collection(row[1]) # looking for loops if self.id in descendant_ids: write_message("Loop found in collection %s" % self.name, stream=sys.stderr) raise OverflowError("Loop found in collection %s" % self.name) else: descendants.append(col_desc) descendant_ids.add(col_desc.id) tmp_descendants = col_desc.get_descendants() for descendant in tmp_descendants: descendant_ids.add(descendant.id) descendants += tmp_descendants return descendants def write_cache_file(self, filename='', filebody={}): "Write a file inside collection cache." 
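# What the body below does, in short: serialise `filebody` (a dict of
# pre-rendered page fragments) with cPickle into
# CFG_CACHEDIR/collections/<filename>.html, replacing any '/' in the name with
# '___SLASH___' so collection names containing slashes stay on one path
# segment; the counterpart read is a cPickle.load() on the same path.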
# open file: dirname = "%s/collections" % (CFG_CACHEDIR) mymkdir(dirname) fullfilename = dirname + "/%s.html" % filename.replace('/', '___SLASH___') try: os.umask(022) f = open(fullfilename, "wb") except IOError, v: try: (code, message) = v except: code = 0 message = v print "I/O Error: " + str(message) + " (" + str(code) + ")" sys.exit(1) # print user info: write_message("... creating %s" % fullfilename, verbose=6) # print page body: cPickle.dump(filebody, f, cPickle.HIGHEST_PROTOCOL) # close file: f.close() def update_webpage_cache(self, lang): """Create collection page header, navtrail, body (including left and right stripes) and footer, and call write_cache_file() afterwards to update the collection webpage cache.""" ## precalculate latest additions for non-aggregate ## collections (the info is ln and as independent) if self.dbquery: if CFG_WEBSEARCH_I18N_LATEST_ADDITIONS: self.create_latest_additions_info(ln=lang) else: self.create_latest_additions_info() # load the right message language _ = gettext_set_language(lang) # create dictionary with data cache = {"te_portalbox" : self.create_portalbox(lang, 'te'), "np_portalbox" : self.create_portalbox(lang, 'np'), "ne_portalbox" : self.create_portalbox(lang, 'ne'), "tp_portalbox" : self.create_portalbox(lang, "tp"), "lt_portalbox" : self.create_portalbox(lang, "lt"), "rt_portalbox" : self.create_portalbox(lang, "rt"), "last_updated" : convert_datestruct_to_dategui(time.localtime(), ln=lang)} for aas in CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES: # do light, simple and advanced search pages: cache["navtrail_%s" % aas] = self.create_navtrail_links(aas, lang) cache["searchfor_%s" % aas] = self.create_searchfor(aas, lang) cache["narrowsearch_%s" % aas] = self.create_narrowsearch(aas, lang, 'r') cache["focuson_%s" % aas] = self.create_narrowsearch(aas, lang, "v")+ \ self.create_external_collections_box(lang) cache["instantbrowse_%s" % aas] = self.create_instant_browse(aas=aas, ln=lang) # write cache file self.write_cache_file("%s-ln=%s"%(self.name, lang), cache) return cache def create_navtrail_links(self, aas=CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, ln=CFG_SITE_LANG): """Creates navigation trail links, i.e. links to collection ancestors (except Home collection). If aas==1, then links to Advanced Search interfaces; otherwise Simple Search. """ dads = [] for dad in self.get_ancestors(): if dad.name != CFG_SITE_NAME: # exclude Home collection dads.append((dad.name, dad.get_name(ln))) return websearch_templates.tmpl_navtrail_links( aas=aas, ln=ln, dads=dads) def create_portalbox(self, lang=CFG_SITE_LANG, position="rt"): """Creates portalboxes of language CFG_SITE_LANG of the position POSITION by consulting DB configuration database. The position may be: 'lt'='left top', 'rt'='right top', etc.""" out = "" query = "SELECT p.title,p.body FROM portalbox AS p, collection_portalbox AS cp "\ " WHERE cp.id_collection=%d AND p.id=cp.id_portalbox AND cp.ln='%s' AND cp.position='%s' "\ " ORDER BY cp.score DESC" % (self.id, lang, position) res = run_sql(query) for row in res: title, body = row[0], row[1] if title: out += websearch_templates.tmpl_portalbox(title = title, body = body) else: # no title specified, so print body ``as is'' only: out += body return out def create_narrowsearch(self, aas=CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, ln=CFG_SITE_LANG, type="r"): """Creates list of collection descendants of type 'type' under title 'title'. If aas==1, then links to Advanced Search interfaces; otherwise Simple Search. 
Suitable for 'Narrow search' and 'Focus on' boxes.""" # get list of sons and analyse it sons = self.get_sons(type) if not sons: return '' # get descendents descendants = self.get_descendants(type) grandsons = [] if CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS: # load grandsons for each son for son in sons: grandsons.append(son.get_sons()) # return "" return websearch_templates.tmpl_narrowsearch( aas = aas, ln = ln, type = type, father = self, has_grandchildren = len(descendants)>len(sons), sons = sons, display_grandsons = CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS, grandsons = grandsons ) def create_external_collections_box(self, ln=CFG_SITE_LANG): external_collection_load_states() if not dico_collection_external_searches.has_key(self.id): return "" engines_list = external_collection_sort_engine_by_name(dico_collection_external_searches[self.id]) return websearch_templates.tmpl_searchalso(ln, engines_list, self.id) def create_latest_additions_info(self, rg=CFG_WEBSEARCH_INSTANT_BROWSE, ln=CFG_SITE_LANG): """ Create info about latest additions that will be used for create_instant_browse() later. """ self.latest_additions_info = [] if self.nbrecs and self.reclist: # firstly, get last 'rg' records: recIDs = list(self.reclist) of = 'hb' # CERN hack begins: tweak latest additions for selected collections: if CFG_CERN_SITE: # alter recIDs list for some CERN collections: this_year = time.strftime("%Y", time.localtime()) if self.name in ['CERN Yellow Reports','Videos']: last_year = str(int(this_year) - 1) # detect recIDs only from this and past year: recIDs = list(self.reclist & \ search_pattern_parenthesised(p='year:%s or year:%s' % \ (this_year, last_year))) # apply special filters: if self.name in ['Videos']: # select only videos with movies: recIDs = list(intbitset(recIDs) & \ search_pattern_parenthesised(p='collection:"PUBLVIDEOMOVIE" -"Virtual Visit"')) of = 'hvp' if self.name in ['General Talks', 'Academic Training Lectures', 'Summer Student Lectures']: #select only the lectures with material recIDs = list(self.reclist & search_pattern_parenthesised(p='856:MediaArchive')) # sort some CERN collections specially: if self.name in ['Videos', 'Video Clips', 'Video Movies', 'Video News', 'Video Rushes', 'Webcast', 'ATLAS Videos', 'Restricted Video Movies', 'Restricted Video Rushes', 'LHC First Beam Videos', 'CERN openlab Videos']: recIDs = sort_records(None, recIDs, '269__c', 'a') elif self.name in ['LHCb Talks']: recIDs = sort_records(None, recIDs, 'reportnumber', 'a') elif self.name in ['CERN Yellow Reports']: recIDs = sort_records(None, recIDs, '084__a', 'a') elif self.name in ['CERN Courier Issues', 'CERN Courier Articles', 'CERN Bulletin Issues', 'CERN Bulletin Articles']: recIDs = sort_records(None, recIDs, '773__y', 'a') # CERN hack ends. total = len(recIDs) to_display = min(rg, total) for idx in range(total-1, total-to_display-1, -1): recid = recIDs[idx] self.latest_additions_info.append({'id': recid, 'format': format_record(recid, of, ln=ln), 'date': get_creation_date(recid, fmt="%Y-%m-%d
%H:%i")}) return def create_instant_browse(self, rg=CFG_WEBSEARCH_INSTANT_BROWSE, aas=CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, ln=CFG_SITE_LANG): "Searches database and produces list of last 'rg' records." if self.restricted_p(): return websearch_templates.tmpl_box_restricted_content(ln = ln) if str(self.dbquery).startswith("hostedcollection:"): return websearch_templates.tmpl_box_hosted_collection(ln = ln) if rg == 0: # do not show latest additions box return "" # CERN hack: do not display latest additions for some CERN collections: if CFG_CERN_SITE and self.name in ['Periodicals', 'Electronic Journals', 'Press Office Photo Selection', 'Press Office Video Selection']: return "" try: self.latest_additions_info latest_additions_info_p = True except: latest_additions_info_p = False if latest_additions_info_p: passIDs = [] for idx in range(0, min(len(self.latest_additions_info), rg)): # CERN hack: display the records in a grid layout, so do not show the related links if CFG_CERN_SITE and self.name in ['Videos']: passIDs.append({'id': self.latest_additions_info[idx]['id'], 'body': self.latest_additions_info[idx]['format'], 'date': self.latest_additions_info[idx]['date']}) else: passIDs.append({'id': self.latest_additions_info[idx]['id'], 'body': self.latest_additions_info[idx]['format'] + \ websearch_templates.tmpl_record_links(recid=self.latest_additions_info[idx]['id'], rm='citation', ln=ln), 'date': self.latest_additions_info[idx]['date']}) if self.nbrecs > rg: url = websearch_templates.build_search_url( cc=self.name, jrec=rg+1, ln=ln, aas=aas) else: url = "" # CERN hack: display the records in a grid layout if CFG_CERN_SITE and self.name in ['Videos']: return websearch_templates.tmpl_instant_browse( aas=aas, ln=ln, recids=passIDs, more_link=url, grid_layout=True, father=self) return websearch_templates.tmpl_instant_browse( aas=aas, ln=ln, recids=passIDs, more_link=url, father=self) return websearch_templates.tmpl_box_no_records(ln=ln) def create_searchoptions(self): "Produces 'Search options' portal box." 
box = "" query = """SELECT DISTINCT(cff.id_field),f.code,f.name FROM collection_field_fieldvalue AS cff, field AS f WHERE cff.id_collection=%d AND cff.id_fieldvalue IS NOT NULL AND cff.id_field=f.id ORDER BY cff.score DESC""" % self.id res = run_sql(query) if res: for row in res: field_id = row[0] field_code = row[1] field_name = row[2] query_bis = """SELECT fv.value,fv.name FROM fieldvalue AS fv, collection_field_fieldvalue AS cff WHERE cff.id_collection=%d AND cff.type='seo' AND cff.id_field=%d AND fv.id=cff.id_fieldvalue ORDER BY cff.score_fieldvalue DESC, cff.score DESC, fv.name ASC""" % (self.id, field_id) res_bis = run_sql(query_bis) if res_bis: values = [{'value' : '', 'text' : 'any' + ' ' + field_name}] # FIXME: internationalisation of "any" for row_bis in res_bis: values.append({'value' : cgi.escape(row_bis[0], 1), 'text' : row_bis[1]}) box += websearch_templates.tmpl_select( fieldname = field_code, values = values ) return box def create_sortoptions(self, ln=CFG_SITE_LANG): """Produces 'Sort options' portal box.""" # load the right message language _ = gettext_set_language(ln) box = "" query = """SELECT f.code,f.name FROM field AS f, collection_field_fieldvalue AS cff WHERE id_collection=%d AND cff.type='soo' AND cff.id_field=f.id ORDER BY cff.score DESC, f.name ASC""" % self.id values = [{'value' : '', 'text': "- %s -" % _("latest first")}] res = run_sql(query) if res: for row in res: values.append({'value' : row[0], 'text': get_field_i18nname(row[1], ln)}) else: for tmp in ('title', 'author', 'report number', 'year'): values.append({'value' : tmp.replace(' ', ''), 'text' : get_field_i18nname(tmp, ln)}) box = websearch_templates.tmpl_select( fieldname = 'sf', css_class = 'address', values = values ) box += websearch_templates.tmpl_select( fieldname = 'so', css_class = 'address', values = [ {'value' : 'a' , 'text' : _("asc.")}, {'value' : 'd' , 'text' : _("desc.")} ] ) return box def create_rankoptions(self, ln=CFG_SITE_LANG): "Produces 'Rank options' portal box." # load the right message language _ = gettext_set_language(ln) values = [{'value' : '', 'text': "- %s %s -" % (string.lower(_("OR")), _("rank by"))}] for (code, name) in get_bibrank_methods(self.id, ln): values.append({'value' : code, 'text': name}) box = websearch_templates.tmpl_select( fieldname = 'rm', css_class = 'address', values = values ) return box def create_displayoptions(self, ln=CFG_SITE_LANG): "Produces 'Display options' portal box." # load the right message language _ = gettext_set_language(ln) values = [] for i in ['10', '25', '50', '100', '250', '500']: values.append({'value' : i, 'text' : i + ' ' + _("results")}) box = websearch_templates.tmpl_select( fieldname = 'rg', selected = str(CFG_WEBSEARCH_DEF_RECORDS_IN_GROUPS), css_class = 'address', values = values ) if self.get_sons(): box += websearch_templates.tmpl_select( fieldname = 'sc', css_class = 'address', values = [ {'value' : '1' , 'text' : CFG_SCOAP3_SITE and _("split by publisher/journal") or _("split by collection")}, {'value' : '0' , 'text' : _("single list")} ] ) return box def create_formatoptions(self, ln=CFG_SITE_LANG): "Produces 'Output format options' portal box." 
# load the right message language _ = gettext_set_language(ln) box = "" values = [] query = """SELECT f.code,f.name FROM format AS f, collection_format AS cf WHERE cf.id_collection=%d AND cf.id_format=f.id AND f.visibility='1' ORDER BY cf.score DESC, f.name ASC""" % self.id res = run_sql(query) if res: for row in res: values.append({'value' : row[0], 'text': row[1]}) else: values.append({'value' : 'hb', 'text' : "HTML %s" % _("brief")}) box = websearch_templates.tmpl_select( fieldname = 'of', css_class = 'address', values = values ) return box def create_searchwithin_selection_box(self, fieldname='f', value='', ln='en'): """Produces 'search within' selection box for the current collection.""" # get values query = """SELECT f.code,f.name FROM field AS f, collection_field_fieldvalue AS cff WHERE cff.type='sew' AND cff.id_collection=%d AND cff.id_field=f.id ORDER BY cff.score DESC, f.name ASC""" % self.id res = run_sql(query) values = [{'value' : '', 'text' : get_field_i18nname("any field", ln)}] if res: for row in res: values.append({'value' : row[0], 'text' : get_field_i18nname(row[1], ln)}) else: if CFG_CERN_SITE: for tmp in ['title', 'author', 'abstract', 'report number', 'year']: values.append({'value' : tmp.replace(' ', ''), 'text' : get_field_i18nname(tmp, ln)}) else: for tmp in ['title', 'author', 'abstract', 'keyword', 'report number', 'journal', 'year', 'fulltext', 'reference']: values.append({'value' : tmp.replace(' ', ''), 'text' : get_field_i18nname(tmp, ln)}) return websearch_templates.tmpl_searchwithin_select( fieldname = fieldname, ln = ln, selected = value, values = values ) def create_searchexample(self): "Produces search example(s) for the current collection." out = "$collSearchExamples = getSearchExample(%d, $se);" % self.id return out def create_searchfor(self, aas=CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, ln=CFG_SITE_LANG): "Produces either Simple or Advanced 'Search for' box for the current collection." if aas == 2: return self.create_searchfor_addtosearch(ln) elif aas == 1: return self.create_searchfor_advanced(ln) elif aas == 0: return self.create_searchfor_simple(ln) else: return self.create_searchfor_light(ln) def create_searchfor_addtosearch(self, ln=CFG_SITE_LANG): "Produces add-to-search 'Search for' box for the current collection." return websearch_templates.tmpl_searchfor_addtosearch( ln=ln, collection_id=self.name, record_count=self.nbrecs, searchwithin= self.create_searchwithin_selection_box(fieldname='f1', ln=ln), ) def create_searchfor_light(self, ln=CFG_SITE_LANG): "Produces light 'Search for' box for the current collection." return websearch_templates.tmpl_searchfor_light( ln=ln, collection_id = self.name, collection_name=self.get_name(ln=ln), record_count=self.nbrecs, example_search_queries=self.get_example_search_queries(), ) def create_searchfor_simple(self, ln=CFG_SITE_LANG): "Produces simple 'Search for' box for the current collection." return websearch_templates.tmpl_searchfor_simple( ln=ln, collection_id = self.name, collection_name=self.get_name(ln=ln), record_count=self.nbrecs, middle_option = self.create_searchwithin_selection_box(ln=ln), ) def create_searchfor_advanced(self, ln=CFG_SITE_LANG): "Produces advanced 'Search for' box for the current collection." 
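For reference, create_searchfor() above dispatches on the `aas` interface code. The mapping is small enough to restate at a glance (a sketch only; the method itself uses the if/elif chain shown above):

# aas: 2 = add-to-search, 1 = advanced, 0 = simple, anything else = light
AAS_SEARCHFOR_BUILDERS = {
    2: 'create_searchfor_addtosearch',
    1: 'create_searchfor_advanced',
    0: 'create_searchfor_simple',
}

def searchfor_method_name(aas):
    """Return the name of the 'Search for' builder used for a given aas code."""
    return AAS_SEARCHFOR_BUILDERS.get(aas, 'create_searchfor_light')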
return websearch_templates.tmpl_searchfor_advanced( ln = ln, collection_id = self.name, collection_name=self.get_name(ln=ln), record_count=self.nbrecs, middle_option_1 = self.create_searchwithin_selection_box('f1', ln=ln), middle_option_2 = self.create_searchwithin_selection_box('f2', ln=ln), middle_option_3 = self.create_searchwithin_selection_box('f3', ln=ln), searchoptions = self.create_searchoptions(), sortoptions = self.create_sortoptions(ln), rankoptions = self.create_rankoptions(ln), displayoptions = self.create_displayoptions(ln), formatoptions = self.create_formatoptions(ln) ) def calculate_reclist(self): """ Calculate, set and return the (reclist, reclist_with_nonpublic_subcolls, nbrecs_from_hosted_collections) tuple for the given collection.""" if str(self.dbquery).startswith("hostedcollection:"): # we don't normally use this function to calculate the reclist # for hosted collections. In case we do, recursively for a regular # ancestor collection, then quickly return the object attributes. return (self.reclist, self.reclist_with_nonpublic_subcolls, self.nbrecs) if self.calculate_reclist_run_already: # do we really have to recalculate? If not, # then return the object attributes return (self.reclist, self.reclist_with_nonpublic_subcolls, self.nbrecs_from_hosted_collections) write_message("... calculating reclist of %s" % self.name, verbose=6) reclist = intbitset() # will hold results for public sons only; good for storing into DB reclist_with_nonpublic_subcolls = intbitset() # will hold results for both public and nonpublic sons; good for deducing total # number of documents nbrecs_from_hosted_collections = 0 # will hold the total number of records from descendant hosted collections if not self.dbquery: # A - collection does not have dbquery, so query recursively all its sons # that are either non-restricted or that have the same restriction rules for coll in self.get_sons(): coll_reclist,\ coll_reclist_with_nonpublic_subcolls,\ coll_nbrecs_from_hosted_collection = coll.calculate_reclist() if ((coll.restricted_p() is None) or (coll.restricted_p() == self.restricted_p())): # add this reclist ``for real'' only if it is public reclist.union_update(coll_reclist) reclist_with_nonpublic_subcolls.union_update(coll_reclist_with_nonpublic_subcolls) # increment the total number of records from descendant hosted collections nbrecs_from_hosted_collections += coll_nbrecs_from_hosted_collection else: # B - collection does have dbquery, so compute it: # (note: explicitly remove DELETED records) if CFG_CERN_SITE: reclist = search_pattern_parenthesised(None, self.dbquery + \ ' -980__:"DELETED" -980__:"DUMMY"', ap=-9) #ap=-9 for allow queries containing hidden tags else: reclist = search_pattern_parenthesised(None, self.dbquery + ' -980__:"DELETED"', ap=-9) #ap=-9 allow queries containing hidden tags reclist_with_nonpublic_subcolls = copy.deepcopy(reclist) # store the results: self.nbrecs_from_hosted_collections = nbrecs_from_hosted_collections self.nbrecs = len(reclist_with_nonpublic_subcolls) + \ nbrecs_from_hosted_collections self.reclist = reclist self.reclist_with_nonpublic_subcolls = reclist_with_nonpublic_subcolls # last but not least, update the speed-up flag: self.calculate_reclist_run_already = 1 # return the two sets, as well as # the total number of records from descendant hosted collections: return (self.reclist, self.reclist_with_nonpublic_subcolls, self.nbrecs_from_hosted_collections) def calculate_nbrecs_for_external_collection(self, timeout=CFG_EXTERNAL_COLLECTION_TIMEOUT): 
"""Calculate the total number of records, aka nbrecs, for given external collection.""" #if self.calculate_reclist_run_already: # do we have to recalculate? #return self.nbrecs #write_message("... calculating nbrecs of external collection %s" % self.name, verbose=6) if external_collections_dictionary.has_key(self.name): engine = external_collections_dictionary[self.name] if engine.parser: self.nbrecs_tmp = engine.parser.parse_nbrecs(timeout) if self.nbrecs_tmp >= 0: return self.nbrecs_tmp # the parse_nbrecs() function returns negative values for some specific cases # maybe we can handle these specific cases, some warnings or something # for now the total number of records remains silently the same else: return self.nbrecs else: write_message("External collection %s does not have a parser!" % self.name, verbose=6) else: write_message("External collection %s not found!" % self.name, verbose=6) return 0 # last but not least, update the speed-up flag: #self.calculate_reclist_run_already = 1 def check_nbrecs_for_external_collection(self): """Check if the external collections has changed its total number of records, aka nbrecs. Rerurns True if the total number of records has changed and False if it's the same""" write_message("*** self.nbrecs = %s / self.cal...ion = %s ***" % (str(self.nbrecs), str(self.calculate_nbrecs_for_external_collection())), verbose=6) write_message("*** self.nbrecs != self.cal...ion = %s ***" % (str(self.nbrecs != self.calculate_nbrecs_for_external_collection()),), verbose=6) return self.nbrecs != self.calculate_nbrecs_for_external_collection(CFG_HOSTED_COLLECTION_TIMEOUT_NBRECS) def set_nbrecs_for_external_collection(self): """Set this external collection's total number of records, aka nbrecs""" if self.calculate_reclist_run_already: # do we have to recalculate? return write_message("... calculating nbrecs of external collection %s" % self.name, verbose=6) if self.nbrecs_tmp: self.nbrecs = self.nbrecs_tmp else: self.nbrecs = self.calculate_nbrecs_for_external_collection(CFG_HOSTED_COLLECTION_TIMEOUT_NBRECS) # last but not least, update the speed-up flag: self.calculate_reclist_run_already = 1 def update_reclist(self): "Update the record universe for given collection; nbrecs, reclist of the collection table." if self.update_reclist_run_already: # do we have to reupdate? return 0 write_message("... updating reclist of %s (%s recs)" % (self.name, self.nbrecs), verbose=6) sys.stdout.flush() try: ## In principle we could skip this update if old_reclist==reclist ## however we just update it here in case of race-conditions. - run_sql("UPDATE collection SET nbrecs=%s, reclist=%s WHERE id=%s", + run_sql("UPDATE collection SET nbrecs=%s, reclist=_binary %s WHERE id=%s", (self.nbrecs, self.reclist.fastdump(), self.id)) if self.old_reclist != self.reclist: self.reclist_updated_since_start = 1 else: write_message("... no changes in reclist detected", verbose=6) except Error, e: print "Database Query Error %d: %s." 
% (e.args[0], e.args[1]) sys.exit(1) # last but not least, update the speed-up flag: self.update_reclist_run_already = 1 return 0 def perform_display_collection(colID, colname, aas, ln, em, show_help_boxes): """Returns the data needed to display a collection page The arguments are as follows: colID - id of the collection to display colname - name of the collection to display aas - 0 if simple search, 1 if advanced search ln - language of the page em - code to display just part of the page show_help_boxes - whether to show the help boxes or not""" # check and update cache if necessary cachedfile = open(r"%s/collections/%s-ln=%s.html" % (CFG_CACHEDIR, colname.replace('/', '___SLASH___'), ln), "rb") try: data = cPickle.load(cachedfile) except ValueError: data = get_collection(colname).update_webpage_cache(ln) cachedfile.close() # check em value to return just part of the page if em != "": if EM_REPOSITORY["search_box"] not in em: data["searchfor_%s" % aas] = "" if EM_REPOSITORY["see_also_box"] not in em: data["focuson_%s" % aas] = "" if EM_REPOSITORY["all_portalboxes"] not in em: if EM_REPOSITORY["te_portalbox"] not in em: data["te_portalbox"] = "" if EM_REPOSITORY["np_portalbox"] not in em: data["np_portalbox"] = "" if EM_REPOSITORY["ne_portalbox"] not in em: data["ne_portalbox"] = "" if EM_REPOSITORY["tp_portalbox"] not in em: data["tp_portalbox"] = "" if EM_REPOSITORY["lt_portalbox"] not in em: data["lt_portalbox"] = "" if EM_REPOSITORY["rt_portalbox"] not in em: data["rt_portalbox"] = "" c_body = websearch_templates.tmpl_webcoll_body(ln, colID, data["te_portalbox"], data["searchfor_%s"%aas], data["np_portalbox"], data["narrowsearch_%s"%aas], data["focuson_%s"%aas], data["instantbrowse_%s"%aas], data["ne_portalbox"], em=="" or EM_REPOSITORY["body"] in em) if show_help_boxes <= 0: data["rt_portalbox"] = "" return (c_body, data["navtrail_%s"%aas], data["lt_portalbox"], data["rt_portalbox"], data["tp_portalbox"], data["te_portalbox"], data["last_updated"]) def get_datetime(var, format_string="%Y-%m-%d %H:%M:%S"): """Returns a date string according to the format string. It can handle normal date strings and shifts with respect to now.""" date = time.time() shift_re = re.compile("([-\+]{0,1})([\d]+)([dhms])") factors = {"d":24*3600, "h":3600, "m":60, "s":1} m = shift_re.match(var) if m: sign = m.groups()[0] == "-" and -1 or 1 factor = factors[m.groups()[2]] value = float(m.groups()[1]) date = time.localtime(date + sign * factor * value) date = strftime(format_string, date) else: date = time.strptime(var, format_string) date = strftime(format_string, date) return date def get_current_time_timestamp(): """Return timestamp corresponding to the current time.""" return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) def compare_timestamps_with_tolerance(timestamp1, timestamp2, tolerance=0): """Compare two timestamps TIMESTAMP1 and TIMESTAMP2, of the form '2005-03-31 17:37:26'. Optionally receives a TOLERANCE argument (in seconds). Return -1 if TIMESTAMP1 is less than TIMESTAMP2 minus TOLERANCE, 0 if they are equal within TOLERANCE limit, and 1 if TIMESTAMP1 is greater than TIMESTAMP2 plus TOLERANCE. 
""" # remove any trailing .00 in timestamps: timestamp1 = re.sub(r'\.[0-9]+$', '', timestamp1) timestamp2 = re.sub(r'\.[0-9]+$', '', timestamp2) # first convert timestamps to Unix epoch seconds: timestamp1_seconds = calendar.timegm(time.strptime(timestamp1, "%Y-%m-%d %H:%M:%S")) timestamp2_seconds = calendar.timegm(time.strptime(timestamp2, "%Y-%m-%d %H:%M:%S")) # now compare them: if timestamp1_seconds < timestamp2_seconds - tolerance: return -1 elif timestamp1_seconds > timestamp2_seconds + tolerance: return 1 else: return 0 def get_database_last_updated_timestamp(): """Return last updated timestamp for collection-related and record-related database tables. """ database_tables_timestamps = [] database_tables_timestamps.append(get_table_update_time('bibrec')) ## In INSPIRE bibfmt is on innodb and there is not such configuration bibfmt_last_update = run_sql("SELECT max(last_updated) FROM bibfmt") if bibfmt_last_update and bibfmt_last_update[0][0]: database_tables_timestamps.append(str(bibfmt_last_update[0][0])) try: database_tables_timestamps.append(get_table_update_time('idxWORD%')) except ValueError: # There are no indexes in the database. That's OK. pass database_tables_timestamps.append(get_table_update_time('collection%')) database_tables_timestamps.append(get_table_update_time('portalbox')) database_tables_timestamps.append(get_table_update_time('field%')) database_tables_timestamps.append(get_table_update_time('format%')) database_tables_timestamps.append(get_table_update_time('rnkMETHODNAME')) database_tables_timestamps.append(get_table_update_time('accROLE_accACTION_accARGUMENT', run_on_slave=True)) return max(database_tables_timestamps) def get_cache_last_updated_timestamp(): """Return last updated cache timestamp.""" try: f = open(CFG_CACHE_LAST_UPDATED_TIMESTAMP_FILE, "r") except: return "1970-01-01 00:00:00" timestamp = f.read() f.close() return timestamp def set_cache_last_updated_timestamp(timestamp): """Set last updated cache timestamp to TIMESTAMP.""" try: f = open(CFG_CACHE_LAST_UPDATED_TIMESTAMP_FILE, "w") except: pass f.write(timestamp) f.close() return timestamp def main(): """Main that construct all the bibtask.""" task_init(authorization_action="runwebcoll", authorization_msg="WebColl Task Submission", description="""Description: webcoll updates the collection cache (record universe for a given collection plus web page elements) based on invenio.conf and DB configuration parameters. If the collection name is passed as an argument, only this collection's cache will be updated. If the recursive option is set as well, the collection's descendants will also be updated.\n""", help_specific_usage=" -c, --collection\t Update cache for the given " "collection only. [all]\n" " -r, --recursive\t Update cache for the given collection and all its\n" "\t\t\t descendants (to be used in combination with -c). [no]\n" " -q, --quick\t\t Skip webpage cache update for those collections whose\n" "\t\t\t reclist was not changed. Note: if you use this option, it is advised\n" "\t\t\t to schedule, e.g. a nightly 'webcoll --force'. [no]\n" " -f, --force\t\t Force update even if cache is up to date. [no]\n" " -p, --part\t\t Update only certain cache parts (1=reclist," " 2=webpage). [both]\n" " -l, --language\t Update pages in only certain language" " (e.g. fr,it,...). 
[all]\n", version=__revision__, specific_params=("c:rqfp:l:", [ "collection=", "recursive", "quick", "force", "part=", "language=" ]), task_submit_elaborate_specific_parameter_fnc=task_submit_elaborate_specific_parameter, task_submit_check_options_fnc=task_submit_check_options, task_run_fnc=task_run_core) def task_submit_elaborate_specific_parameter(key, value, opts, args): """ Given the string key it checks it's meaning, eventually using the value. Usually it fills some key in the options dict. It must return True if it has elaborated the key, False, if it doesn't know that key. eg: if key in ['-n', '--number']: self.options['number'] = value return True return False """ if key in ("-c", "--collection"): task_set_option("collection", value) elif key in ("-r", "--recursive"): task_set_option("recursive", 1) elif key in ("-f", "--force"): task_set_option("force", 1) elif key in ("-q", "--quick"): task_set_option("quick", 1) elif key in ("-p", "--part"): task_set_option("part", int(value)) elif key in ("-l", "--language"): languages = task_get_option("language", []) languages += value.split(',') for ln in languages: if ln not in CFG_SITE_LANGS: print 'ERROR: "%s" is not a recognized language code' % ln return False task_set_option("language", languages) else: return False return True def task_submit_check_options(): if task_has_option('collection'): coll = get_collection(task_get_option("collection")) if coll.id is None: print 'ERROR: Collection "%s" does not exist' % coll.name return False return True def task_run_core(): """ Reimplement to add the body of the task.""" # # ------->--->time--->------> # (-1) | ( 0) | ( 1) # | | | # [T.db] | [T.fc] | [T.db] # | | | # |<-tol|tol->| # # the above is the compare_timestamps_with_tolerance result "diagram" # [T.db] stands fore the database timestamp and [T.fc] for the file cache timestamp # ( -1, 0, 1) stand for the returned value # tol stands for the tolerance in seconds # # When a record has been added or deleted from one of the collections the T.db becomes greater that the T.fc # and when webcoll runs it is fully ran. It recalculates the reclists and nbrecs, and since it updates the # collections db table it also updates the T.db. The T.fc is set as the moment the task started running thus # slightly before the T.db (practically the time distance between the start of the task and the last call of # update_reclist). Therefore when webcoll runs again, and even if no database changes have taken place in the # meanwhile, it fully runs (because compare_timestamps_with_tolerance returns 0). This time though, and if # no databases changes have taken place, the T.db remains the same while T.fc is updated and as a result if # webcoll runs again it will not be fully ran # task_run_start_timestamp = get_current_time_timestamp() colls = [] # decide whether we need to run or not, by comparing last updated timestamps: write_message("Database timestamp is %s." % get_database_last_updated_timestamp(), verbose=3) write_message("Collection cache timestamp is %s." % get_cache_last_updated_timestamp(), verbose=3) if task_has_option("part"): write_message("Running cache update part %s only." 
% task_get_option("part"), verbose=3) if check_nbrecs_for_all_external_collections() or task_has_option("force") or \ compare_timestamps_with_tolerance(get_database_last_updated_timestamp(), get_cache_last_updated_timestamp(), CFG_CACHE_LAST_UPDATED_TIMESTAMP_TOLERANCE) >= 0: ## either forced update was requested or cache is not up to date, so recreate it: # firstly, decide which collections to do: if task_has_option("collection"): coll = get_collection(task_get_option("collection")) colls.append(coll) if task_has_option("recursive"): r_type_descendants = coll.get_descendants(type='r') colls += r_type_descendants v_type_descendants = coll.get_descendants(type='v') colls += v_type_descendants else: res = run_sql("SELECT name FROM collection ORDER BY id") for row in res: colls.append(get_collection(row[0])) # secondly, update collection reclist cache: if task_get_option('part', 1) == 1: i = 0 for coll in colls: i += 1 write_message("%s / reclist cache update" % coll.name) if str(coll.dbquery).startswith("hostedcollection:"): coll.set_nbrecs_for_external_collection() else: coll.calculate_reclist() coll.update_reclist() task_update_progress("Part 1/2: done %d/%d" % (i, len(colls))) task_sleep_now_if_required(can_stop_too=True) # thirdly, update collection webpage cache: if task_get_option("part", 2) == 2: # Updates cache only for chosen languages or for all available ones if none was chosen languages = task_get_option("language", CFG_SITE_LANGS) write_message("Cache update for the following languages: %s" % str(languages), verbose=3) i = 0 for coll in colls: i += 1 if coll.reclist_updated_since_start or task_has_option("collection") or task_get_option("force") or not task_get_option("quick"): write_message("%s / webpage cache update" % coll.name) for lang in languages: coll.update_webpage_cache(lang) else: write_message("%s / webpage cache seems not to need an update and --quick was used" % coll.name, verbose=2) task_update_progress("Part 2/2: done %d/%d" % (i, len(colls))) task_sleep_now_if_required(can_stop_too=True) # finally update the cache last updated timestamp: # (but only when all collections were updated, not when only # some of them were forced-updated as per admin's demand) if not task_has_option("collection"): set_cache_last_updated_timestamp(task_run_start_timestamp) write_message("Collection cache timestamp is set to %s." % get_cache_last_updated_timestamp(), verbose=3) else: ## cache up to date, we don't have to run write_message("Collection cache is up to date, no need to run.") ## we are done: return True ### okay, here we go: if __name__ == '__main__': main() diff --git a/modules/websession/lib/session.py b/modules/websession/lib/session.py index 1ae199a37..9f1f0bd4a 100644 --- a/modules/websession/lib/session.py +++ b/modules/websession/lib/session.py @@ -1,629 +1,629 @@ # -*- coding: utf-8 -*- # This file is part of Invenio. -# Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2015 CERN. +# Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2015, 2016 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ Session management adapted from mod_python Session class. Just use L{get_session} to obtain a session object (with a dictionary interface, which will let you store permanent information). """ from invenio.webinterface_handler_wsgi_utils import Cookie, get_cookie import cPickle import time import random import re import sys import os if sys.hexversion < 0x2060000: from md5 import md5 else: from hashlib import md5 from invenio.dateutils import convert_datestruct_to_datetext from invenio.dbquery import blob_to_string from invenio.config import (CFG_WEBSESSION_EXPIRY_LIMIT_REMEMBER, CFG_WEBSESSION_EXPIRY_LIMIT_DEFAULT, CFG_SITE_URL, CFG_SITE_SECURE_URL, CFG_WEBSESSION_IPADDR_CHECK_SKIP_BITS, CFG_WEBSEARCH_PREV_NEXT_HIT_FOR_GUESTS, CFG_WEBSESSION_STORAGE) from invenio.websession_config import (CFG_WEBSESSION_COOKIE_NAME, CFG_WEBSESSION_ONE_DAY, CFG_WEBSESSION_CLEANUP_CHANCE) from invenio.dbquery import run_sql from invenio.redisutils import get_redis CFG_FULL_HTTPS = CFG_SITE_URL.lower().startswith("https://") if CFG_WEBSEARCH_PREV_NEXT_HIT_FOR_GUESTS: _CFG_SESSION_NON_USEFUL_KEYS = ('uid', 'user_info') else: _CFG_SESSION_NON_USEFUL_KEYS = ('uid', 'user_info', 'websearch-last-query', 'websearch-last-query-hits') def get_session(req, sid=None): """ Obtain a session. If the session has already been created for the current request, returns the already existing session. @param req: the mod_python request object. @type req: mod_python request object @param sid: the session identifier of an already existing session. @type sid: 32 hexadecimal string @return: the session. @rtype: InvenioSession @raise ValueError: if C{sid} is provided and it doesn't correspond to a valid session. """ if sid is not None: req._session = InvenioSession(req, sid) return req._session if not hasattr(req, '_session'): req._session = InvenioSession(req, sid) return req._session class InvenioSessionBase(dict): """ This class implements a Session handling based on MySQL. @param req: the mod_python request object. @type req: mod_python request object @param sid: the session identifier if already known @type sid: 32 hexadecimal string @ivar _remember_me: if the session cookie should last one day or until the browser is closed. @type _remember_me: bool @note: The code is heavily based on ModPython 3.3.1 DBMSession implementation. @note: This class implements IP verification to prevent basic cookie stealing. @raise ValueError: if C{sid} is provided and correspond to a broken session. """ def __init__(self, req, sid=None): self._remember_me = False self._req, self._sid, self._secret = req, sid, None self._new = 1 self._created = 0 self._accessed = 0 self._timeout = 0 self._invalid = 0 self._dirty = False self._http_ip = None self._https_ip = None self.__need_https = False self._cleanup_function = None dict.__init__(self) if not self._sid: # check to see if cookie exists cookie = get_cookie(req, CFG_WEBSESSION_COOKIE_NAME) if cookie: self._sid = cookie.value else: stub_cookie = get_cookie(req, CFG_WEBSESSION_COOKIE_NAME + 'stub') self.__need_https = stub_cookie and stub_cookie.value == 'HTTPS' if self._sid: if not _check_sid(self._sid): if sid: # Supplied explicitly by user of the class, # raise an exception and make the user code # deal with it. 
raise ValueError("Invalid Session ID: sid=%s" % sid) else: # Derived from the cookie sent by browser, # wipe it out so it gets replaced with a # correct value. self._sid = None if self._sid: # attempt to load ourselves if self.load(): self._new = 0 if self._new: # make a new session self._sid = _new_sid(self._req) remote_ip = self._req.remote_ip if self._req.is_https(): self._https_ip = remote_ip else: self._http_ip = remote_ip self._created = time.time() self._timeout = CFG_WEBSESSION_EXPIRY_LIMIT_DEFAULT * \ CFG_WEBSESSION_ONE_DAY self._accessed = time.time() # need cleanup? if random.randint(1, CFG_WEBSESSION_CLEANUP_CHANCE) == 1: self.cleanup() if self._new and (not self.__need_https or self._req.is_https()): ## We want to issue cookies only in case this is a new session ## and there is not already a session cookie that is available ## only over HTTPS for cookie in self.make_cookies(): self._req.set_cookie(cookie) def get_dirty(self): """ Is this session dirty? """ return self._dirty def set_dirty(self, dummy=True): """ Flag this session as dirty. It takes a parameter, just in order to be used within a property """ self._dirty = True dirty = property(get_dirty, set_dirty) def __setitem__(self, key, value): if self.get(key) != value: dict.__setitem__(self, key, value) self._dirty = True def __delitem__(self, key): if key in self: dict.__delitem__(self, key) self._dirty = True def set_remember_me(self, remember_me=True): """ Set/Unset the L{_remember_me} flag. @param remember_me: True if the session cookie should last one day or until the browser is closed. @type remember_me: bool """ self._remember_me = remember_me if remember_me: self.set_timeout(CFG_WEBSESSION_EXPIRY_LIMIT_REMEMBER * CFG_WEBSESSION_ONE_DAY) else: self.set_timeout(CFG_WEBSESSION_EXPIRY_LIMIT_DEFAULT * CFG_WEBSESSION_ONE_DAY) for cookie in self.make_cookies(): self._req.set_cookie(cookie) def load(self): """ Load the session from the database. @return: 1 in case of success, 0 otherwise. @rtype: integer """ session_dict = None invalid = False res = self.load_from_storage(self._sid) if res: session_dict = cPickle.loads(blob_to_string(res)) remote_ip = self._req.remote_ip if self._req.is_https(): if session_dict['_https_ip'] is not None: if ':' in remote_ip: ## IPV6 address, we don't skip bits if session_dict['_https_ip'] != remote_ip: invalid = True else: if _mkip(session_dict['_https_ip']) >> \ CFG_WEBSESSION_IPADDR_CHECK_SKIP_BITS != \ _mkip(remote_ip) >> \ CFG_WEBSESSION_IPADDR_CHECK_SKIP_BITS: invalid = True else: session_dict['_https_ip'] = remote_ip else: if session_dict['_http_ip'] is not None: if ':' in remote_ip: ## IPV6 address, we don't skip bits if session_dict['_http_ip'] != remote_ip: invalid = True else: if _mkip(session_dict['_http_ip']) >> \ CFG_WEBSESSION_IPADDR_CHECK_SKIP_BITS != \ _mkip(remote_ip) >> \ CFG_WEBSESSION_IPADDR_CHECK_SKIP_BITS: invalid = True else: session_dict['_http_ip'] = remote_ip if session_dict is None: return 0 if invalid: return 0 if (time.time() - session_dict["_accessed"]) > \ session_dict["_timeout"]: return 0 self._created = session_dict["_created"] self._accessed = session_dict["_accessed"] self._timeout = session_dict["_timeout"] self._remember_me = session_dict["_remember_me"] self.update(session_dict["_data"]) return 1 def is_useful(self): """ Return True if the session contains some key considered useful (i.e. 
that deserve being preserved) """ for key in self: if key not in _CFG_SESSION_NON_USEFUL_KEYS: return True return False def save(self): """ Save the session to the database. """ uid = self.get('uid', -1) if (not self.__need_https or self._req.is_https()) and not self._invalid and self._sid and self._dirty and (uid > 0 or self.is_useful()): ## We store something only for real users or useful sessions. session_dict = {"_data" : self.copy(), "_created" : self._created, "_accessed": self._accessed, "_timeout" : self._timeout, "_http_ip" : self._http_ip, "_https_ip" : self._https_ip, "_remember_me" : self._remember_me } session_object = cPickle.dumps(session_dict, -1) self.save_in_storage(self._sid, session_object, self._timeout, uid) for cookie in self.make_cookies(): self._req.set_cookie(cookie) ## No more dirty :-) self._dirty = False def delete(self): """ Delete the session. """ self.delete_from_storage(self._sid) self.clear() def invalidate(self): """ Declare the session as invalid. """ cookies = self.make_cookies() for cookie in cookies: cookie.expires = 0 self._req.set_cookie(cookie) self.delete() self._invalid = 1 if hasattr(self._req, '_session'): delattr(self._req, '_session') def make_cookies(self): """ Create the necessary cookies to implement secure session handling (possibly over HTTPS). @return: a list of cookies. """ cookies = [] uid = self.get('uid', -1) if uid > 0 and CFG_SITE_SECURE_URL.startswith("https://"): stub_cookie = Cookie(CFG_WEBSESSION_COOKIE_NAME + 'stub', 'HTTPS', HttpOnly=True) else: stub_cookie = Cookie(CFG_WEBSESSION_COOKIE_NAME + 'stub', 'NO', HttpOnly=True) cookies.append(stub_cookie) if self._req.is_https() or not CFG_SITE_SECURE_URL.startswith("https://") or uid <= 0: cookie = Cookie(CFG_WEBSESSION_COOKIE_NAME, self._sid, HttpOnly=True) if CFG_SITE_SECURE_URL.startswith("https://") and uid > 0: cookie.secure = True cookies.append(cookie) for cookie in cookies: cookie.path = '/' if self._remember_me: cookie.expires = time.time() + self._timeout return cookies def initial_http_ip(self): """ @return: the initial ip addressed for the HTTP protocol for which this session was issued. @rtype: string @note: it returns None if this session has always been used through HTTPS requests. """ return self._http_ip def initial_https_ip(self): """ @return: the initial ip addressed for the HTTPS protocol for which this session was issued. @rtype: string @note: it returns None if this session has always been used through HTTP requests. """ return self._https_ip def is_new(self): """ @return: True if the session has just been created. @rtype: bool """ return not not self._new def sid(self): """ @return: the session identifier. @rtype: 32 hexadecimal string """ return self._sid def created(self): """ @return: the UNIX timestamp for when the session has been created. @rtype: double """ return self._created def last_accessed(self): """ @return: the UNIX timestamp for when the session has been last accessed. @rtype: double """ return self._accessed def timeout(self): """ @return: the number of seconds from the last accessed timestamp, after which the session is invalid. @rtype: double """ return self._timeout def set_timeout(self, secs): """ Set the number of seconds from the last accessed timestamp, after which the session is invalid. @param secs: the number of seconds. @type secs: double """ self._timeout = secs def cleanup(self): """ Perform the database session cleanup. 
""" if self._cleanup_function: self._req.register_cleanup(self._cleanup_function) self._req.log_error("InvenioSession: registered database cleanup.") ## NOTE: Let's disable __del__ to avoid garbage collection not to ## be able to delete circular references involving the session ## We can .save() anyway in good points, such as at the end of ## of the application request #def __del__(self): #self.save() def get_need_https(self): return self.__need_https ## This property will be True if the connection need to be set to HTTPS ## in order for the session to be successfully read. This can actually ## be checked by not having a cookie, but just having the stub_cookie. ## The default cookie is only sent via HTTPS, while the stub_cookie ## is also sent via HTTP and contains the uid, of the user. So if there ## is actually a stub cookie and its value is different than -1 this ## property will be True, meaning the server should redirect the client ## to an HTTPS connection if she really wants to access authenticated ## resources. need_https = property(get_need_https) def _init_rnd(): """ Initialize random number generators. This is key in multithreaded env, see Python docs for random. @return: the generators. @rtype: list of generators """ # query max number of threads gennum = 10 # make generators # this bit is from Python lib reference random_generator = random.Random(time.time()) result = [random_generator] for dummy in range(gennum - 1): laststate = random_generator.getstate() random_generator = random.Random() random_generator.setstate(laststate) random_generator.jumpahead(1000000) result.append(random_generator) return result _RANDOM_GENERATORS = _init_rnd() _RANDOM_ITERATOR = iter(_RANDOM_GENERATORS) def _get_generator(): """ get rnd_iter.next(), or start over if we reached the end of it @return: the next random number. @rtype: double """ global _RANDOM_ITERATOR try: return _RANDOM_ITERATOR.next() except StopIteration: # the small potential for two threads doing this # seems does not warrant use of a lock _RANDOM_ITERATOR = iter(_RANDOM_GENERATORS) return _RANDOM_ITERATOR.next() _RE_VALIDATE_SID = re.compile('[0-9a-f]{32}$') def _check_sid(sid): """ Check the validity of the session identifier. The sid must be 32 characters long, and consisting of the characters 0-9 and a-f. The sid may be passed in a cookie from the client and as such should not be trusted. This is particularly important in FileSession, where the session filename is derived from the sid. A sid containing '/' or '.' characters could result in a directory traversal attack @param sid: the session identifier. @type sid: string @return: True if the session identifier is valid. @rtype: bool """ return not not _RE_VALIDATE_SID.match(sid) def _new_sid(req): """ Make a number based on current time, pid, remote ip and two random ints, then hash with md5. This should be fairly unique and very difficult to guess. @param req: the mod_python request object. @type req: mod_python request object. @return: the session identifier. @rtype: 32 hexadecimal string @warning: The current implementation of _new_sid returns an md5 hexdigest string. To avoid a possible directory traversal attack in FileSession the sid is validated using the _check_sid() method and the compiled regex validate_sid_re. The sid will be accepted only if len(sid) == 32 and it only contains the characters 0-9 and a-f. 
If you change this implementation of _new_sid, make sure to also change the validation scheme, as well as the test_Session_illegal_sid() unit test in test/test.py. """ the_time = long(time.time()*10000) pid = os.getpid() random_generator = _get_generator() rnd1 = random_generator.randint(0, 999999999) rnd2 = random_generator.randint(0, 999999999) remote_ip = req.remote_ip return md5("%d%d%d%d%s" % ( the_time, pid, rnd1, rnd2, remote_ip) ).hexdigest() def _mkip(ip): """ Compute a numerical value for a dotted IP """ num = 0L for i in ip.split('.'): num = (num << 8) + int(i) return num class InvenioSessionMySQL(InvenioSessionBase): def __init__(self, req, sid=None): def cb_session_cleanup(data=None): """ Session cleanup procedure which to be executed at the end of the request handling. """ run_sql("""DELETE LOW_PRIORITY FROM session WHERE session_expiry <= UTC_TIMESTAMP()""") self.cleanup_function = cb_session_cleanup super(InvenioSessionMySQL, self).__init__(req, sid) def load_from_storage(self, sid): ret = run_sql("""SELECT session_object FROM session WHERE session_key = %s""", [sid]) if ret: return ret[0][0] def delete_from_storage(self, sid): return run_sql("""DELETE LOW_PRIORITY FROM session WHERE session_key=%s""", [sid]) def save_in_storage(self, sid, session_object, timeout, uid): session_key = sid session_expiry = time.time() + timeout + CFG_WEBSESSION_ONE_DAY session_expiry = convert_datestruct_to_datetext(time.gmtime(session_expiry)) run_sql("""INSERT INTO session( session_key, session_expiry, session_object, uid - ) VALUES (%s, %s, %s, %s) + ) VALUES (%s, %s, _binary %s, %s) ON DUPLICATE KEY UPDATE session_expiry=%s, - session_object=%s, + session_object=_binary %s, uid=%s """, (session_key, session_expiry, session_object, uid, session_expiry, session_object, uid)) class InvenioSessionRedis(InvenioSessionBase): def generate_key(self, sid): return 'session_%s' % sid def load_from_storage(self, sid): return get_redis().get(self.generate_key(sid)) def delete_from_storage(self, sid): return get_redis().delete(self.generate_key(sid)) def save_in_storage(self, sid, session_object, timeout, uid): # pylint: disable=W0613 return get_redis().setex(self.generate_key(sid), session_object, timeout) if CFG_WEBSESSION_STORAGE == 'mysql': InvenioSession = InvenioSessionMySQL elif CFG_WEBSESSION_STORAGE == 'redis': InvenioSession = InvenioSessionRedis diff --git a/requirements.txt b/requirements.txt index 160ec6b0b..77a74e64f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,31 +1,31 @@ # Invenio requirements. MySQL-python==1.2.5 rdflib==2.4.2 reportlab==2.5 python-dateutil<=1.9999 python-magic==0.4.2 -http://www.reportlab.com/ftp/pyRXP-1.16-daily-unix.tar.gz +http://invenio-software.org/download/python/pyRXP-1.16-daily-unix.tar.gz numpy==1.7.0 lxml==3.1.2 mechanize==0.2.5 python-Levenshtein==0.10.2 PyStemmer==1.3.0 -https://py-editdist.googlecode.com/files/py-editdist-0.3.tar.gz +http://invenio-software.org/download/python/py-editdist-0.3.tar.gz feedparser==5.1.3 BeautifulSoup==3.2.1 beautifulsoup4==4.1.3 python-twitter==2.0 msgpack-python==0.3.0 pyparsing==1.5.6 requests==2.2.0 PyPDF2==1.19 rauth==0.6.2 unidecode==0.04.14 python-openid==2.2.5 qrcode==4.0.4 pillow==2.3.0 jinja2==2.7.2 redis==2.9.0 nydus==0.10.6 Cerberus==0.5 matplotlib==1.0.1 diff --git a/scripts/create-instance.sh b/scripts/create-instance.sh index 39257fd04..d52334771 100755 --- a/scripts/create-instance.sh +++ b/scripts/create-instance.sh @@ -1,264 +1,264 @@ #!/usr/bin/env bash # # This file is part of Invenio. 
# Copyright (C) 2016 CERN. # # Invenio is free software; you can redistribute it # and/or modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be # useful, but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the # Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, # MA 02111-1307, USA. # # In applying this license, CERN does not # waive the privileges and immunities granted to it by virtue of its status # as an Intergovernmental Organization or submit itself to any jurisdiction. # quit on errors: set -o errexit # check environment variables: if [ "${INVENIO_MYSQL_HOST}" = "" ]; then echo "[ERROR] Please set environment variable INVENIO_MYSQL_HOST before runnning this script." echo "[ERROR] Example: export INVENIO_MYSQL_HOST=192.168.50.11" exit 1 fi if [ "${INVENIO_MYSQL_DBNAME}" = "" ]; then echo "[ERROR] Please set environment variable INVENIO_MYSQL_DBNAME before runnning this script." echo "[ERROR] Example: INVENIO_MYSQL_DBNAME=invenio1" exit 1 fi if [ "${INVENIO_MYSQL_DBUSER}" = "" ]; then echo "[ERROR] Please set environment variable INVENIO_MYSQL_DBUSER before runnning this script." echo "[ERROR] Example: INVENIO_MYSQL_DBUSER=invenio1" exit 1 fi if [ "${INVENIO_MYSQL_DBPASS}" = "" ]; then echo "[ERROR] Please set environment variable INVENIO_MYSQL_DBPASS before runnning this script." echo "[ERROR] Example: INVENIO_MYSQL_DBPASS=dbpass123" exit 1 fi if [ "${INVENIO_WEB_HOST}" = "" ]; then echo "[ERROR] Please set environment variable INVENIO_WEB_HOST before runnning this script." echo "[ERROR] Example: export INVENIO_WEB_HOST=192.168.50.10" exit 1 fi if [ "${INVENIO_WEB_DSTDIR}" = "" ]; then echo "[ERROR] Please set environment variable INVENIO_WEB_DSTDIR before runnning this script." echo "[ERROR] Example: export INVENIO_WEB_DSTDIR=/opt/invenio" exit 1 fi if [ "${INVENIO_WEB_USER}" = "" ]; then echo "[ERROR] Please set environment variable INVENIO_WEB_USER before runnning this script." echo "[ERROR] Example: export INVENIO_WEB_USER=www-data" exit 1 fi # check optional environment variables: INVENIO_WEB_SMTP_PORT=${INVENIO_WEB_SMTP_PORT:=25} # quit on unbound symbols: set -o nounset # runs inside virtual environment? VIRTUAL_ENV=${VIRTUAL_ENV:=} # runs as root or needs sudo? if [[ "$EUID" -ne 0 ]]; then sudo='sudo' else sudo='' fi create_apache_vhost_ubuntu_precise () { sudo DEBIAN_FRONTEND=noninteractive apt-get install -y ssl-cert sudo mkdir -p /etc/apache2/ssl if [ ! -e /etc/apache2/ssl/apache.pem ]; then sudo DEBIAN_FRONTEND=noninteractive /usr/sbin/make-ssl-cert \ /usr/share/ssl-cert/ssleay.cnf /etc/apache2/ssl/apache.pem fi if [ ! -L /etc/apache2/sites-available/invenio.conf ]; then sudo ln -fs "${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost.conf" \ /etc/apache2/sites-available/invenio.conf fi if [ ! -e "${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost.conf" ]; then # create them empty for the time being so that apache would start sudo mkdir -p "${INVENIO_WEB_DSTDIR}/etc/apache/" sudo touch "${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost.conf" sudo chown -R "${INVENIO_WEB_USER}.${INVENIO_WEB_USER}" "${INVENIO_WEB_DSTDIR}" fi if [ ! 
-L /etc/apache2/sites-available/invenio-ssl.conf ]; then sudo ln -fs "${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost-ssl.conf" \ /etc/apache2/sites-available/invenio-ssl.conf fi if [ ! -e "${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost-ssl.conf" ]; then # create them empty for the time being so that apache would start sudo mkdir -p "${INVENIO_WEB_DSTDIR}/etc/apache/" sudo touch "${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost-ssl.conf" sudo chown -R "${INVENIO_WEB_USER}.${INVENIO_WEB_USER}" "${INVENIO_WEB_DSTDIR}" fi if [ -e /etc/apache2/sites-available/default-ssl ]; then sudo /usr/sbin/a2dissite "*default*" fi sudo /usr/sbin/a2ensite "invenio*" sudo /usr/sbin/a2enmod ssl sudo /usr/sbin/a2enmod version || echo "[WARNING] Ignoring 'a2enmod version' command; hoping IfVersion is built-in." sudo /usr/sbin/a2enmod xsendfile sudo /etc/init.d/apache2 restart } create_apache_vhost_centos6 () { if ! grep -q "Include ${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost.conf" /etc/httpd/conf/httpd.conf; then echo "Include ${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost.conf" | sudo tee -a /etc/httpd/conf/httpd.conf fi if ! grep -q "Include ${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost-ssl.conf" /etc/httpd/conf/httpd.conf; then echo "Include ${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost-ssl.conf" | sudo tee -a /etc/httpd/conf/httpd.conf fi if ! grep -q "TraceEnable off" /etc/httpd/conf/httpd.conf; then echo "TraceEnable off" | sudo tee -a /etc/httpd/conf/httpd.conf fi if ! grep -q "SSLProtocol all -SSLv2" /etc/httpd/conf/httpd.conf; then echo "SSLProtocol all -SSLv2" | sudo tee -a /etc/httpd/conf/httpd.conf fi sudo sed -i 's,^Alias /error/,#Alias /error/,g' /etc/httpd/conf/httpd.conf } create_symlinks () { $sudo mkdir -p "${INVENIO_WEB_DSTDIR}" $sudo chown "${INVENIO_WEB_USER}.${INVENIO_WEB_USER}" "${INVENIO_WEB_DSTDIR}" $sudo -u "${INVENIO_WEB_USER}" mkdir -p "${INVENIO_WEB_DSTDIR}/lib/python/invenio" for pythonversion in python2.4 python2.6 python2.7; do for libversion in lib lib64 local/lib local/lib64; do for packageversion in site-packages dist-packages; do if [ -d "/usr/$libversion/$pythonversion/$packageversion/" ] && [ ! 
-L "/usr/$libversion/$pythonversion/$packageversion/invenio" ]; then $sudo ln -s "${INVENIO_WEB_DSTDIR}/lib/python/invenio" "/usr/$libversion/$pythonversion/$packageversion/invenio" fi done done done } install_sources () { cd "${INVENIO_SRCDIR}" rm -rf autom4te.cache/ aclocal automake -a autoconf ./configure --prefix="${INVENIO_WEB_DSTDIR}" make clean -s make -s sudo -u "${INVENIO_WEB_USER}" make -s install - #sudo -u "${INVENIO_WEB_USER}" make -s install-jquery-plugins + sudo -u "${INVENIO_WEB_USER}" make -s install-jquery-plugins sudo -u "${INVENIO_WEB_USER}" make -s install-mathjax-plugin sudo -u "${INVENIO_WEB_USER}" make -s install-ckeditor-plugin sudo -u "${INVENIO_WEB_USER}" make -s install-pdfa-helper-files sudo -u "${INVENIO_WEB_USER}" make -s install-mediaelement } create_openoffice_tmp_space () { sudo mkdir -p "${INVENIO_WEB_DSTDIR}/var/tmp/ooffice-tmp-files" sudo chown -R nobody "${INVENIO_WEB_DSTDIR}/var/tmp/ooffice-tmp-files" sudo chmod -R 755 "${INVENIO_WEB_DSTDIR}/var/tmp/ooffice-tmp-files" } configure_instance () { # create invenio-local.conf echo "[Invenio] CFG_SITE_URL = http://${INVENIO_WEB_HOST} CFG_SITE_SECURE_URL = https://${INVENIO_WEB_HOST} CFG_DATABASE_HOST = ${INVENIO_MYSQL_HOST} CFG_DATABASE_NAME = ${INVENIO_MYSQL_DBNAME} CFG_DATABASE_USER = ${INVENIO_MYSQL_DBUSER} CFG_DATABASE_PASS = ${INVENIO_MYSQL_DBPASS} CFG_SITE_ADMIN_EMAIL = ${INVENIO_ADMIN_EMAIL} CFG_SITE_SUPPORT_EMAIL = ${INVENIO_ADMIN_EMAIL} CFG_WEBALERT_ALERT_ENGINE_EMAIL = ${INVENIO_ADMIN_EMAIL} CFG_WEBCOMMENT_ALERT_ENGINE_EMAIL = ${INVENIO_ADMIN_EMAIL} CFG_WEBCOMMENT_DEFAULT_MODERATOR = ${INVENIO_ADMIN_EMAIL} CFG_BIBAUTHORID_AUTHOR_TICKET_ADMIN_EMAIL = ${INVENIO_ADMIN_EMAIL} CFG_BIBCATALOG_SYSTEM_EMAIL_ADDRESS = ${INVENIO_ADMIN_EMAIL} CFG_BIBSCHED_PROCESS_USER = ${INVENIO_WEB_USER} CFG_MISCUTIL_SMTP_PORT = ${INVENIO_WEB_SMTP_PORT} " | \ sudo -u "${INVENIO_WEB_USER}" tee "${INVENIO_WEB_DSTDIR}/etc/invenio-local.conf" # update instance with this information: sudo -u "${INVENIO_WEB_USER}" "${INVENIO_WEB_DSTDIR}/bin/inveniocfg" --update-all } create_tables () { sudo -u "${INVENIO_WEB_USER}" "${INVENIO_WEB_DSTDIR}/bin/inveniocfg" --create-tables --yes-i-know } create_apache_configuration () { sudo -u "${INVENIO_WEB_USER}" VIRTUAL_ENV="${VIRTUAL_ENV}" "${INVENIO_WEB_DSTDIR}/bin/inveniocfg" --create-apache-conf } restart_apache_ubuntu_precise () { $sudo /etc/init.d/apache2 restart } restart_apache_centos6 () { $sudo /etc/init.d/httpd restart } main () { # detect OS distribution and release version: if hash lsb_release 2> /dev/null; then os_distribution=$(lsb_release -i | cut -f 2) os_release=$(lsb_release -r | cut -f 2 | grep -oE '[0-9]+\.' | cut -d. -f1 | head -1) elif [ -e /etc/redhat-release ]; then os_distribution=$(cut -d ' ' -f 1 /etc/redhat-release) os_release=$(grep -oE '[0-9]+\.' /etc/redhat-release | cut -d. -f1 | head -1) else os_distribution="UNDETECTED" os_release="UNDETECTED" fi # call appropriate provisioning functions: if [ "$os_distribution" = "Ubuntu" ]; then if [ "$os_release" = "12" ]; then create_apache_vhost_ubuntu_precise create_symlinks install_sources create_openoffice_tmp_space configure_instance create_tables create_apache_configuration restart_apache_ubuntu_precise else echo "[ERROR] Sorry, unsupported release ${os_release}." 
            exit 1
        fi
    elif [ "$os_distribution" = "CentOS" ]; then
        if [ "$os_release" = "6" ]; then
            create_apache_vhost_centos6
            create_symlinks
            install_sources
            create_openoffice_tmp_space
            configure_instance
            create_tables
            create_apache_configuration
            restart_apache_centos6
        else
            echo "[ERROR] Sorry, unsupported release ${os_release}."
            exit 1
        fi
    else
        echo "[ERROR] Sorry, unsupported distribution ${os_distribution}."
        exit 1
    fi
}

main
diff --git a/scripts/drop-instance.sh b/scripts/drop-instance.sh
index ae23a0443..8f0a81369 100755
--- a/scripts/drop-instance.sh
+++ b/scripts/drop-instance.sh
@@ -1,184 +1,185 @@
#!/usr/bin/env bash
#
# This file is part of Invenio.
# Copyright (C) 2015, 2016 CERN.
#
# Invenio is free software; you can redistribute it
# and/or modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Invenio is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Invenio; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
# MA 02111-1307, USA.
#
# In applying this license, CERN does not
# waive the privileges and immunities granted to it by virtue of its status
# as an Intergovernmental Organization or submit itself to any jurisdiction.

# quit on errors:
set -o errexit

# check environment variables:
if [ "${INVENIO_MYSQL_HOST}" = "" ]; then
    echo "[ERROR] Please set environment variable INVENIO_MYSQL_HOST before running this script."
    echo "[ERROR] Example: export INVENIO_MYSQL_HOST=192.168.50.11"
    exit 1
fi
if [ "${INVENIO_MYSQL_DBNAME}" = "" ]; then
    echo "[ERROR] Please set environment variable INVENIO_MYSQL_DBNAME before running this script."
    echo "[ERROR] Example: export INVENIO_MYSQL_DBNAME=invenio1"
    exit 1
fi
if [ "${INVENIO_MYSQL_DBUSER}" = "" ]; then
    echo "[ERROR] Please set environment variable INVENIO_MYSQL_DBUSER before running this script."
    echo "[ERROR] Example: export INVENIO_MYSQL_DBUSER=invenio1"
    exit 1
fi
if [ "${INVENIO_MYSQL_DBPASS}" = "" ]; then
    echo "[ERROR] Please set environment variable INVENIO_MYSQL_DBPASS before running this script."
    echo "[ERROR] Example: export INVENIO_MYSQL_DBPASS=dbpass123"
    exit 1
fi
if [ "${INVENIO_WEB_HOST}" = "" ]; then
    echo "[ERROR] Please set environment variable INVENIO_WEB_HOST before running this script."
    echo "[ERROR] Example: export INVENIO_WEB_HOST=192.168.50.10"
    exit 1
fi
if [ "${INVENIO_WEB_DSTDIR}" = "" ]; then
    echo "[ERROR] Please set environment variable INVENIO_WEB_DSTDIR before running this script."
    echo "[ERROR] Example: export INVENIO_WEB_DSTDIR=/opt/invenio"
    exit 1
fi
if [ "${INVENIO_WEB_USER}" = "" ]; then
    echo "[ERROR] Please set environment variable INVENIO_WEB_USER before running this script."
    echo "[ERROR] Example: export INVENIO_WEB_USER=www-data"
    exit 1
fi

# quit on unbound symbols:
set -o nounset

# runs as root or needs sudo?
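# (Clarifying note on the idiom used below: when the script does not already
# run as root, the $sudo variable is set to "sudo" and prefixes every
# privileged command; when it runs as root, $sudo expands to nothing.)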
if [[ "$EUID" -ne 0 ]]; then sudo='sudo' else sudo='' fi start_apache_ubuntu_precise () { $sudo /etc/init.d/apache2 start } stop_apache_ubuntu_precise () { $sudo /etc/init.d/apache2 stop } start_apache_centos6 () { $sudo /etc/init.d/httpd start } stop_apache_centos6 () { $sudo /etc/init.d/httpd stop } drop_apache_vhost_ubuntu_precise () { stop_apache_ubuntu_precise if [ -e /etc/apache2/sites-available/default-ssl ]; then $sudo /usr/sbin/a2ensite "*default*" fi if [ -L /etc/apache2/sites-enabled/invenio.conf ]; then $sudo /usr/sbin/a2dissite "invenio*" fi start_apache_ubuntu_precise } drop_apache_vhost_centos6 () { stop_apache_centos6 if grep -q "Include ${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost.conf" /etc/httpd/conf/httpd.conf; then sudo sed -i "s,^Include ${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost.conf,#Include ${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost.conf,g" /etc/httpd/conf/httpd.conf fi if grep -q "Include ${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost-ssl.conf" /etc/httpd/conf/httpd.conf; then sudo sed -i "s,^Include ${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost-ssl.conf,#Include ${INVENIO_WEB_DSTDIR}/etc/apache/invenio-apache-vhost-ssl.conf,g" /etc/httpd/conf/httpd.conf fi sudo sed -i 's,^#Alias /error/,Alias /error/,g' /etc/httpd/conf/httpd.conf start_apache_centos6 } drop_symlinks () { for pythonversion in python2.4 python2.6 python2.7; do for libversion in lib lib64 local/lib local/lib64; do for packageversion in site-packages dist-packages; do if [ -d /usr/$libversion/$pythonversion/$packageversion/ ] && [ ! -L /usr/$libversion/$pythonversion/$packageversion/invenio ]; then $sudo rm /usr/$libversion/$pythonversion/$packageversion/invenio fi done done done } drop_instance_folder () { $sudo rm -rf "${INVENIO_WEB_DSTDIR}/var/tmp/ooffice-tmp-files" - $sudo -u "${INVENIO_WEB_USER}" rm -rf "${INVENIO_WEB_DSTDIR}/*" + # shellcheck disable=SC2086 + $sudo -u "${INVENIO_WEB_USER}" rm -rf ${INVENIO_WEB_DSTDIR}/* } drop_instance_tables () { if [ -e "${INVENIO_WEB_DSTDIR}/bin/inveniocfg" ]; then $sudo -u "${INVENIO_WEB_USER}" "${INVENIO_WEB_DSTDIR}/bin/inveniocfg" --drop-tables --yes-i-know fi } main () { # detect OS distribution and release version: if hash lsb_release 2> /dev/null; then os_distribution=$(lsb_release -i | cut -f 2) os_release=$(lsb_release -r | cut -f 2 | grep -oE '[0-9]+\.' | cut -d. -f1 | head -1) elif [ -e /etc/redhat-release ]; then os_distribution=$(cut -d ' ' -f 1 /etc/redhat-release) os_release=$(grep -oE '[0-9]+\.' /etc/redhat-release | cut -d. -f1 | head -1) else os_distribution="UNDETECTED" os_release="UNDETECTED" fi # call appropriate provisioning functions: if [ "$os_distribution" = "Ubuntu" ]; then if [ "$os_release" = "12" ]; then stop_apache_ubuntu_precise drop_instance_tables start_apache_ubuntu_precise drop_apache_vhost_ubuntu_precise drop_instance_folder drop_symlinks else echo "[ERROR] Sorry, unsupported release ${os_release}." exit 1 fi elif [ "$os_distribution" = "CentOS" ]; then if [ "$os_release" = "6" ]; then stop_apache_centos6 drop_instance_tables start_apache_centos6 drop_apache_vhost_centos6 drop_instance_folder drop_symlinks else echo "[ERROR] Sorry, unsupported release ${os_release}." exit 1 fi else echo "[ERROR] Sorry, unsupported distribution ${os_distribution}." exit 1 fi } main