diff --git a/Makefile.am b/Makefile.am index 121fb75bc..005f28b0a 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,559 +1,561 @@ ## This file is part of Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. confignicedir = $(sysconfdir)/build confignice_SCRIPTS=config.nice SUBDIRS = po config modules EXTRA_DIST = UNINSTALL THANKS RELEASE-NOTES configure-tests.py config.nice.in \ config.rpath # current MathJax version and packages # See also modules/miscutil/lib/htmlutils.py (get_mathjax_header) MJV = 1.1 MATHJAX = https://github.com/mathjax/MathJax/zipball/v$(MJV) # current CKeditor version CKV = 3.6.1 CKEDITOR = ckeditor_$(CKV).zip # current MediaElement.js version MEV = master MEDIAELEMENT = http://github.com/johndyer/mediaelement/zipball/$(MEV) # git-version-get stuff: BUILT_SOURCES = $(top_srcdir)/.version $(top_srcdir)/.version: echo $(VERSION) > $@-t && mv $@-t $@ dist-hook: echo $(VERSION) > $(distdir)/.tarball-version check-custom-templates: $(PYTHON) $(top_srcdir)/modules/webstyle/lib/template.py --check-custom-templates $(top_srcdir) kwalitee-check: @$(PYTHON) $(top_srcdir)/modules/miscutil/lib/kwalitee.py --stats $(top_srcdir) kwalitee-check-errors-only: @$(PYTHON) $(top_srcdir)/modules/miscutil/lib/kwalitee.py --check-errors $(top_srcdir) kwalitee-check-variables: @$(PYTHON) $(top_srcdir)/modules/miscutil/lib/kwalitee.py --check-variables $(top_srcdir) kwalitee-check-indentation: @$(PYTHON) $(top_srcdir)/modules/miscutil/lib/kwalitee.py --check-indentation $(top_srcdir) kwalitee-check-sql-queries: @$(PYTHON) $(top_srcdir)/modules/miscutil/lib/kwalitee.py --check-sql $(top_srcdir) etags: \rm -f $(top_srcdir)/TAGS (cd $(top_srcdir) && find $(top_srcdir) -name "*.py" -print | xargs etags) install-data-local: for d in / /cache /log /tmp /tmp-shared /data /run ; do \ mkdir -p $(localstatedir)$$d ; \ done @echo "************************************************************" @echo "** Invenio software has been successfully installed! **" @echo "** **" @echo "** You may proceed to customizing your installation now. **" @echo "************************************************************" install-mathjax-plugin: @echo "***********************************************************" @echo "** Installing MathJax plugin, please wait... **" @echo "***********************************************************" rm -rf /tmp/invenio-mathjax-plugin mkdir /tmp/invenio-mathjax-plugin mkdir -p ${prefix}/var/www/MathJax (cd /tmp/invenio-mathjax-plugin && \ wget '$(MATHJAX)' -O mathjax.zip --no-check-certificate && \ unzip -q mathjax.zip && cd mathjax-MathJax-* && cp -ur * \ ${prefix}/var/www/MathJax) rm -fr /tmp/invenio-mathjax-plugin @echo "************************************************************" @echo "** The MathJax plugin was successfully installed. 
**" @echo "** Please do not forget to properly set the option **" @echo "** CFG_WEBSEARCH_USE_MATHJAX_FOR_FORMATS in invenio.conf. **" @echo "************************************************************" uninstall-mathjax-plugin: @rm -rvf ${prefix}/var/www/MathJax @echo "***********************************************************" @echo "** The MathJax plugin was successfully uninstalled. **" @echo "***********************************************************" install-jscalendar-plugin: @echo "***********************************************************" @echo "** Installing jsCalendar plugin, please wait... **" @echo "***********************************************************" rm -rf /tmp/invenio-jscalendar-plugin mkdir /tmp/invenio-jscalendar-plugin (cd /tmp/invenio-jscalendar-plugin && \ wget 'http://www.dynarch.com/static/jscalendar-1.0.zip' && \ unzip -u jscalendar-1.0.zip && \ mkdir -p ${prefix}/var/www/jsCalendar && \ cp jscalendar-1.0/img.gif ${prefix}/var/www/jsCalendar/jsCalendar.gif && \ cp jscalendar-1.0/calendar.js ${prefix}/var/www/jsCalendar/ && \ cp jscalendar-1.0/calendar-setup.js ${prefix}/var/www/jsCalendar/ && \ cp jscalendar-1.0/lang/calendar-en.js ${prefix}/var/www/jsCalendar/ && \ cp jscalendar-1.0/calendar-blue.css ${prefix}/var/www/jsCalendar/) rm -fr /tmp/invenio-jscalendar-plugin @echo "***********************************************************" @echo "** The jsCalendar plugin was successfully installed. **" @echo "***********************************************************" uninstall-jscalendar-plugin: @rm -rvf ${prefix}/var/www/jsCalendar @echo "***********************************************************" @echo "** The jsCalendar plugin was successfully uninstalled. **" @echo "***********************************************************" install-jquery-plugins: install-jquery-plugins @echo "***********************************************************" @echo "** Installing various jQuery plugins, please wait... 
**" @echo "***********************************************************" mkdir -p ${prefix}/var/www/js mkdir -p $(prefix)/var/www/css (cd ${prefix}/var/www/js && \ wget http://jqueryjs.googlecode.com/files/jquery-1.3.1.min.js && \ mv jquery-1.3.1.min.js jquery.min.js && \ wget http://jquery-ui.googlecode.com/svn/tags/1.8.10/jquery-1.4.4.js && \ wget http://jquery-ui.googlecode.com/svn/tags/1.8.10/ui/minified/jquery.effects.core.min.js && \ wget http://jquery-ui.googlecode.com/svn/tags/1.8.10/ui/minified/jquery.effects.highlight.min.js && \ wget http://jquery-ui.googlecode.com/svn/tags/1.8.10/ui/minified/jquery.ui.widget.min.js && \ wget http://jquery-ui.googlecode.com/svn/tags/1.8.10/ui/minified/jquery.ui.tabs.min.js && \ wget http://jquery-ui.googlecode.com/svn/tags/1.7.3/ui/minified/ui.slider.min.js && \ wget http://jquery-ui.googlecode.com/svn/tags/1.7.3/ui/minified/ui.sortable.min.js && \ wget http://jquery-ui.googlecode.com/svn/tags/1.8.10/ui/minified/jquery.ui.button.min.js && \ wget http://jquery-ui.googlecode.com/svn/tags/1.8.10/ui/minified/jquery.ui.dialog.min.js && \ wget http://jquery-ui.googlecode.com/svn/tags/1.8.10/ui/minified/jquery.ui.mouse.min.js && \ wget http://jquery-ui.googlecode.com/svn/tags/1.8.10/ui/minified/jquery.ui.draggable.min.js && \ wget http://jquery-ui.googlecode.com/svn/tags/1.8.10/ui/minified/jquery.ui.position.min.js && \ wget http://jquery-ui.googlecode.com/svn/tags/1.8.10/ui/minified/jquery.ui.resizable.min.js && \ wget http://ajax.googleapis.com/ajax/libs/jqueryui/1.8.11/jquery-ui.min.js && \ wget http://www.appelsiini.net/download/jquery.jeditable.mini.js && \ wget http://github.com/malsup/form/raw/master/jquery.form.js --no-check-certificate && \ wget http://jquery-multifile-plugin.googlecode.com/svn/trunk/jquery.MultiFile.pack.js && \ wget http://autobahn.tablesorter.com/jquery.tablesorter.zip && \ wget http://www.uploadify.com/wp-content/uploads/uploadify-v2.1.4.zip -O uploadify.zip && \ wget http://www.datatables.net/download/build/jquery.dataTables.min.js && \ wget http://keith-wood.name/zip/jquery.bookmark.package-1.4.0.zip && \ unzip jquery.tablesorter.zip && \ rm jquery.tablesorter.zip && \ rm -rf uploadify && \ unzip -u uploadify.zip -d uploadify && \ wget http://flot.googlecode.com/files/flot-0.6.zip && \ wget http://trentrichardson.com/examples/timepicker/js/jquery-ui-timepicker-addon.js && \ unzip -u flot-0.6.zip && \ mv flot/jquery.flot.selection.min.js flot/jquery.flot.min.js flot/excanvas.min.js ./ && \ rm flot-0.6.zip && rm -r flot && \ mv uploadify/swfobject.js ./ && \ mv uploadify/cancel.png uploadify/uploadify.css uploadify/uploadify.allglyphs.swf uploadify/uploadify.fla uploadify/uploadify.swf ../img/ && \ mv uploadify/jquery.uploadify.v2.1.4.min.js ./jquery.uploadify.min.js && \ rm uploadify.zip && rm -r uploadify && \ wget --no-check-certificate https://github.com/douglascrockford/JSON-js/raw/master/json2.js && \ wget http://jquery-ui.googlecode.com/svn/tags/1.7.3/ui/minified/ui.datepicker.min.js && \ wget -O jquery.hotkeys.min.js http://js-hotkeys.googlecode.com/files/jquery.hotkeys-0.7.8-packed.js && \ wget http://jquery.bassistance.de/treeview/jquery.treeview.zip && \ unzip jquery.treeview.zip -d jquery-treeview && \ rm jquery.treeview.zip && \ wget http://invenio-software.org/download/jquery/v1.5/js/jquery.ajaxPager.js && \ wget http://jqueryui.com/download/jquery-ui-1.7.3.custom.zip && \ unzip jquery-ui-1.7.3.custom.zip development-bundle/ui/ui.core.js js/jquery-ui-1.7.3.custom.min.js && \ mv development-bundle/ui/ui.core.js 
ui.core.js && \ mv js/jquery-ui-1.7.3.custom.min.js jquery-ui-1.7.3.custom.min.js && \ rm -rf development-bundle && \ rm jquery-ui-1.7.3.custom.zip && \ unzip jquery.bookmark.package-1.4.0.zip && \ rm -f jquery.bookmark.ext.* bookmarks-big.png bookmarkBasic.html jquery.bookmark.js jquery.bookmark.pack.js && \ mv bookmarks.png ../img/ && \ mv jquery.bookmark.css ../css/ && \ rm -f jquery.bookmark.package-1.4.0.zip && \ mkdir -p ${prefix}/var/www/img && \ cd ${prefix}/var/www/img && \ wget -r -np -nH --cut-dirs=4 -A "png,css" -P jquery-ui/themes http://jquery-ui.googlecode.com/svn/tags/1.7.3/themes/base/ && \ wget -r -np -nH --cut-dirs=4 -A "png,css" -P jquery-ui/themes http://jquery-ui.googlecode.com/svn/tags/1.7.3/themes/smoothness/ && \ wget -r -np -nH --cut-dirs=4 -A "png,css" -P jquery-ui/themes http://jquery-ui.googlecode.com/svn/tags/1.7.3/themes/redmond/ && \ wget --no-check-certificate -O datatables_jquery-ui.css https://github.com/DataTables/DataTables/raw/master/media/css/demo_table_jui.css && \ wget http://jquery-ui.googlecode.com/svn/tags/1.7.3/themes/redmond/jquery-ui.css && \ wget http://jquery-ui.googlecode.com/svn/tags/1.7.3/demos/images/calendar.gif && \ wget -r -np -nH --cut-dirs=5 -A "png" http://jquery-ui.googlecode.com/svn/tags/1.7.3/themes/redmond/images/) @echo "***********************************************************" @echo "** The jQuery plugins were successfully installed. **" @echo "***********************************************************" uninstall-jquery-plugins: (cd ${prefix}/var/www/js && \ rm -f jquery.min.js && \ rm -f jquery.effects.core.min.js && \ rm -f jquery.effects.highlight.min.js && \ rm -f jquery.MultiFile.pack.js && \ rm -f jquery.jeditable.mini.js && \ rm -f jquery.flot.selection.min.js && \ rm -f jquery.flot.min.js && \ rm -f excanvas.min.js && \ rm -f jquery-ui-timepicker-addon.min.js && \ rm -f jquery.tablesorter.js && \ rm -f jquery.tablesorter.pager.js && \ rm -f ui.datepicker.min.js && \ rm -f json2.js && \ rm -f jquery.uploadify.min.js && \ rm -f ui.slider.min.js && \ rm -f ui.sortable.min.js && \ rm -f jquery.ui.tabs.min.js && \ rm -rf tablesorter && \ rm -f jquery.hotkeys.min.js && \ rm -rf jquery-treeview && \ rm -f jquery.ajaxPager.js && \ rm -f jquery.form.js && \ rm -f jquery.dataTables.min.js && \ rm -f ui.core.js && \ rm -f jquery.bookmark.min.js && \ rm -f jquery-ui.min.js) (cd ${prefix}/var/www/img && \ rm -f cancel.png uploadify.css uploadify.swf uploadify.allglyphs.swf uploadify.fla && \ rm -f datatables_jquery-ui.css && \ rm -f bookmarks.png) && \ (cd ${prefix}/var/www/css && \ rm -f jquery.bookmark.css) @echo "***********************************************************" @echo "** The jQuery plugins were successfully uninstalled. **" @echo "***********************************************************" install-ckeditor-plugin: @echo "***********************************************************" @echo "** Installing CKeditor plugin, please wait... **" @echo "***********************************************************" rm -rf ${prefix}/lib/python/invenio/ckeditor/ rm -rf /tmp/invenio-ckeditor-plugin mkdir /tmp/invenio-ckeditor-plugin (cd /tmp/invenio-ckeditor-plugin && \ wget 'http://download.cksource.com/CKEditor/CKEditor/CKEditor%20$(CKV)/$(CKEDITOR)' && \ unzip -u -d ${prefix}/var/www $(CKEDITOR)) && \ find ${prefix}/var/www/ckeditor/ -depth -name '_*' -exec rm -rf {} \; && \ find ${prefix}/var/www/ckeditor/ckeditor* -maxdepth 0 !
-name "ckeditor.js" -exec rm -r {} \; && \ rm -fr /tmp/invenio-ckeditor-plugin @echo "* Installing Invenio-specific CKeditor config..." (cd $(top_srcdir)/modules/webstyle/etc && make install) @echo "***********************************************************" @echo "** The CKeditor plugin was successfully installed. **" @echo "** Please do not forget to properly set the option **" @echo "** CFG_WEBCOMMENT_USE_RICH_TEXT_EDITOR in invenio.conf. **" @echo "***********************************************************" uninstall-ckeditor-plugin: @rm -rvf ${prefix}/var/www/ckeditor @rm -rvf ${prefix}/lib/python/invenio/ckeditor @echo "***********************************************************" @echo "** The CKeditor plugin was successfully uninstalled. **" @echo "***********************************************************" install-pdfa-helper-files: @echo "***********************************************************" @echo "** Installing PDF/A helper files, please wait... **" @echo "***********************************************************" wget 'http://invenio-software.org/download/invenio-demo-site-files/ISOCoatedsb.icc' -O ${prefix}/etc/websubmit/file_converter_templates/ISOCoatedsb.icc @echo "***********************************************************" @echo "** The PDF/A helper files were successfully installed. **" @echo "***********************************************************" install-mediaelement: @echo "***********************************************************" @echo "** MediaElement.js, please wait... **" @echo "***********************************************************" rm -rf /tmp/mediaelement mkdir /tmp/mediaelement wget 'http://github.com/johndyer/mediaelement/zipball/master' -O '/tmp/mediaelement/mediaelement.zip' --no-check-certificate unzip -u -d '/tmp/mediaelement' '/tmp/mediaelement/mediaelement.zip' rm -rf ${prefix}/var/www/mediaelement mkdir ${prefix}/var/www/mediaelement mv /tmp/mediaelement/johndyer-mediaelement-*/build/* ${prefix}/var/www/mediaelement rm -rf /tmp/mediaelement @echo "***********************************************************" @echo "** MediaElement.js was successfully installed. **" @echo "***********************************************************" uninstall-pdfa-helper-files: rm -f ${prefix}/etc/websubmit/file_converter_templates/ISOCoatedsb.icc @echo "***********************************************************" @echo "** The PDF/A helper files were successfully uninstalled. 
**" @echo "***********************************************************" update-v0.3.0-tables update-v0.3.1-tables: echo "ALTER TABLE idxINDEXNAME CHANGE id_idxINDEX id_idxINDEX mediumint(9) unsigned NOT NULL FIRST;" | ${prefix}/bin/dbexec echo "ALTER TABLE rnkMETHODNAME CHANGE id_rnkMETHOD id_rnkMETHOD mediumint(9) unsigned NOT NULL FIRST;" | ${prefix}/bin/dbexec echo "ALTER TABLE collectionname CHANGE id_collection id_collection mediumint(9) unsigned NOT NULL FIRST;" | ${prefix}/bin/dbexec echo "ALTER TABLE formatname CHANGE id_format id_format mediumint(9) unsigned NOT NULL FIRST;" | ${prefix}/bin/dbexec echo "ALTER TABLE fieldname CHANGE id_field id_field mediumint(9) unsigned NOT NULL FIRST;" | ${prefix}/bin/dbexec echo "INSERT INTO accACTION (id,name,description,allowedkeywords,optional) VALUES (NULL,'runbibrank','run BibRank','','no');" | ${prefix}/bin/dbexec echo "INSERT INTO accACTION (id,name,description,allowedkeywords,optional) VALUES (NULL,'cfgbibrank','configure BibRank','','no');" | ${prefix}/bin/dbexec update-v0.3.2-tables: echo "ALTER TABLE sbmCOLLECTION_sbmDOCTYPE CHANGE id_son id_son char(10) NOT NULL default '0';" | ${prefix}/bin/dbexec update-v0.3.3-tables: ${prefix}/bin/dbexec < $(top_srcdir)/modules/miscutil/sql/tabcreate.sql echo "ALTER TABLE flxLINKTYPEPARAMS CHANGE pname pname varchar(78) NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE rnkMETHOD DROP star_category_ranges;" | ${prefix}/bin/dbexec echo "DROP TABLE rnkSET;" | ${prefix}/bin/dbexec echo "ALTER TABLE schTASK CHANGE arguments arguments LONGTEXT;" | ${prefix}/bin/dbexec echo "ALTER TABLE schTASK CHANGE status status varchar(50);" | ${prefix}/bin/dbexec update-v0.5.0-tables: ${prefix}/bin/dbexec < $(top_srcdir)/modules/miscutil/sql/tabcreate.sql echo "ALTER TABLE session ADD INDEX uid (uid);" | ${prefix}/bin/dbexec echo "UPDATE idxINDEXNAME SET ln='cs' WHERE ln='cz';" | ${prefix}/bin/dbexec echo "UPDATE rnkMETHODNAME SET ln='cs' WHERE ln='cz';" | ${prefix}/bin/dbexec echo "UPDATE collectionname SET ln='cs' WHERE ln='cz';" | ${prefix}/bin/dbexec echo "UPDATE collection_portalbox SET ln='cs' WHERE ln='cz';" | ${prefix}/bin/dbexec echo "UPDATE formatname SET ln='cs' WHERE ln='cz';" | ${prefix}/bin/dbexec echo "UPDATE fieldname SET ln='cs' WHERE ln='cz';" | ${prefix}/bin/dbexec echo "UPDATE idxINDEXNAME SET ln='sv' WHERE ln='se';" | ${prefix}/bin/dbexec echo "UPDATE rnkMETHODNAME SET ln='sv' WHERE ln='se';" | ${prefix}/bin/dbexec echo "UPDATE collectionname SET ln='sv' WHERE ln='se';" | ${prefix}/bin/dbexec echo "UPDATE collection_portalbox SET ln='sv' WHERE ln='se';" | ${prefix}/bin/dbexec echo "UPDATE formatname SET ln='sv' WHERE ln='se';" | ${prefix}/bin/dbexec echo "UPDATE fieldname SET ln='sv' WHERE ln='se';" | ${prefix}/bin/dbexec update-v0.7.1-tables: echo "DROP TABLE oaiHARVEST;" | ${prefix}/bin/dbexec ${prefix}/bin/dbexec < $(top_srcdir)/modules/miscutil/sql/tabcreate.sql echo "INSERT INTO accACTION (id,name,description,allowedkeywords,optional) VALUES (NULL,'cfgbibharvest','configure BibHarvest','','no');" | ${prefix}/bin/dbexec echo "INSERT INTO accACTION (id,name,description,allowedkeywords,optional) VALUES (NULL,'runoaiharvest','run BibHarvest oaiharvest','','no');" | ${prefix}/bin/dbexec echo "INSERT INTO accACTION (id,name,description,allowedkeywords,optional) VALUES (NULL,'cfgwebcomment','configure WebComment','','no');" | ${prefix}/bin/dbexec echo "INSERT INTO accACTION (id,name,description,allowedkeywords,optional) VALUES (NULL,'runoaiarchive','run BibHarvest 
oaiarchive','','no');" | ${prefix}/bin/dbexec echo "INSERT INTO accACTION (id,name,description,allowedkeywords,optional) VALUES (NULL,'runbibedit','run BibEdit','','no');" | ${prefix}/bin/dbexec echo "ALTER TABLE user ADD nickname varchar(255) NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE user ADD last_login datetime NOT NULL default '0000-00-00 00:00:00';" | ${prefix}/bin/dbexec echo "ALTER TABLE user ADD INDEX nickname (nickname);" | ${prefix}/bin/dbexec echo "ALTER TABLE sbmFIELD CHANGE subname subname varchar(13) default NULL;" | ${prefix}/bin/dbexec echo "ALTER TABLE user_query_basket CHANGE alert_name alert_name varchar(30) NOT NULL default '';" | ${prefix}/bin/dbexec echo "TRUNCATE TABLE session;" | ${prefix}/bin/dbexec @echo "**********************************************************" @echo "** Do not forget to run the basket migration now: **" @echo "** @PYTHON@ modules/webbasket/lib/webbasket_migration_kit.py " @echo "** Please see the RELEASE-NOTES for details. **" @echo "**********************************************************" @echo "INSERT INTO oaiARCHIVE (id, setName, setSpec, setDescription, setDefinition, setRecList) SELECT id, setName, setSpec, CONCAT_WS('', setDescription), setDefinition, setRecList FROM oaiSET;" update-v0.90.0-tables: ${prefix}/bin/dbexec < $(top_srcdir)/modules/miscutil/sql/tabcreate.sql echo "ALTER TABLE format ADD COLUMN (description varchar(255) default '');" | ${prefix}/bin/dbexec echo "ALTER TABLE format ADD COLUMN (content_type varchar(255) default '');" | ${prefix}/bin/dbexec update-v0.90.1-tables: ${prefix}/bin/dbexec < $(top_srcdir)/modules/miscutil/sql/tabcreate.sql echo "ALTER TABLE schTASK ADD INDEX status (status);" | ${prefix}/bin/dbexec echo "ALTER TABLE schTASK ADD INDEX runtime (runtime);" | ${prefix}/bin/dbexec echo "ALTER TABLE sbmCATEGORIES ADD COLUMN score TINYINT UNSIGNED NOT NULL DEFAULT 0;" | ${prefix}/bin/dbexec echo "ALTER TABLE sbmCATEGORIES ADD PRIMARY KEY (doctype, sname);" | ${prefix}/bin/dbexec echo "ALTER TABLE sbmCATEGORIES ADD KEY doctype (doctype);" | ${prefix}/bin/dbexec echo "ALTER TABLE oaiHARVEST ADD COLUMN setspecs TEXT NOT NULL DEFAULT '';" | ${prefix}/bin/dbexec echo "ALTER TABLE oaiARCHIVE CHANGE setDescription setDescription text NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE oaiARCHIVE CHANGE p1 p1 text NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE oaiARCHIVE CHANGE f1 f1 text NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE oaiARCHIVE CHANGE m1 m1 text NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE oaiARCHIVE CHANGE p2 p2 text NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE oaiARCHIVE CHANGE f2 f2 text NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE oaiARCHIVE CHANGE m2 m2 text NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE oaiARCHIVE CHANGE p3 p3 text NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE oaiARCHIVE CHANGE f3 f3 text NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE oaiARCHIVE CHANGE m3 m3 text NOT NULL default '';" | ${prefix}/bin/dbexec echo "UPDATE bibdoc SET status=0 WHERE status='';" | ${prefix}/bin/dbexec echo "UPDATE bibdoc SET status=1 WHERE status='deleted';" | ${prefix}/bin/dbexec echo "ALTER TABLE fmtKNOWLEDGEBASES add COLUMN kbtype char default NULL;" | ${prefix}/bin/dbexec update-v0.92.0-tables: echo "UPDATE bibdoc SET status=0 WHERE status='';" | ${prefix}/bin/dbexec echo "UPDATE bibdoc SET status=1 WHERE status='deleted';" | 
${prefix}/bin/dbexec echo "ALTER TABLE schTASK CHANGE arguments arguments mediumblob;" | ${prefix}/bin/dbexec echo "UPDATE user SET note=1 WHERE nickname='admin' AND note IS NULL;" | ${prefix}/bin/dbexec echo "ALTER TABLE usergroup CHANGE name name varchar(255) NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE usergroup ADD login_method varchar(255) NOT NULL default 'INTERNAL';" | ${prefix}/bin/dbexec echo "ALTER TABLE usergroup ADD UNIQUE KEY login_method_name (login_method(70), name);" | ${prefix}/bin/dbexec echo "ALTER TABLE user CHANGE settings settings blob default NULL;" | ${prefix}/bin/dbexec echo "INSERT INTO sbmALLFUNCDESCR VALUES ('Get_Recid', 'This function gets the recid for a document with a given report-number (as stored in the global variable rn).');" | ${prefix}/bin/dbexec update-v0.92.1-tables: echo "DROP TABLE rnkCITATIONDATA;" | ${prefix}/bin/dbexec ${prefix}/bin/dbexec < $(top_srcdir)/modules/miscutil/sql/tabcreate.sql echo "UPDATE bibdoc SET status='DELETED' WHERE status='1';" | ${prefix}/bin/dbexec echo "UPDATE bibdoc SET status='' WHERE status='0';" | ${prefix}/bin/dbexec echo "ALTER TABLE bibrec ADD KEY creation_date (creation_date);" | ${prefix}/bin/dbexec echo "ALTER TABLE bibrec ADD KEY modification_date (modification_date);" | ${prefix}/bin/dbexec echo "ALTER TABLE bibdoc ADD KEY creation_date (creation_date);" | ${prefix}/bin/dbexec echo "ALTER TABLE bibdoc ADD KEY modification_date (modification_date);" | ${prefix}/bin/dbexec echo "ALTER TABLE bibdoc ADD KEY docname (docname);" | ${prefix}/bin/dbexec echo "ALTER TABLE oaiHARVEST CHANGE postprocess postprocess varchar(20) NOT NULL default 'h';" | ${prefix}/bin/dbexec echo "ALTER TABLE oaiHARVEST ADD COLUMN bibfilterprogram varchar(255) NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE idxINDEXNAME CHANGE ln ln char(5) NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE idxINDEX ADD COLUMN stemming_language VARCHAR(10) NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE rnkMETHODNAME CHANGE ln ln char(5) NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE rnkDOWNLOADS CHANGE id_bibdoc id_bibdoc mediumint(9) unsigned default NULL;" | ${prefix}/bin/dbexec echo "ALTER TABLE rnkDOWNLOADS CHANGE file_format file_format varchar(10) NULL default NULL;" | ${prefix}/bin/dbexec echo "ALTER TABLE collectionname CHANGE ln ln char(5) NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE collection_portalbox CHANGE ln ln char(5) NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE format ADD COLUMN visibility TINYINT NOT NULL default 1;" | ${prefix}/bin/dbexec echo "ALTER TABLE formatname CHANGE ln ln char(5) NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE fieldname CHANGE ln ln char(5) NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE accROLE ADD COLUMN firerole_def_ser blob NULL;" | ${prefix}/bin/dbexec echo "ALTER TABLE accROLE ADD COLUMN firerole_def_src text NULL;" | ${prefix}/bin/dbexec echo "ALTER TABLE user_accROLE ADD COLUMN expiration datetime NOT NULL default '9999-12-31 23:59:59';" | ${prefix}/bin/dbexec echo "ALTER TABLE user DROP INDEX id, ADD PRIMARY KEY id (id);" | ${prefix}/bin/dbexec echo -e 'from invenio.dbquery import run_sql;\ map(lambda index_id: run_sql("ALTER TABLE idxPHRASE%02dF CHANGE term term TEXT NULL DEFAULT NULL, DROP INDEX term, ADD INDEX term (term (50))" % index_id[0]), run_sql("select id from idxINDEX"))' | $(PYTHON) echo "INSERT INTO rnkCITATIONDATA VALUES (1,'citationdict','','');" | 
${prefix}/bin/dbexec echo "INSERT INTO rnkCITATIONDATA VALUES (2,'reversedict','','');" | ${prefix}/bin/dbexec echo "INSERT INTO rnkCITATIONDATA VALUES (3,'selfcitdict','','');" | ${prefix}/bin/dbexec update-v0.99.0-tables: ${prefix}/bin/dbexec < $(top_srcdir)/modules/miscutil/sql/tabcreate.sql echo "ALTER TABLE bibdoc ADD COLUMN more_info mediumblob NULL default NULL;" | ${prefix}/bin/dbexec echo "ALTER TABLE schTASK ADD COLUMN priority tinyint(4) NOT NULL default 0;" | ${prefix}/bin/dbexec echo "ALTER TABLE schTASK ADD KEY priority (priority);" | ${prefix}/bin/dbexec echo "ALTER TABLE rnkCITATIONDATA DROP PRIMARY KEY;" | ${prefix}/bin/dbexec echo "ALTER TABLE rnkCITATIONDATA ADD PRIMARY KEY (id);" | ${prefix}/bin/dbexec echo "ALTER TABLE rnkCITATIONDATA CHANGE id id mediumint(8) unsigned NOT NULL auto_increment;" | ${prefix}/bin/dbexec echo "ALTER TABLE rnkCITATIONDATA ADD UNIQUE KEY object_name (object_name);" | ${prefix}/bin/dbexec echo "ALTER TABLE sbmPARAMETERS CHANGE value value text NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE sbmAPPROVAL ADD note text NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE hstDOCUMENT CHANGE docsize docsize bigint(15) unsigned NOT NULL;" | ${prefix}/bin/dbexec echo "ALTER TABLE cmtACTIONHISTORY CHANGE client_host client_host int(10) unsigned default NULL;" | ${prefix}/bin/dbexec update-v0.99.1-tables: @echo "Nothing to do; table structure did not change between v0.99.1 and v0.99.2." update-v0.99.2-tables: @echo "Nothing to do; table structure did not change between v0.99.2 and v0.99.3." update-v0.99.3-tables: # from v0.99.3 to v1.0.0-rc0 echo "RENAME TABLE oaiARCHIVE TO oaiREPOSITORY;" | ${prefix}/bin/dbexec ${prefix}/bin/dbexec < $(top_srcdir)/modules/miscutil/sql/tabcreate.sql echo "INSERT INTO knwKB (id,name,description,kbtype) SELECT id,name,description,'' FROM fmtKNOWLEDGEBASES;" | ${prefix}/bin/dbexec echo "INSERT INTO knwKBRVAL (id,m_key,m_value,id_knwKB) SELECT id,m_key,m_value,id_fmtKNOWLEDGEBASES FROM fmtKNOWLEDGEBASEMAPPINGS;" | ${prefix}/bin/dbexec echo "ALTER TABLE sbmPARAMETERS CHANGE name name varchar(40) NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE bibdoc CHANGE docname docname varchar(250) COLLATE utf8_bin NOT NULL default 'file';" | ${prefix}/bin/dbexec echo "ALTER TABLE bibdoc CHANGE status status text NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE bibdoc ADD COLUMN text_extraction_date datetime NOT NULL default '0000-00-00';" | ${prefix}/bin/dbexec echo "ALTER TABLE collection DROP COLUMN restricted;" | ${prefix}/bin/dbexec echo "ALTER TABLE schTASK CHANGE host host varchar(255) NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE hstTASK CHANGE host host varchar(255) NOT NULL default '';" | ${prefix}/bin/dbexec echo "ALTER TABLE bib85x DROP INDEX kv, ADD INDEX kv (value(100));" | ${prefix}/bin/dbexec echo "UPDATE clsMETHOD SET location='http://invenio-software.org/download/invenio-demo-site-files/HEP.rdf' WHERE name='HEP' AND location='';" | ${prefix}/bin/dbexec echo "UPDATE clsMETHOD SET location='http://invenio-software.org/download/invenio-demo-site-files/NASA-subjects.rdf' WHERE name='NASA-subjects' AND location='';" | ${prefix}/bin/dbexec echo "UPDATE accACTION SET name='runoairepository', description='run oairepositoryupdater task' WHERE name='runoaiarchive';" | ${prefix}/bin/dbexec echo "UPDATE accACTION SET name='cfgoaiharvest', description='configure OAI Harvest' WHERE name='cfgbibharvest';" | ${prefix}/bin/dbexec echo "ALTER TABLE accARGUMENT CHANGE 
value value varchar(255);" | ${prefix}/bin/dbexec echo "UPDATE accACTION SET allowedkeywords='doctype,act,categ' WHERE name='submit';" | ${prefix}/bin/dbexec echo "INSERT INTO accARGUMENT(keyword,value) VALUES ('categ','*');" | ${prefix}/bin/dbexec echo "INSERT INTO accROLE_accACTION_accARGUMENT(id_accROLE,id_accACTION,id_accARGUMENT,argumentlistid) SELECT DISTINCT raa.id_accROLE,raa.id_accACTION,accARGUMENT.id,raa.argumentlistid FROM accROLE_accACTION_accARGUMENT as raa JOIN accACTION on id_accACTION=accACTION.id,accARGUMENT WHERE accACTION.name='submit' and accARGUMENT.keyword='categ' and accARGUMENT.value='*';" | ${prefix}/bin/dbexec echo "UPDATE accACTION SET allowedkeywords='name,with_editor_rights' WHERE name='cfgwebjournal';" | ${prefix}/bin/dbexec echo "INSERT INTO accARGUMENT(keyword,value) VALUES ('with_editor_rights','yes');" | ${prefix}/bin/dbexec echo "INSERT INTO accROLE_accACTION_accARGUMENT(id_accROLE,id_accACTION,id_accARGUMENT,argumentlistid) SELECT DISTINCT raa.id_accROLE,raa.id_accACTION,accARGUMENT.id,raa.argumentlistid FROM accROLE_accACTION_accARGUMENT as raa JOIN accACTION on id_accACTION=accACTION.id,accARGUMENT WHERE accACTION.name='cfgwebjournal' and accARGUMENT.keyword='with_editor_rights' and accARGUMENT.value='yes';" | ${prefix}/bin/dbexec echo "ALTER TABLE bskEXTREC CHANGE id id int(15) unsigned NOT NULL auto_increment;" | ${prefix}/bin/dbexec echo "ALTER TABLE bskEXTREC ADD external_id int(15) NOT NULL default '0';" | ${prefix}/bin/dbexec echo "ALTER TABLE bskEXTREC ADD collection_id int(15) unsigned NOT NULL default '0';" | ${prefix}/bin/dbexec echo "ALTER TABLE bskEXTREC ADD original_url text;" | ${prefix}/bin/dbexec echo "ALTER TABLE cmtRECORDCOMMENT ADD status char(2) NOT NULL default 'ok';" | ${prefix}/bin/dbexec echo "ALTER TABLE cmtRECORDCOMMENT ADD KEY status (status);" | ${prefix}/bin/dbexec echo "INSERT INTO sbmALLFUNCDESCR VALUES ('Move_Photos_to_Storage','Attach/edit the pictures uploaded with the \"create_photos_manager_interface()\" function');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFIELDDESC VALUES ('Upload_Photos',NULL,'','R',NULL,NULL,NULL,NULL,NULL,'\"\"\"\r\nThis is an example of element that creates a photos upload interface.\r\nClone it, customize it and integrate it into your submission. Then add function \r\n\'Move_Photos_to_Storage\' to your submission functions list, in order for files \r\nuploaded with this interface to be attached to the record. 
More information in \r\nthe WebSubmit admin guide.\r\n\"\"\"\r\n\r\nfrom invenio.websubmit_functions.ParamFile import ParamFromFile\r\nfrom invenio.websubmit_functions.Move_Photos_to_Storage import read_param_file, create_photos_manager_interface, get_session_id\r\n\r\n# Retrieve session id\r\ntry:\r\n # User info is defined only in MBI/MPI actions...\r\n session_id = get_session_id(None, uid, user_info) \r\nexcept:\r\n session_id = get_session_id(req, uid, {})\r\n\r\n# Retrieve context\r\nindir = curdir.split(\'/\')[-3]\r\ndoctype = curdir.split(\'/\')[-2]\r\naccess = curdir.split(\'/\')[-1]\r\n\r\n# Get the record ID, if any\r\nsysno = ParamFromFile(\"%s/%s\" % (curdir,\'SN\')).strip()\r\n\r\n\"\"\"\r\nModify below the configuration of the photos manager interface.\r\nNote: \'can_reorder_photos\' parameter is not yet fully taken into consideration\r\n\r\nDocumentation of the function is available by running:\r\necho -e \'from invenio.websubmit_functions.Move_Photos_to_Storage import create_photos_manager_interface as f\\nprint f.__doc__\' | python\r\n\"\"\"\r\ntext += create_photos_manager_interface(sysno, session_id, uid,\r\n doctype, indir, curdir, access,\r\n can_delete_photos=True,\r\n can_reorder_photos=True,\r\n can_upload_photos=True,\r\n editor_width=700,\r\n editor_height=400,\r\n initial_slider_value=100,\r\n max_slider_value=200,\r\n min_slider_value=80)','0000-00-00','0000-00-00',NULL,NULL,0);" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Photos_to_Storage','iconsize');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFIELDDESC VALUES ('Upload_Files',NULL,'','R',NULL,NULL,NULL,NULL,NULL,'\"\"\"\r\nThis is an example of element that creates a file upload interface.\r\nClone it, customize it and integrate it into your submission. Then add function \r\n\'Move_Uploaded_Files_to_Storage\' to your submission functions list, in order for files \r\nuploaded with this interface to be attached to the record. More information in \r\nthe WebSubmit admin guide.\r\n\"\"\"\r\nfrom invenio.websubmit_managedocfiles import create_file_upload_interface\r\nfrom invenio.websubmit_functions.Shared_Functions import ParamFromFile\r\n\r\nindir = ParamFromFile(os.path.join(curdir, \'indir\'))\r\ndoctype = ParamFromFile(os.path.join(curdir, \'doctype\'))\r\naccess = ParamFromFile(os.path.join(curdir, \'access\'))\r\ntry:\r\n sysno = int(ParamFromFile(os.path.join(curdir, \'SN\')).strip())\r\nexcept:\r\n sysno = -1\r\nln = ParamFromFile(os.path.join(curdir, \'ln\'))\r\n\r\n\"\"\"\r\nRun the following to get the list of parameters of function \'create_file_upload_interface\':\r\necho -e \'from invenio.websubmit_managedocfiles import create_file_upload_interface as f\\nprint f.__doc__\' | python\r\n\"\"\"\r\ntext = create_file_upload_interface(recid=sysno,\r\n print_outside_form_tag=False,\r\n include_headers=True,\r\n ln=ln,\r\n doctypes_and_desc=[(\'main\',\'Main document\'),\r\n (\'additional\',\'Figure, schema, etc.\')],\r\n can_revise_doctypes=[\'*\'],\r\n can_describe_doctypes=[\'main\'],\r\n can_delete_doctypes=[\'additional\'],\r\n can_rename_doctypes=[\'main\'],\r\n sbm_indir=indir, sbm_doctype=doctype, sbm_access=access)[1]\r\n','0000-00-00','0000-00-00',NULL,NULL,0);" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Uploaded_Files_to_Storage','forceFileRevision');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmALLFUNCDESCR VALUES ('Create_Upload_Files_Interface','Display generic interface to add/revise/delete files. 
To be used before function \"Move_Uploaded_Files_to_Storage\"');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmALLFUNCDESCR VALUES ('Move_Uploaded_Files_to_Storage','Attach files uploaded with \"Create_Upload_Files_Interface\"')" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Revised_Files_to_Storage','elementNameToDoctype');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Revised_Files_to_Storage','createIconDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Revised_Files_to_Storage','createRelatedFormats');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Revised_Files_to_Storage','iconsize');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Revised_Files_to_Storage','keepPreviousVersionDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmALLFUNCDESCR VALUES ('Move_Revised_Files_to_Storage','Revise files initially uploaded with \"Move_Files_to_Storage\"')" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','maxsize');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','minsize');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','doctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','restrictions');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canDeleteDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canReviseDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canDescribeDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canCommentDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canKeepDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canAddFormatDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canRestrictDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canRenameDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','canNameNewFiles');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','createRelatedFormats');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','keepDefault');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','showLinks');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','fileLabel');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','filenameLabel');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','descriptionLabel');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','commentLabel');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','restrictionLabel');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','startDoc');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','endDoc');" | 
${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','defaultFilenameDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Create_Upload_Files_Interface','maxFilesDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Uploaded_Files_to_Storage','iconsize');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Uploaded_Files_to_Storage','createIconDoctypes');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Report_Number_Generation','nblength');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Second_Report_Number_Generation','2nd_nb_length');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Get_Recid','record_search_pattern');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmALLFUNCDESCR VALUES ('Move_FCKeditor_Files_to_Storage','Transfer files attached to the record with the FCKeditor');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_FCKeditor_Files_to_Storage','input_fields');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Stamp_Uploaded_Files','layer');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Stamp_Replace_Single_File_Approval','layer');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Stamp_Replace_Single_File_Approval','switch_file');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Stamp_Uploaded_Files','switch_file');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Files_to_Storage','paths_and_restrictions');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Files_to_Storage','paths_and_doctypes');" | ${prefix}/bin/dbexec echo "ALTER TABLE cmtRECORDCOMMENT ADD round_name varchar(255) NOT NULL default ''" | ${prefix}/bin/dbexec echo "ALTER TABLE cmtRECORDCOMMENT ADD restriction varchar(50) NOT NULL default ''" | ${prefix}/bin/dbexec echo "ALTER TABLE cmtRECORDCOMMENT ADD in_reply_to_id_cmtRECORDCOMMENT int(15) unsigned NOT NULL default '0'" | ${prefix}/bin/dbexec echo "ALTER TABLE cmtRECORDCOMMENT ADD KEY in_reply_to_id_cmtRECORDCOMMENT (in_reply_to_id_cmtRECORDCOMMENT);" | ${prefix}/bin/dbexec echo "ALTER TABLE bskRECORDCOMMENT ADD in_reply_to_id_bskRECORDCOMMENT int(15) unsigned NOT NULL default '0'" | ${prefix}/bin/dbexec echo "ALTER TABLE bskRECORDCOMMENT ADD KEY in_reply_to_id_bskRECORDCOMMENT (in_reply_to_id_bskRECORDCOMMENT);" | ${prefix}/bin/dbexec echo "ALTER TABLE cmtRECORDCOMMENT ADD reply_order_cached_data blob NULL default NULL;" | ${prefix}/bin/dbexec echo "ALTER TABLE bskRECORDCOMMENT ADD reply_order_cached_data blob NULL default NULL;" | ${prefix}/bin/dbexec echo "ALTER TABLE cmtRECORDCOMMENT ADD INDEX (reply_order_cached_data(40));" | ${prefix}/bin/dbexec echo "ALTER TABLE bskRECORDCOMMENT ADD INDEX (reply_order_cached_data(40));" | ${prefix}/bin/dbexec echo -e 'from invenio.webcommentadminlib import migrate_comments_populate_threads_index;\ migrate_comments_populate_threads_index()' | $(PYTHON) echo -e 'from invenio.access_control_firerole import repair_role_definitions;\ repair_role_definitions()' | $(PYTHON) update-v1.0.0-rc0-tables: # from v1.0.0-rc0 to next release ${prefix}/bin/dbexec < $(top_srcdir)/modules/miscutil/sql/tabcreate.sql echo "INSERT INTO sbmALLFUNCDESCR VALUES ('Set_Embargo','Set an embargo on all the documents of a given record.');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Set_Embargo','date_file');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES 
('Set_Embargo','date_format');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('User_is_Record_Owner_or_Curator','curator_role');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('User_is_Record_Owner_or_Curator','curator_flag');" | ${prefix}/bin/dbexec echo "INSERT INTO sbmFUNDESC VALUES ('Move_Photos_to_Storage','iconformat');" | ${prefix}/bin/dbexec echo "INSERT INTO format (name, code, description, content_type, visibility) VALUES ('Podcast', 'xp', 'Sample format suitable for multimedia feeds, such as podcasts', 'application/rss+xml', 0);" | ${prefix}/bin/dbexec echo "ALTER TABLE accMAILCOOKIE ADD INDEX expiration (expiration);" | ${prefix}/bin/dbexec echo "UPDATE sbmFUNDESC SET function='Move_CKEditor_Files_to_Storage' WHERE function='Move_FCKeditor_Files_to_Storage';" | ${prefix}/bin/dbexec echo "UPDATE sbmALLFUNCDESCR SET function='Move_CKEditor_Files_to_Storage', description='Transfer files attached to the record with the CKEditor' WHERE function='Move_FCKeditor_Files_to_Storage';" | ${prefix}/bin/dbexec echo "UPDATE sbmFUNCTIONS SET function='Move_CKEditor_Files_to_Storage' WHERE function='Move_FCKeditor_Files_to_Storage';" | ${prefix}/bin/dbexec echo "ALTER TABLE schTASK CHANGE proc proc varchar(255) NOT NULL;" | ${prefix}/bin/dbexec echo "ALTER TABLE session CHANGE session_object session_object longblob;" | ${prefix}/bin/dbexec + echo "ALTER TABLE oaiREPOSITORY CHANGE setSpec setSpec varchar(255) NOT NULL default 'GLOBAL_SET';" | ${prefix}/bin/dbexec + echo "UPDATE oaiREPOSITORY SET setSpec='GLOBAL_SET' WHERE setSpec='';" | ${prefix}/bin/dbexec CLEANFILES = *~ *.pyc *.tmp diff --git a/THANKS b/THANKS index 9efbb02d5..2f7205c2a 100644 --- a/THANKS +++ b/THANKS @@ -1,170 +1,173 @@ Invenio THANKS ============== Several people outside the Invenio Development Team contributed to the project: - Thierry Thomas Patches for compiling old CDSware 0.3.x sources on FreeBSD. - Guido Pelzer Contributions to the German translation. German stopword list. - Valerio Gracco Contributions to the Italian translation. - Tullio Basaglia Contributions to the Italian translation. - Flavio C. Coelho Contributions to the Portuguese translation. - Lyuba Vasilevskaya Contributions to the Russian translation. - Maria Gomez Marti Contributions to the Spanish translation. - Magaly Bascones Dominguez Contributions to the Spanish translation. - Urban Andersson Contributions to the Swedish translation. - Eric Grand Contributions to the French translation. - Theodoropoulos Theodoros Contributions to the Greek translation, Greek stopword list, XML RefWorks output format. - Vasyl Ostrovskyi Contributions to the Ukrainian translation. - Ferran Jorba Contributions to the Catalan and Spanish translations. Cleanup of the old PHP-based BibFormat Admin Guide. Several minor patches. - Beatriu Piera Translation of the Search Guide into Catalan and Spanish. - Anonymous contributor (name withheld by request) Contributions to the Japanese translation. - Anonymous contributor (name withheld by request) Contributions to the Spanish translation. - Alen Vodopijevec Contributions to the Croatian translation. - Jasna Marković Contributions to the Croatian translation. - Kam-ming Ku Contributions to the Chinese translations (zh_CN, zh_TW). - Benedikt Koeppel Contributions to the German translation. - Toru Tsuboyama Contributions to the Japanese translation. - Mike Marino Several minor patches and suggestions. - Zbigniew Szklarz Contributions to the Polish translation. 
- Iaroslav Gaponenko Contributions to the Russian translation. - Yana Osborne Contributions to the Russian translation. - Zbigniew Leonowicz Contributions to the Polish translation. - Makiko Matsumoto and Takao Ishigaki Contributions to the Japanese translation. - Eva Papp Contributions to the Hungarian translation. - Nino Jejelava Contributions to the Georgian translation. - Cristian Bacchi Improvements to the browse interface. - Genis Musulmanbekov Contributions to the Russian translation. - Andrey Tremba Contributions to the Russian translation. The URL handler was inspired by the Quixote Web Framework which is ``Copyright (c) 2004 Corporation for National Research Initiatives; All Rights Reserved''. The session handler was adapted from the mod_python session implementation. Javascript Quicktags scripts from Alex King are used to provide additional capabilities to the editing of BibFormat templates through the web admin interface. The indexer engine uses the Martin Porter Stemming Algorithm and its free Python implementation by Vivake Gupta. The CSS style for the rounded-corners box used in detailed record pages was adapted from Francky Lleyneman's liquidcorners CSS. The NASA_Subjects.rdf file has been retrieved from the American National Aeronautics and Space Administration (NASA), which kindly provides it for free re-use. The tiger test picture used in automated demo picture submission was converted from Ghostscript's 'tiger.eps'. Some icon images were taken from (i) the Silk icon set, (ii) the Function icon set, (iii) the activity indicator icon, and (iv) the Open Icon Library. The unoconv.py script has been adapted from UNOCONV by Dag Wieers. PDFA_def.ps has been adapted from the GPL distribution of GhostScript. The ISOCoatedsb.icc ICC profile has been retrieved from the European Color Initiative. The PEP8 conformance checking script (pep8.py) was written by Johann C. Rocholl. The pep8.py version included with Invenio was downloaded on 2009-06-14. The git-version-gen script was taken from gnulib 20100704+stable-1. The LaTeX-to-Unicode translation table was compiled from contributions by FX and Lea Wiemann. The scientificchar plugin for the CKEditor was adapted from the specialchar plugin by Frederico Knabben. +The oai2.xsl.v1.0 OAI to HTML XSLT Style Sheet was taken from EPrints. + + - end of file - diff --git a/config/invenio.conf b/config/invenio.conf index 4b1b1f87b..b2b4dc8dd 100644 --- a/config/invenio.conf +++ b/config/invenio.conf @@ -1,1525 +1,1600 @@ ## This file is part of Invenio. ## Copyright (C) 2008, 2009, 2010, 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
################################################### ## About 'invenio.conf' and 'invenio-local.conf' ## ################################################### ## The 'invenio.conf' file contains the vanilla default configuration ## parameters of an Invenio installation, as coming out of the ## distribution. The file should be self-explanatory. Once installed ## in its usual location (usually /opt/invenio/etc), you could in ## principle go ahead and change the values according to your local ## needs, but this is not advised. ## ## If you would like to customize some of these parameters, you should ## rather create a file named 'invenio-local.conf' in the same ## directory where 'invenio.conf' lives and you should write there ## only the customizations that you want to be different from the ## vanilla defaults. ## ## Here is a realistic, minimalist, yet production-ready example of ## what you would typically put there: ## ## $ cat /opt/invenio/etc/invenio-local.conf ## [Invenio] ## CFG_SITE_NAME = John Doe's Document Server ## CFG_SITE_NAME_INTL_fr = Serveur des Documents de John Doe ## CFG_SITE_URL = http://your.site.com ## CFG_SITE_SECURE_URL = https://your.site.com ## CFG_SITE_ADMIN_EMAIL = john.doe@your.site.com ## CFG_SITE_SUPPORT_EMAIL = john.doe@your.site.com ## CFG_WEBALERT_ALERT_ENGINE_EMAIL = john.doe@your.site.com ## CFG_WEBCOMMENT_ALERT_ENGINE_EMAIL = john.doe@your.site.com ## CFG_WEBCOMMENT_DEFAULT_MODERATOR = john.doe@your.site.com ## CFG_DATABASE_HOST = localhost ## CFG_DATABASE_NAME = invenio ## CFG_DATABASE_USER = invenio ## CFG_DATABASE_PASS = my123p$ss ## ## You should override at least the parameters mentioned above and the ## parameters mentioned in the `Part 1: Essential parameters' below in ## order to define some very essential runtime parameters such as the ## name of your document server (CFG_SITE_NAME and ## CFG_SITE_NAME_INTL_*), the visible URL of your document server ## (CFG_SITE_URL and CFG_SITE_SECURE_URL), the email address of the ## local Invenio administrator, comment moderator, and alert engine ## (CFG_SITE_SUPPORT_EMAIL, CFG_SITE_ADMIN_EMAIL, etc), and last but ## not least your database credentials (CFG_DATABASE_*). ## ## The Invenio system will then read both the default invenio.conf ## file and your customized invenio-local.conf file and it will ## override any default options with the ones you have specified in ## your local file. This cascading of configuration parameters will ## ease your future upgrades. [Invenio] ################################### ## Part 1: Essential parameters ## ################################### ## This part defines essential Invenio internal parameters that ## everybody should override, like the name of the server or the email ## address of the local Invenio administrator. ## CFG_DATABASE_* - specify which MySQL server to use, the name of the ## database to use, and the database access credentials. CFG_DATABASE_HOST = localhost CFG_DATABASE_PORT = 3306 CFG_DATABASE_NAME = invenio CFG_DATABASE_USER = invenio CFG_DATABASE_PASS = my123p$ss ## CFG_SITE_URL - specify the URL under which your installation will be ## visible. For example, use "http://your.site.com". Do not leave a ## trailing slash. CFG_SITE_URL = http://localhost ## CFG_SITE_SECURE_URL - specify the secure URL under which your ## installation's secure pages such as login or registration will be ## visible. For example, use "https://your.site.com". Do not leave a ## trailing slash. If you don't plan on using HTTPS, then you may ## leave this empty. 
CFG_SITE_SECURE_URL = https://localhost ## CFG_SITE_NAME -- the visible name of your Invenio installation. CFG_SITE_NAME = Atlantis Institute of Fictive Science ## CFG_SITE_NAME_INTL -- the international versions of CFG_SITE_NAME ## in various languages. (See also CFG_SITE_LANGS below.) CFG_SITE_NAME_INTL_en = Atlantis Institute of Fictive Science CFG_SITE_NAME_INTL_fr = Atlantis Institut des Sciences Fictives CFG_SITE_NAME_INTL_de = Atlantis Institut der fiktiven Wissenschaft CFG_SITE_NAME_INTL_es = Atlantis Instituto de la Ciencia Fictive CFG_SITE_NAME_INTL_ca = Institut Atlantis de Ciència Fictícia CFG_SITE_NAME_INTL_pt = Instituto Atlantis de Ciência Fictícia CFG_SITE_NAME_INTL_it = Atlantis Istituto di Scienza Fittizia CFG_SITE_NAME_INTL_ru = Институт Фиктивных Наук Атлантиды CFG_SITE_NAME_INTL_sk = Atlantis Inštitút Fiktívnych Vied CFG_SITE_NAME_INTL_cs = Atlantis Institut Fiktivních Věd CFG_SITE_NAME_INTL_no = Atlantis Institutt for Fiktiv Vitenskap CFG_SITE_NAME_INTL_sv = Atlantis Institut för Fiktiv Vetenskap CFG_SITE_NAME_INTL_el = Ινστιτούτο Φανταστικών Επιστημών Ατλαντίδος CFG_SITE_NAME_INTL_uk = Інститут вигаданих наук в Атлантісі CFG_SITE_NAME_INTL_ja = Fictive 科学のAtlantis の協会 CFG_SITE_NAME_INTL_pl = Instytut Fikcyjnej Nauki Atlantis CFG_SITE_NAME_INTL_bg = Институт за фиктивни науки Атлантис CFG_SITE_NAME_INTL_hr = Institut Fiktivnih Znanosti Atlantis CFG_SITE_NAME_INTL_zh_CN = 阿特兰提斯虚拟科学学院 CFG_SITE_NAME_INTL_zh_TW = 阿特蘭提斯虛擬科學學院 CFG_SITE_NAME_INTL_hu = Kitalált Tudományok Atlantiszi Intézete CFG_SITE_NAME_INTL_af = Atlantis Instituut van Fiktiewe Wetenskap CFG_SITE_NAME_INTL_gl = Instituto Atlantis de Ciencia Fictive CFG_SITE_NAME_INTL_ro = Institutul Atlantis al Ştiinţelor Fictive CFG_SITE_NAME_INTL_rw = Atlantis Ishuri Rikuru Ry'ubuhanga CFG_SITE_NAME_INTL_ka = ატლანტიდის ფიქტიური მეცნიერების ინსტიტუტი CFG_SITE_NAME_INTL_lt = Fiktyvių Mokslų Institutas Atlantis CFG_SITE_NAME_INTL_ar = معهد أطلنطيس للعلوم الافتراضية ## CFG_SITE_LANG -- the default language of the interface: CFG_SITE_LANG = en ## CFG_SITE_LANGS -- list of all languages the user interface should ## be available in, separated by commas. The order specified below ## will be respected on the interface pages. A good default would be ## to use the alphabetical order. Currently supported languages ## include Afrikaans, Arabic, Bulgarian, Catalan, Czech, German, Georgian, ## Greek, English, Spanish, French, Croatian, Hungarian, Galician, ## Italian, Japanese, Kinyarwanda, Lithuanian, Norwegian, Polish, ## Portuguese, Romanian, Russian, Slovak, Swedish, Ukrainian, Chinese ## (China), Chinese (Taiwan), so that the eventual maximum you can ## currently select is ## "af,ar,bg,ca,cs,de,el,en,es,fr,hr,gl,ka,it,rw,lt,hu,ja,no,pl,pt,ro,ru,sk,sv,uk,zh_CN,zh_TW". CFG_SITE_LANGS = af,ar,bg,ca,cs,de,el,en,es,fr,hr,gl,ka,it,rw,lt,hu,ja,no,pl,pt,ro,ru,sk,sv,uk,zh_CN,zh_TW ## CFG_SITE_SUPPORT_EMAIL -- the email address of the support team for ## this installation: CFG_SITE_SUPPORT_EMAIL = info@invenio-software.org ## CFG_SITE_ADMIN_EMAIL -- the email address of the 'superuser' for ## this installation. Enter your email address below and login with ## this address when using Invenio administration modules. You ## will then be automatically recognized as superuser of the system. CFG_SITE_ADMIN_EMAIL = info@invenio-software.org ## CFG_SITE_EMERGENCY_EMAIL_ADDRESSES -- list of email addresses to ## which an email should be sent in case of emergency (e.g. bibsched ## queue has been stopped because of an error). 
The configuration ## dictionary allows for different recipients based on weekday and ## time-of-day. Example: ## ## CFG_SITE_EMERGENCY_EMAIL_ADDRESSES = { ## 'Sunday 22:00-06:00': '0041761111111@email2sms.foo.com', ## '06:00-18:00': 'team-in-europe@foo.com,0041762222222@email2sms.foo.com', ## '18:00-06:00': 'team-in-usa@foo.com', ## '*': 'john.doe.phone@foo.com'} ## ## If you want the emergency email notifications to always go to the ## same address, just use the wildcard line in the above example. CFG_SITE_EMERGENCY_EMAIL_ADDRESSES = {} ## CFG_SITE_ADMIN_EMAIL_EXCEPTIONS -- set this to 0 if you do not want ## to receive any captured exception via email to CFG_SITE_ADMIN_EMAIL ## address. Captured exceptions will still be available in ## var/log/invenio.err file. Set this to 1 if you want to receive ## some of the captured exceptions (this depends on the actual place ## where the exception is captured). Set this to 2 if you want to ## receive all captured exceptions. CFG_SITE_ADMIN_EMAIL_EXCEPTIONS = 1 ## CFG_SITE_RECORD -- what is the URI part representing detailed ## record pages? We recommend leaving the default value `record' ## unchanged. CFG_SITE_RECORD = record ## CFG_ERRORLIB_RESET_EXCEPTION_NOTIFICATION_COUNTER_AFTER -- set this to ## the number of seconds after which to reset the exception notification ## counter. A given repetitive exception is notified via email with a ## logarithmic strategy: the first time it is seen it is sent via email, ## then the second time, then the fourth, then the eighth and so forth. ## If the number of seconds elapsed since the last time it was notified ## is greater than CFG_ERRORLIB_RESET_EXCEPTION_NOTIFICATION_COUNTER_AFTER ## then the internal counter is reset so that exception notifications do ## not become rarer and rarer. CFG_ERRORLIB_RESET_EXCEPTION_NOTIFICATION_COUNTER_AFTER = 14400 ## CFG_CERN_SITE -- do we want to enable CERN-specific code? ## Put "1" for "yes" and "0" for "no". CFG_CERN_SITE = 0 ## CFG_INSPIRE_SITE -- do we want to enable INSPIRE-specific code? ## Put "1" for "yes" and "0" for "no". CFG_INSPIRE_SITE = 0 ## CFG_ADS_SITE -- do we want to enable ADS-specific code? ## Put "1" for "yes" and "0" for "no". CFG_ADS_SITE = 0 ## CFG_OPENAIRE_SITE -- do we want to enable OpenAIRE-specific code? ## Put "1" for "yes" and "0" for "no". CFG_OPENAIRE_SITE = 0 ## CFG_DEVEL_SITE -- is this a development site? If it is, you might ## prefer that it does not do certain things. For example, you might ## not want WebSubmit to send certain emails or trigger certain ## processes on a development site. ## Put "1" for "yes" (this is a development site) or "0" for "no" ## (this isn't a development site.) CFG_DEVEL_SITE = 0 ################################ ## Part 2: Web page style ## ################################ ## The variables affecting the page style. The most important ones are ## the 'template skin' you would like to use and the obfuscation mode ## for your email addresses. Please refer to the WebStyle Admin Guide ## for more explanation. The other variables are listed here mostly ## for backwards compatibility purposes only. ## CFG_WEBSTYLE_TEMPLATE_SKIN -- what template skin do you want to ## use? CFG_WEBSTYLE_TEMPLATE_SKIN = default ## CFG_WEBSTYLE_EMAIL_ADDRESSES_OBFUSCATION_MODE. How do we "protect" ## email addresses from undesired automated email harvesters? This ## setting will not affect 'support' and 'admin' emails. ## NOTE: there is no ultimate solution to protect against email ## harvesting. 
## CFG_CERN_SITE -- do we want to enable CERN-specific code?
## Put "1" for "yes" and "0" for "no".
CFG_CERN_SITE = 0

## CFG_INSPIRE_SITE -- do we want to enable INSPIRE-specific code?
## Put "1" for "yes" and "0" for "no".
CFG_INSPIRE_SITE = 0

## CFG_ADS_SITE -- do we want to enable ADS-specific code?
## Put "1" for "yes" and "0" for "no".
CFG_ADS_SITE = 0

## CFG_OPENAIRE_SITE -- do we want to enable OpenAIRE-specific code?
## Put "1" for "yes" and "0" for "no".
CFG_OPENAIRE_SITE = 0

## CFG_DEVEL_SITE -- is this a development site? If it is, you might
## prefer that it does not do certain things. For example, you might
## not want WebSubmit to send certain emails or trigger certain
## processes on a development site.
## Put "1" for "yes" (this is a development site) or "0" for "no"
## (this isn't a development site).
CFG_DEVEL_SITE = 0

################################
## Part 2: Web page style     ##
################################

## The variables affecting the page style. The most important one is
## the 'template skin' you would like to use and the obfuscation mode
## for your email addresses. Please refer to the WebStyle Admin Guide
## for more explanation. The other variables are listed here mostly
## for backwards compatibility purposes only.

## CFG_WEBSTYLE_TEMPLATE_SKIN -- what template skin do you want to
## use?
CFG_WEBSTYLE_TEMPLATE_SKIN = default

## CFG_WEBSTYLE_EMAIL_ADDRESSES_OBFUSCATION_MODE -- how do we "protect"
## email addresses from undesired automated email harvesters? This
## setting will not affect 'support' and 'admin' emails.
## NOTE: there is no ultimate solution to protect against email
## harvesting. All modes have drawbacks and can more or less be
## circumvented. Choose your preferred mode ([t] means "transparent"
## for the user):
##     -1: hide all emails.
## [t]  0: no protection, email returned as is.
##         foo@example.com => foo@example.com
##      1: basic email munging: replaces @ by [at] and . by [dot]
##         foo@example.com => foo [at] example [dot] com
## [t]  2: transparent name mangling: characters are replaced by
##         equivalent HTML entities.
##         foo@example.com => &#102;&#111;&#111;&#64;&#101;&#120;&#97;&#109;&#112;&#108;&#101;&#46;&#99;&#111;&#109;
## [t]  3: JavaScript insertion. Requires JavaScript enabled on the
##         client side.
##      4: replaces @ and . characters by gif equivalents.
##         foo@example.com => foo [at] example [dot] com
CFG_WEBSTYLE_EMAIL_ADDRESSES_OBFUSCATION_MODE = 2

## CFG_WEBSTYLE_INSPECT_TEMPLATES -- do we want to debug all template
## functions so that they would return HTML results wrapped in
## comments indicating which part of the HTML page was created by which
## template function? Useful only for debugging Pythonic HTML
## templates. See WebStyle Admin Guide for more information.
CFG_WEBSTYLE_INSPECT_TEMPLATES = 0

## (deprecated) CFG_WEBSTYLE_CDSPAGEBOXLEFTTOP -- eventual global HTML
## left top box:
CFG_WEBSTYLE_CDSPAGEBOXLEFTTOP =

## (deprecated) CFG_WEBSTYLE_CDSPAGEBOXLEFTBOTTOM -- eventual global
## HTML left bottom box:
CFG_WEBSTYLE_CDSPAGEBOXLEFTBOTTOM =

## (deprecated) CFG_WEBSTYLE_CDSPAGEBOXRIGHTTOP -- eventual global
## HTML right top box:
CFG_WEBSTYLE_CDSPAGEBOXRIGHTTOP =

## (deprecated) CFG_WEBSTYLE_CDSPAGEBOXRIGHTBOTTOM -- eventual global
## HTML right bottom box:
CFG_WEBSTYLE_CDSPAGEBOXRIGHTBOTTOM =

## CFG_WEBSTYLE_HTTP_STATUS_ALERT_LIST -- when certain HTTP status
## codes are raised to the WSGI handler, the corresponding exceptions
## and error messages can be sent to the system administrator for
## inspection. This is useful to detect and correct errors. The
## variable represents a comma-separated list of HTTP statuses that
## should alert the admin. Wildcards are possible. If the status is
## followed by an "r", it means that a referer is required to exist
## (useful to distinguish broken known links from URL typos when 404
## errors are raised).
CFG_WEBSTYLE_HTTP_STATUS_ALERT_LIST = 404r,400,5*,41*
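## For illustration, with the default list above: a 404 accompanied by
## a referer, a 400, any 5xx status (e.g. 500 or 503) and any 41x
## status (e.g. 410) would be reported to the admin, whereas a 404
## without a referer (a typed-in URL) or a plain 403 would not.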
## CFG_WEBSTYLE_HTTP_USE_COMPRESSION -- whether to enable deflate
## compression of your HTTP/HTTPS connections. This will affect the Apache
## configuration snippets created by inveniocfg --create-apache-conf and
## the OAI-PMH Identify response.
CFG_WEBSTYLE_HTTP_USE_COMPRESSION = 0

##################################
## Part 3: WebSearch parameters ##
##################################

## This section contains some configuration parameters for the WebSearch
## module. Please note that WebSearch is mostly configured on
## run-time via its WebSearch Admin web interface. The parameters
## below are the ones that you probably do not want to modify very
## often during the runtime. (Note that you may modify them
## afterwards too, though.)

## CFG_WEBSEARCH_SEARCH_CACHE_SIZE -- how many queries do we want to
## cache in memory per one Apache httpd process? This cache is used
## mainly for "next/previous page" functionality, but it also caches
## "popular" user queries if more than one user happens to search for
## the same thing. Note that large numbers may lead to great memory
## consumption. We recommend a value not greater than 100.
CFG_WEBSEARCH_SEARCH_CACHE_SIZE = 0

## CFG_WEBSEARCH_FIELDS_CONVERT -- if you migrate from an older
## system, you may want to map field codes of your old system (such as
## 'ti') to Invenio/MySQL ones ("title"). Use Python dictionary syntax
## for the translation table, e.g. {'wau':'author', 'wti':'title'}.
## Usually you don't want to do that, and you would use the empty dict {}.
CFG_WEBSEARCH_FIELDS_CONVERT = {}

## CFG_WEBSEARCH_LIGHTSEARCH_PATTERN_BOX_WIDTH -- width of the
## search pattern window in the light search interface, in
## characters.
CFG_WEBSEARCH_LIGHTSEARCH_PATTERN_BOX_WIDTH = 60

## CFG_WEBSEARCH_SIMPLESEARCH_PATTERN_BOX_WIDTH -- width of the search
## pattern window in the simple search interface, in characters.
CFG_WEBSEARCH_SIMPLESEARCH_PATTERN_BOX_WIDTH = 40

## CFG_WEBSEARCH_ADVANCEDSEARCH_PATTERN_BOX_WIDTH -- width of the
## search pattern window in the advanced search interface, in
## characters.
CFG_WEBSEARCH_ADVANCEDSEARCH_PATTERN_BOX_WIDTH = 30

## CFG_WEBSEARCH_NB_RECORDS_TO_SORT -- how many records do we still
## want to sort? For higher numbers we print only a warning and won't
## perform any sorting other than the default 'latest records first', as
## sorting would be very time consuming then. We recommend a value of
## not more than a couple of thousands.
CFG_WEBSEARCH_NB_RECORDS_TO_SORT = 1000

## CFG_WEBSEARCH_CALL_BIBFORMAT -- if a record is being displayed but
## it was not preformatted in the "HTML brief" format, do we want to
## call BibFormat on the fly? Put "1" for "yes" and "0" for "no".
## Note that "1" will display the record exactly as if it were fully
## preformatted, but it may be slow due to on-the-fly processing; "0"
## will display a default format very fast, but it may not have all
## the fields as in the fully preformatted HTML brief format. Note
## also that this option is active only for old (PHP) formats; the new
## (Python) formats are called on the fly by default anyway, since
## they are much faster. When unsure, please set "0" here.
CFG_WEBSEARCH_CALL_BIBFORMAT = 0

## CFG_WEBSEARCH_USE_ALEPH_SYSNOS -- do we want to make old SYSNOs
## visible rather than MySQL's record IDs? You may use this if you
## migrate from a different e-doc system, and you store your old
## system numbers into 970__a. Put "1" for "yes" and "0" for
## "no". Usually you don't want to do that, though.
CFG_WEBSEARCH_USE_ALEPH_SYSNOS = 0

## CFG_WEBSEARCH_I18N_LATEST_ADDITIONS -- put "1" if you want the
## "Latest Additions" in the web collection pages to show
## internationalized records. Useful only if your brief BibFormat
## templates contain internationalized strings. Otherwise put "0" in
## order not to slow down the creation of latest additions by WebColl.
CFG_WEBSEARCH_I18N_LATEST_ADDITIONS = 0

## CFG_WEBSEARCH_INSTANT_BROWSE -- the number of records to display
## under 'Latest Additions' in the web collection pages.
CFG_WEBSEARCH_INSTANT_BROWSE = 10

## CFG_WEBSEARCH_INSTANT_BROWSE_RSS -- the number of records to
## display in the RSS feed.
CFG_WEBSEARCH_INSTANT_BROWSE_RSS = 25

## CFG_WEBSEARCH_RSS_I18N_COLLECTIONS -- comma-separated list of
## collections that feature an internationalized RSS feed on their
## main search interface page created by webcoll. Other collections
## will have an RSS feed using CFG_SITE_LANG.
CFG_WEBSEARCH_RSS_I18N_COLLECTIONS =

## CFG_WEBSEARCH_RSS_TTL -- number of minutes that indicates how long
## a feed cache is valid.
CFG_WEBSEARCH_RSS_TTL = 360

## CFG_WEBSEARCH_RSS_MAX_CACHED_REQUESTS -- maximum number of requests kept
## in cache. If the cache is filled, subsequent requests are not cached.
CFG_WEBSEARCH_RSS_MAX_CACHED_REQUESTS = 1000
## CFG_WEBSEARCH_AUTHOR_ET_AL_THRESHOLD -- up to how many author names
## to print explicitly; for more, print "et al". Note that this is
## used in the default formatting that is seldom used, as usually
## BibFormat defines all the formats. The value below is only used
## when BibFormat fails, for example.
CFG_WEBSEARCH_AUTHOR_ET_AL_THRESHOLD = 3

## CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS -- whether or not to show
## collection grandsons in Narrow Search boxes (sons are shown by
## default, grandsons are configurable here). Use 0 for no and 1 for
## yes.
CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS = 1

## CFG_WEBSEARCH_CREATE_SIMILARLY_NAMED_AUTHORS_LINK_BOX -- shall we
## create help links for Ellis, Nick or Ellis, Nicholas and friends
## when Ellis, N was searched for? Useful if you have one author
## stored in the database under several name formats, namely surname
## comma firstname and surname comma initial cataloging policy. Use 0
## for no and 1 for yes.
CFG_WEBSEARCH_CREATE_SIMILARLY_NAMED_AUTHORS_LINK_BOX = 1

## CFG_WEBSEARCH_USE_MATHJAX_FOR_FORMATS -- MathJax is a JavaScript
## library that renders (La)TeX mathematical formulas in the client
## browser. This parameter must contain a comma-separated list of
## output formats for which to apply the MathJax rendering, for example
## "hb,hd". If the list is empty, MathJax is disabled.
CFG_WEBSEARCH_USE_MATHJAX_FOR_FORMATS =
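## For example, to render formulas in the HTML brief and HTML detailed
## output formats, you would put (illustrative):
## CFG_WEBSEARCH_USE_MATHJAX_FOR_FORMATS = hb,hd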
## CFG_WEBSEARCH_EXTERNAL_COLLECTION_SEARCH_TIMEOUT -- when searching
## external collections (e.g. SPIRES, CiteSeer, etc), how many seconds
## do we wait for a reply before abandoning?
CFG_WEBSEARCH_EXTERNAL_COLLECTION_SEARCH_TIMEOUT = 5

## CFG_WEBSEARCH_EXTERNAL_COLLECTION_SEARCH_MAXRESULTS -- how many
## results do we fetch?
CFG_WEBSEARCH_EXTERNAL_COLLECTION_SEARCH_MAXRESULTS = 10

## CFG_WEBSEARCH_SPLIT_BY_COLLECTION -- do we want to split the search
## results by collection or not? Use 0 for no, 1 for yes.
CFG_WEBSEARCH_SPLIT_BY_COLLECTION = 1

## CFG_WEBSEARCH_DEF_RECORDS_IN_GROUPS -- the default number of
## records to display per page in the search results pages.
CFG_WEBSEARCH_DEF_RECORDS_IN_GROUPS = 10

## CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS -- in order to limit denial of
## service attacks, the total number of records per group displayed as a
## result of a search query will be limited to this number. Only superuser
## queries will not be affected by this limit.
CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS = 200

## CFG_WEBSEARCH_PERMITTED_RESTRICTED_COLLECTIONS_LEVEL -- logged in users
## might have rights to access some restricted collections. This variable
## tweaks the kind of support the system will automatically provide to the
## user with respect to searching into these restricted collections.
## Set this to 0 in order to have the user explicitly activate restricted
## collections in order to search into them. Set this to 1 in order to
## propose to the user the list of restricted collections to which he/she has
## rights (note: this is not yet implemented). Set this to 2 in order to
## silently add all the restricted collections to which the user has rights
## to any query.
## Note: the system will discover which restricted collections a user has
## rights to at login time. The time complexity of this procedure is
## proportional to the number of restricted collections. E.g. for a system
## with ~50 restricted collections, you might expect ~1s of delay in the
## login time, when this variable is set to a value higher than 0.
CFG_WEBSEARCH_PERMITTED_RESTRICTED_COLLECTIONS_LEVEL = 0

## CFG_WEBSEARCH_SHOW_COMMENT_COUNT -- do we want to show the 'N comments'
## links on the search engine pages? (Useful only when you have allowed
## commenting.)
CFG_WEBSEARCH_SHOW_COMMENT_COUNT = 1

## CFG_WEBSEARCH_SHOW_REVIEW_COUNT -- do we want to show the 'N reviews'
## links on the search engine pages? (Useful only when you have allowed
## reviewing.)
CFG_WEBSEARCH_SHOW_REVIEW_COUNT = 1

## CFG_WEBSEARCH_FULLTEXT_SNIPPETS -- how many full-text snippets to
## display for full-text searches?
CFG_WEBSEARCH_FULLTEXT_SNIPPETS = 4

## CFG_WEBSEARCH_FULLTEXT_SNIPPETS_WORDS -- how many context words
## to display around the pattern in the snippet?
CFG_WEBSEARCH_FULLTEXT_SNIPPETS_WORDS = 4

## CFG_WEBSEARCH_WILDCARD_LIMIT -- some of the queries, wildcard
## queries in particular (ex: cern*, a*), but also regular expressions
## (ex: [a-z]+), may take a long time to respond due to the high
## number of hits. You can limit the number of terms matched by a
## wildcard by setting this variable. A negative value or zero means
## that none of the queries will be limited (which may be wanted, but is
## also prone to denial-of-service kind of attacks).
CFG_WEBSEARCH_WILDCARD_LIMIT = 50000

## CFG_WEBSEARCH_SYNONYM_KBRS -- defines which knowledge bases are to
## be used for which index in order to provide runtime synonym lookup
## of user-supplied terms, and what massaging function should be used
## upon the search pattern before performing the KB lookup. (Can be one
## of `exact', `leading_to_comma', `leading_to_number'.)
CFG_WEBSEARCH_SYNONYM_KBRS = {
    'journal': ['SEARCH-SYNONYM-JOURNAL', 'leading_to_number'],
    }

## CFG_SOLR_URL -- optionally, you may use Solr to serve full-text
## queries. If so, please specify the URL of your Solr instance.
## (Example: http://localhost:8080/solr)
CFG_SOLR_URL =

## CFG_WEBSEARCH_PREV_NEXT_HIT_LIMIT -- specify the limit up to which
## the previous/next/back hit links are to be displayed on detailed record pages.
## In order to speed up list manipulations, if a search returns lots of hits,
## more than this limit, then do not lose time calculating next/previous/back
## hits at all, but display the page directly without them.
## Note also that Invenio installations that do not like
## to have the next/previous hit link functionality would be able to set this
## variable to zero and not see anything.
CFG_WEBSEARCH_PREV_NEXT_HIT_LIMIT = 1000

## CFG_WEBSEARCH_VIEWRESTRCOLL_POLICY -- when a record belongs to more than one
## restricted collection, if the viewrestrcoll policy is set to "ALL" (default)
## then the user must be authorized to all the restricted collections in
## order to be granted access to the specific record. If the policy is set to
## "ANY", then the user needs to be authorized to only one of the collections
## in order to be granted access to the specific record.
CFG_WEBSEARCH_VIEWRESTRCOLL_POLICY = ALL
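## For illustration: if a record belongs to the two restricted
## collections 'Theses' and 'Reports' (hypothetical names), then with
## the "ALL" policy a user must be authorized to view both collections
## in order to access the record, while with "ANY" being authorized
## for either one of them suffices.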
#######################################
## Part 4: BibHarvest OAI parameters ##
#######################################

## This part defines parameters for the Invenio OAI gateway.
## Useful if you are running Invenio as OAI data provider.

## CFG_OAI_ID_FIELD -- OAI identifier MARC field:
CFG_OAI_ID_FIELD = 909COo

## CFG_OAI_SET_FIELD -- OAI set MARC field:
CFG_OAI_SET_FIELD = 909COp

+## CFG_OAI_PREVIOUS_SET_FIELD -- previous OAI set MARC field:
+CFG_OAI_PREVIOUS_SET_FIELD = 909COq
+
## CFG_OAI_DELETED_POLICY -- OAI deletedrecordspolicy
-## (no/transient/persistent).
-CFG_OAI_DELETED_POLICY = no
+## (no/transient/persistent):
+CFG_OAI_DELETED_POLICY = persistent

## CFG_OAI_ID_PREFIX -- OAI identifier prefix:
CFG_OAI_ID_PREFIX = atlantis.cern.ch

## CFG_OAI_SAMPLE_IDENTIFIER -- OAI sample identifier:
-CFG_OAI_SAMPLE_IDENTIFIER = oai:atlantis.cern.ch:CERN-TH-4036
+CFG_OAI_SAMPLE_IDENTIFIER = oai:atlantis.cern.ch:123

## CFG_OAI_IDENTIFY_DESCRIPTION -- description for the OAI Identify verb:
CFG_OAI_IDENTIFY_DESCRIPTION = <description>
-<oai-identifier xmlns="http://www.openarchives.org/OAI/2.0/oai-identifier">
- <scheme>oai</scheme>
- <repositoryIdentifier>atlantis.cern.ch</repositoryIdentifier>
- <delimiter>:</delimiter>
- <sampleIdentifier>oai:atlantis.cern.ch:CERN-TH-4036</sampleIdentifier>
-</oai-identifier>
<eprints xmlns="http://www.openarchives.org/OAI/1.1/eprints">
 <content><URL>http://atlantis.cern.ch/</URL></content>
 <metadataPolicy><text>Free and unlimited use by anybody with obligation to refer to original record</text></metadataPolicy>
 <dataPolicy><text>Full content, i.e. preprints may not be harvested by robots</text></dataPolicy>
 <submissionPolicy><text>Submission restricted. Submitted documents are subject of approval by OAI repository admins.</text></submissionPolicy>
</eprints>
</description>

## CFG_OAI_LOAD -- OAI number of records in a response:
-CFG_OAI_LOAD = 1000
+CFG_OAI_LOAD = 500

## CFG_OAI_EXPIRE -- OAI resumptionToken expiration time:
CFG_OAI_EXPIRE = 90000

## CFG_OAI_SLEEP -- service unavailable between two consecutive
## requests for CFG_OAI_SLEEP seconds:
-CFG_OAI_SLEEP = 10
+CFG_OAI_SLEEP = 2
+
+## CFG_OAI_METADATA_FORMATS -- mapping between accepted metadataPrefixes and
+## the corresponding output format to use, its schema and its metadataNamespace.
+CFG_OAI_METADATA_FORMATS = {
+    'marcxml': ('XOAIMARC', 'http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd', 'http://www.loc.gov/MARC21/slim'),
+    'oai_dc': ('XOAIDC', 'http://www.openarchives.org/OAI/1.1/dc.xsd', 'http://purl.org/dc/elements/1.1/'),
+    }
+
+## CFG_OAI_FRIENDS -- list of OAI baseURL of friend repositories. See:
+## http://www.openarchives.org/OAI/2.0/guidelines-friends.htm
+CFG_OAI_FRIENDS = http://cdsweb.cern.ch/oai2d,http://openaire.cern.ch/oai2d,http://export.arxiv.org/oai2
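+## For illustration, a harvester would then query this gateway with
+## standard OAI-PMH requests such as (hypothetical base URL):
+## http://atlantis.cern.ch/oai2d?verb=ListRecords&metadataPrefix=oai_dc
+## where metadataPrefix must be one of the keys defined in
+## CFG_OAI_METADATA_FORMATS above.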
+
+## The following subfields are a complement to
+## CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG. If CFG_OAI_PROVENANCE_BASEURL_SUBFIELD is
+## set for a record, then the corresponding field is considered as having
+## been harvested via OAI-PMH.
+
+## CFG_OAI_PROVENANCE_BASEURL_SUBFIELD -- baseURL of the originDescription
+## of a record
+CFG_OAI_PROVENANCE_BASEURL_SUBFIELD = u
+
+## CFG_OAI_PROVENANCE_DATESTAMP_SUBFIELD -- datestamp of the originDescription
+## of a record
+CFG_OAI_PROVENANCE_DATESTAMP_SUBFIELD = d
+
+## CFG_OAI_PROVENANCE_METADATANAMESPACE_SUBFIELD -- metadataNamespace of the
+## originDescription of a record
+CFG_OAI_PROVENANCE_METADATANAMESPACE_SUBFIELD = m
+
+## CFG_OAI_PROVENANCE_ORIGINDESCRIPTION_SUBFIELD -- originDescription of the
+## originDescription of a record
+CFG_OAI_PROVENANCE_ORIGINDESCRIPTION_SUBFIELD = d
+
+## CFG_OAI_PROVENANCE_HARVESTDATE_SUBFIELD -- harvestDate of the
+## originDescription of a record
+CFG_OAI_PROVENANCE_HARVESTDATE_SUBFIELD = h
+
+## CFG_OAI_PROVENANCE_ALTERED_SUBFIELD -- altered flag of the
+## originDescription of a record
+CFG_OAI_PROVENANCE_ALTERED_SUBFIELD = t
+
+## NOTE: the following parameters are experimental
+## -----------------------------------------------------------------------------
+## CFG_OAI_RIGHTS_FIELD -- MARC field dedicated to storing Copyright information
+CFG_OAI_RIGHTS_FIELD = 542__
+
+## CFG_OAI_RIGHTS_HOLDER_SUBFIELD -- MARC subfield dedicated to storing the
+## Copyright holder information
+CFG_OAI_RIGHTS_HOLDER_SUBFIELD = d
+
+## CFG_OAI_RIGHTS_DATE_SUBFIELD -- MARC subfield dedicated to storing the
+## Copyright date information
+CFG_OAI_RIGHTS_DATE_SUBFIELD = g
+
+## CFG_OAI_RIGHTS_URI_SUBFIELD -- MARC subfield dedicated to storing the URI
+## (URL or URN, more detailed statement about copyright status) information
+CFG_OAI_RIGHTS_URI_SUBFIELD = u
+
+## CFG_OAI_RIGHTS_CONTACT_SUBFIELD -- MARC subfield dedicated to storing the
+## Copyright holder contact information
+CFG_OAI_RIGHTS_CONTACT_SUBFIELD = e
+
+## CFG_OAI_RIGHTS_STATEMENT_SUBFIELD -- MARC subfield dedicated to storing the
+## Copyright statement as presented on the resource
+CFG_OAI_RIGHTS_STATEMENT_SUBFIELD = f
+
+## CFG_OAI_LICENSE_FIELD -- MARC field dedicated to storing terms governing
+## use and reproduction (license)
+CFG_OAI_LICENSE_FIELD = 540__
+
+## CFG_OAI_LICENSE_TERMS_SUBFIELD -- MARC subfield dedicated to storing the
+## Terms governing use and reproduction, e.g. CC License
+CFG_OAI_LICENSE_TERMS_SUBFIELD = a
+
+## CFG_OAI_LICENSE_PUBLISHER_SUBFIELD -- MARC subfield dedicated to storing the
+## person or institution imposing the license (author, publisher)
+CFG_OAI_LICENSE_PUBLISHER_SUBFIELD = b
+
+## CFG_OAI_LICENSE_URI_SUBFIELD -- MARC subfield dedicated to storing the URI
+## of the license
+CFG_OAI_LICENSE_URI_SUBFIELD = u
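+## For illustration, with the (experimental) defaults above a license
+## could be stored in a MARC field instance such as (hypothetical
+## values):
+## 540__ $a CC-BY-3.0 $b Atlantis Press $u http://creativecommons.org/licenses/by/3.0/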
+##------------------------------------------------------------------------------
+
##################################
## Part 5: WebSubmit parameters ##
##################################

## This section contains some configuration parameters for the WebSubmit
## module. Please note that WebSubmit is mostly configured on
## run-time via its WebSubmit Admin web interface. The parameters
## below are the ones that you probably do not want to modify during
## the runtime.

## CFG_WEBSUBMIT_FILESYSTEM_BIBDOC_GROUP_LIMIT -- the fulltext
## documents are stored under "/opt/invenio/var/data/files/gX/Y"
## directories where X is 0,1,... and Y stands for the bibdoc ID. Thus
## documents Y are grouped into directories X and this variable
## indicates the maximum number of documents Y stored in each
## directory X. This limit is imposed solely for filesystem
## performance reasons in order not to have too many subdirectories in
## a given directory.
CFG_WEBSUBMIT_FILESYSTEM_BIBDOC_GROUP_LIMIT = 5000
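## For illustration, assuming grouping is done by integer division of
## the bibdoc ID by this limit: with the default value of 5000, bibdoc
## 123 would live under .../data/files/g0/123 and bibdoc 12345 under
## .../data/files/g2/12345 (12345 div 5000 = 2).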
## CFG_WEBSUBMIT_ADDITIONAL_KNOWN_FILE_EXTENSIONS -- a comma-separated
## list of document extensions not listed in the Python standard mimetype
## library that should be recognized by Invenio.
CFG_WEBSUBMIT_ADDITIONAL_KNOWN_FILE_EXTENSIONS = hpg,link,lis,llb,mat,mpp,msg,docx,docm,xlsx,xlsm,xlsb,pptx,pptm,ppsx,ppsm

## CFG_BIBDOCFILE_USE_XSENDFILE -- if your web server supports the
## XSendfile header, you may want to enable this feature in order for
## Invenio to tell the web server to stream files for download (after
## proper authorization checks) by the web server's means. This helps to
## liberate Invenio worker processes from being busy with sending big
## files to clients. The web server will take care of that. Note:
## this feature is still somewhat experimental. Note: when enabled
## (set to 1), you then have to also regenerate the Apache vhost conf
## snippets (inveniocfg --update-config-py --create-apache-conf).
CFG_BIBDOCFILE_USE_XSENDFILE = 0

## CFG_BIBDOCFILE_MD5_CHECK_PROBABILITY -- a number between 0 and
## 1 that indicates the probability with which the MD5 checksum will be
## verified when streaming bibdocfile-managed files. (0.1 will cause
## the check to be performed once for every 10 downloads)
CFG_BIBDOCFILE_MD5_CHECK_PROBABILITY = 0.1

## CFG_OPENOFFICE_SERVER_HOST -- the host where an OpenOffice Server is
## listening. If localhost, an OpenOffice server will be started
## automatically if it is not already running.
## Note: if you set this to an empty value, this will disable the usage of
## OpenOffice for converting documents.
## If you set this to something different than localhost, you'll have to take
## care to have an OpenOffice server running on the corresponding host and
## to install the same OpenOffice release both on the client and on the server
## side.
## In order to launch an OpenOffice server on a remote machine, just start
## the usual 'soffice' executable in this way:
## $> soffice -headless -nologo -nodefault -norestore -nofirststartwizard \
## .. -accept=socket,host=HOST,port=PORT;urp;StarOffice.ComponentContext
CFG_OPENOFFICE_SERVER_HOST = localhost

## CFG_OPENOFFICE_SERVER_PORT -- the port where an OpenOffice Server is
## listening.
CFG_OPENOFFICE_SERVER_PORT = 2002

## CFG_OPENOFFICE_USER -- the user that will be used to launch the OpenOffice
## client. It is recommended to set this to a user who does not own files,
## e.g. 'nobody'. You should also authorize your Apache server user to be
## able to become this user, e.g. by adding to your /etc/sudoers the following
## line:
## "apache ALL=(nobody) NOPASSWD: ALL"
## provided that apache is the username corresponding to the Apache user.
## On some machines this might be apache2 or www-data.
CFG_OPENOFFICE_USER = nobody

#################################
## Part 6: BibIndex parameters ##
#################################

## This section contains some configuration parameters for the BibIndex
## module. Please note that BibIndex is mostly configured on run-time
## via its BibIndex Admin web interface. The parameters below are the
## ones that you probably do not want to modify very often during the
## runtime.

## CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY -- when fulltext indexing, do
## you want to index locally stored files only, or also external URLs?
## Use "0" to say "no" and "1" to say "yes".
CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY = 1

## CFG_BIBINDEX_REMOVE_STOPWORDS -- when indexing, do we want to remove
## stopwords? Use "0" to say "no" and "1" to say "yes".
CFG_BIBINDEX_REMOVE_STOPWORDS = 0

## CFG_BIBINDEX_CHARS_ALPHANUMERIC_SEPARATORS -- characters considered as
## alphanumeric separators of word-blocks inside words. You probably
## don't want to change this.
CFG_BIBINDEX_CHARS_ALPHANUMERIC_SEPARATORS = \!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~

## CFG_BIBINDEX_CHARS_PUNCTUATION -- characters considered as punctuation
## between word-blocks inside words. You probably don't want to
## change this.
CFG_BIBINDEX_CHARS_PUNCTUATION = \.\,\:\;\?\!\"

## CFG_BIBINDEX_REMOVE_HTML_MARKUP -- should we attempt to remove HTML markup
## before indexing? Use 1 if you have HTML markup inside metadata
## (e.g. in abstracts), use 0 otherwise.
CFG_BIBINDEX_REMOVE_HTML_MARKUP = 0

## CFG_BIBINDEX_REMOVE_LATEX_MARKUP -- should we attempt to remove LaTeX markup
## before indexing? Use 1 if you have LaTeX markup inside metadata
## (e.g. in abstracts), use 0 otherwise.
CFG_BIBINDEX_REMOVE_LATEX_MARKUP = 0

## CFG_BIBINDEX_MIN_WORD_LENGTH -- minimum word length allowed to be added to
## the index. Terms shorter than this length will be discarded.
## Useful to keep the database clean; however, you can safely leave
## this value at 0 for up to 1,000,000 documents.
CFG_BIBINDEX_MIN_WORD_LENGTH = 0

## CFG_BIBINDEX_URLOPENER_USERNAME and CFG_BIBINDEX_URLOPENER_PASSWORD --
## access credentials to access restricted URLs, interesting only if
## you are fulltext-indexing files located on a remote server that is
## only available via username/password. But it's probably better to
## handle this case via IP or some convention; the current scheme is
## mostly there for demo only.
CFG_BIBINDEX_URLOPENER_USERNAME = mysuperuser
CFG_BIBINDEX_URLOPENER_PASSWORD = mysuperpass

## CFG_INTBITSET_ENABLE_SANITY_CHECKS --
## Enable sanity checks for integers passed to the intbitset data
## structures. It is good to enable this during debugging
## and to disable it for speed improvements.
CFG_INTBITSET_ENABLE_SANITY_CHECKS = False

## CFG_BIBINDEX_PERFORM_OCR_ON_DOCNAMES -- regular expression that matches
## docnames for which OCR is desired (set this to .* in order to enable
## OCR in general, set this to empty in order to disable it.)
CFG_BIBINDEX_PERFORM_OCR_ON_DOCNAMES = scan-.*

## CFG_BIBINDEX_SPLASH_PAGES -- key-value mapping where the key corresponds
## to a regular expression that matches the URLs of the splash pages of
## a given service and the value is a regular expression of the set of URLs
## referenced via <a> tags in the HTML content of the splash pages that are
## referring to documents that need to be indexed.
## NOTE: for backward compatibility reasons you can set this to a simple
## regular expression that will directly be used as the unique key of the
## map, with corresponding value set to ".*" (in order to match any URL)
CFG_BIBINDEX_SPLASH_PAGES = {
    "http://documents\.cern\.ch/setlink\?.*": ".*",
    "http://ilcagenda\.linearcollider\.org/subContributionDisplay\.py\?.*|http://ilcagenda\.linearcollider\.org/contributionDisplay\.py\?.*": "http://ilcagenda\.linearcollider\.org/getFile\.py/access\?.*|http://ilcagenda\.linearcollider\.org/materialDisplay\.py\?.*",
    }
## CFG_BIBINDEX_AUTHOR_WORD_INDEX_EXCLUDE_FIRST_NAMES -- do we want
## the author word index to exclude first names to keep only last
## names? If set to True, then for the author `Bernard, Denis', only
## `Bernard' will be indexed in the word index, not `Denis'. Note
## that if you change this variable, you have to re-index the author
## index via `bibindex -w author -R'.
CFG_BIBINDEX_AUTHOR_WORD_INDEX_EXCLUDE_FIRST_NAMES = False

## CFG_BIBINDEX_SYNONYM_KBRS -- defines which knowledge bases are to
## be used for which index in order to provide index-time synonym
## lookup, and what massaging function should be used upon the search
## pattern before performing the KB lookup. (Can be one of `exact',
## `leading_to_comma', `leading_to_number'.)
CFG_BIBINDEX_SYNONYM_KBRS = {
    'global': ['INDEX-SYNONYM-TITLE', 'exact'],
    'title': ['INDEX-SYNONYM-TITLE', 'exact'],
    }

#######################################
## Part 7: Access control parameters ##
#######################################

## This section contains some configuration parameters for the access
## control system. Please note that WebAccess is mostly configured on
## run-time via its WebAccess Admin web interface. The parameters
## below are the ones that you probably do not want to modify very
## often during the runtime. (If you do want to modify them during
## runtime, for example to deny access temporarily because of backups,
## you can edit access_control_config.py directly, no need to get back
## here and no need to redo the make process.)

## CFG_ACCESS_CONTROL_LEVEL_SITE -- defines how open this site is.
## Use 0 for normal operation of the site, 1 for read-only site (all
## write operations temporarily closed), 2 for site fully closed,
## 3 for also disabling any database connection.
## Useful for site maintenance.
CFG_ACCESS_CONTROL_LEVEL_SITE = 0

## CFG_ACCESS_CONTROL_LEVEL_GUESTS -- guest users access policy. Use
## 0 to allow guest users, 1 not to allow them (all users must login).
CFG_ACCESS_CONTROL_LEVEL_GUESTS = 0

## CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS -- account registration and
## activation policy. When 0, users can register and accounts are
## automatically activated. When 1, users can register but admin must
## activate the accounts. When 2, users cannot register nor update
## their email address, only admin can register accounts. When 3,
## users cannot register nor update email address nor password, only
## admin can register accounts. When 4, the same as 3 applies, plus
## users cannot change their login method. When 5, the same as 4
## applies, plus info about how to get an account is hidden from the
## login page.
CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS = 0

## CFG_ACCESS_CONTROL_LIMIT_REGISTRATION_TO_DOMAIN -- limit account
## registration to certain email addresses? If wanted, give domain
## name below, e.g. "cern.ch". If not wanted, leave it empty.
CFG_ACCESS_CONTROL_LIMIT_REGISTRATION_TO_DOMAIN =

## CFG_ACCESS_CONTROL_NOTIFY_ADMIN_ABOUT_NEW_ACCOUNTS -- send a
## notification email to the administrator when a new account is
## created? Use 0 for no, 1 for yes.
CFG_ACCESS_CONTROL_NOTIFY_ADMIN_ABOUT_NEW_ACCOUNTS = 0

## CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_NEW_ACCOUNT -- send a
## notification email to the user when a new account is created, in order
## to verify the validity of the provided email address? Use
## 0 for no, 1 for yes.
CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_NEW_ACCOUNT = 1

## CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_ACTIVATION -- send a
## notification email to the user when a new account is activated?
## Use 0 for no, 1 for yes.
CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_ACTIVATION = 0
## CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_DELETION -- send a
## notification email to the user when an account is deleted or an
## account demand is rejected? Use 0 for no, 1 for yes.
CFG_ACCESS_CONTROL_NOTIFY_USER_ABOUT_DELETION = 0

## CFG_APACHE_PASSWORD_FILE -- the file where Apache user credentials
## are stored. Must be an absolute pathname. If the value does not
## start with a slash, it is considered to be the filename of a file
## located under the prefix/var/tmp directory. This is useful for the
## demo site testing purposes. For the production site, if you plan
## to restrict access to some collections based on the Apache user
## authentication mechanism, you should put here an absolute path to
## your Apache password file.
CFG_APACHE_PASSWORD_FILE = demo-site-apache-user-passwords

## CFG_APACHE_GROUP_FILE -- the file where Apache user groups are
## defined. See the documentation of the preceding config variable.
CFG_APACHE_GROUP_FILE = demo-site-apache-user-groups
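## For illustration, such a password file can be produced with Apache's
## htpasswd utility, e.g. (hypothetical user name):
##   $ htpasswd -c demo-site-apache-user-passwords jekyll
## run from within the prefix/var/tmp directory in the demo setup.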
###################################
## Part 8: WebSession parameters ##
###################################

## This section contains some configuration parameters for tweaking
## session handling.

## CFG_WEBSESSION_EXPIRY_LIMIT_DEFAULT -- number of days after which a session
## and the corresponding cookie is considered expired.
CFG_WEBSESSION_EXPIRY_LIMIT_DEFAULT = 2

## CFG_WEBSESSION_EXPIRY_LIMIT_REMEMBER -- number of days after which a session
## and the corresponding cookie is considered expired, when the user has
## requested to permanently stay logged in.
CFG_WEBSESSION_EXPIRY_LIMIT_REMEMBER = 365

## CFG_WEBSESSION_RESET_PASSWORD_EXPIRE_IN_DAYS -- when a user requests
## a password reset, for how many days is the URL valid?
CFG_WEBSESSION_RESET_PASSWORD_EXPIRE_IN_DAYS = 3

## CFG_WEBSESSION_ADDRESS_ACTIVATION_EXPIRE_IN_DAYS -- when an account
## activation email was sent, for how many days is the URL valid?
CFG_WEBSESSION_ADDRESS_ACTIVATION_EXPIRE_IN_DAYS = 3

## CFG_WEBSESSION_NOT_CONFIRMED_EMAIL_ADDRESS_EXPIRE_IN_DAYS -- when a
## user does not confirm his email address and does not complete the
## registration, after how many days will the account expire?
CFG_WEBSESSION_NOT_CONFIRMED_EMAIL_ADDRESS_EXPIRE_IN_DAYS = 10

## CFG_WEBSESSION_DIFFERENTIATE_BETWEEN_GUESTS -- when set to 1, the session
## system allocates the same uid=0 to all guest users regardless of where they
## come from. When set to 0, a unique uid is allocated to each guest.
CFG_WEBSESSION_DIFFERENTIATE_BETWEEN_GUESTS = 0

################################
## Part 9: BibRank parameters ##
################################

## This section contains some configuration parameters for the ranking
## system.

## CFG_BIBRANK_SHOW_READING_STATS -- do we want to show reading
## similarity stats? ('People who viewed this page also viewed')
CFG_BIBRANK_SHOW_READING_STATS = 1

## CFG_BIBRANK_SHOW_DOWNLOAD_STATS -- do we want to show the download
## similarity stats? ('People who downloaded this document also
## downloaded')
CFG_BIBRANK_SHOW_DOWNLOAD_STATS = 1

## CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS -- do we want to show the download
## history graph? (0=no | 1=classic/gnuplot | 2=flot)
CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS = 1

## CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS_CLIENT_IP_DISTRIBUTION -- do we
## want to show a graph representing the distribution of client IPs
## downloading a given document?
CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS_CLIENT_IP_DISTRIBUTION = 0

## CFG_BIBRANK_SHOW_CITATION_LINKS -- do we want to show the 'Cited
## by' links? (Useful only when you have citations in the metadata.)
CFG_BIBRANK_SHOW_CITATION_LINKS = 1

## CFG_BIBRANK_SHOW_CITATION_STATS -- do we want to show citation
## stats? ('Cited by M records', 'Co-cited with N records')
CFG_BIBRANK_SHOW_CITATION_STATS = 1

## CFG_BIBRANK_SHOW_CITATION_GRAPHS -- do we want to show the citation
## history graph? (0=no | 1=classic/gnuplot | 2=flot)
CFG_BIBRANK_SHOW_CITATION_GRAPHS = 1

####################################
## Part 10: WebComment parameters ##
####################################

## This section contains some configuration parameters for the
## commenting and reviewing facilities.

## CFG_WEBCOMMENT_ALLOW_COMMENTS -- do we want to allow users to write
## public comments on records?
CFG_WEBCOMMENT_ALLOW_COMMENTS = 1

## CFG_WEBCOMMENT_ALLOW_REVIEWS -- do we want to allow users to write
## public reviews of records?
CFG_WEBCOMMENT_ALLOW_REVIEWS = 1

## CFG_WEBCOMMENT_ALLOW_SHORT_REVIEWS -- do we want to allow short
## reviews, that is just the attribution of stars without submitting
## detailed review text?
CFG_WEBCOMMENT_ALLOW_SHORT_REVIEWS = 0

## CFG_WEBCOMMENT_NB_REPORTS_BEFORE_SEND_EMAIL_TO_ADMIN -- if users
## report a comment to be abusive, how many reports have to be filed
## before the site admin is alerted?
CFG_WEBCOMMENT_NB_REPORTS_BEFORE_SEND_EMAIL_TO_ADMIN = 5

## CFG_WEBCOMMENT_NB_COMMENTS_IN_DETAILED_VIEW -- how many comments do
## we display in the detailed record page upon welcome?
CFG_WEBCOMMENT_NB_COMMENTS_IN_DETAILED_VIEW = 1

## CFG_WEBCOMMENT_NB_REVIEWS_IN_DETAILED_VIEW -- how many reviews do
## we display in the detailed record page upon welcome?
CFG_WEBCOMMENT_NB_REVIEWS_IN_DETAILED_VIEW = 1

## CFG_WEBCOMMENT_ADMIN_NOTIFICATION_LEVEL -- do we notify the site
## admin after every comment?
CFG_WEBCOMMENT_ADMIN_NOTIFICATION_LEVEL = 1

## CFG_WEBCOMMENT_TIMELIMIT_PROCESSING_COMMENTS_IN_SECONDS -- how many
## elapsed seconds do we consider enough when checking for possible
## multiple comment submissions by a user?
CFG_WEBCOMMENT_TIMELIMIT_PROCESSING_COMMENTS_IN_SECONDS = 20

## CFG_WEBCOMMENT_TIMELIMIT_PROCESSING_REVIEWS_IN_SECONDS -- how many
## elapsed seconds do we consider enough when checking for possible
## multiple review submissions by a user?
CFG_WEBCOMMENT_TIMELIMIT_PROCESSING_REVIEWS_IN_SECONDS = 20

## CFG_WEBCOMMENT_USE_RICH_TEXT_EDITOR -- enable the WYSIWYG
## JavaScript-based editor when the user edits comments?
CFG_WEBCOMMENT_USE_RICH_TEXT_EDITOR = False

## CFG_WEBCOMMENT_ALERT_ENGINE_EMAIL -- the email address from which the
## alert emails will appear to be sent:
CFG_WEBCOMMENT_ALERT_ENGINE_EMAIL = info@invenio-software.org

## CFG_WEBCOMMENT_DEFAULT_MODERATOR -- if no rules are
## specified to indicate who is the comment moderator of
## a collection, this person will be used as default.
CFG_WEBCOMMENT_DEFAULT_MODERATOR = info@invenio-software.org

## CFG_WEBCOMMENT_USE_MATHJAX_IN_COMMENTS -- do we want to allow the use
## of the MathJax plugin to render LaTeX input in comments?
CFG_WEBCOMMENT_USE_MATHJAX_IN_COMMENTS = 1

## CFG_WEBCOMMENT_AUTHOR_DELETE_COMMENT_OPTION -- allow a comment author to
## delete his own comment?
CFG_WEBCOMMENT_AUTHOR_DELETE_COMMENT_OPTION = 1
# CFG_WEBCOMMENT_EMAIL_REPLIES_TO -- which fields of the record define
# email addresses that should be notified of newly submitted comments,
# and for which collection. Use collection names as keys, and lists of
# tags as values.
CFG_WEBCOMMENT_EMAIL_REPLIES_TO = {
    'Articles': ['506__d', '506__m'],
    }

# CFG_WEBCOMMENT_RESTRICTION_DATAFIELD -- which field of the record
# defines the restriction (must be linked to WebAccess
# 'viewrestrcomment') to apply to newly submitted comments, and for
# which collection. Use collection names as keys, and one tag as value.
CFG_WEBCOMMENT_RESTRICTION_DATAFIELD = {
    'Articles': '5061_a',
    'Pictures': '5061_a',
    'Theses': '5061_a',
    }

# CFG_WEBCOMMENT_ROUND_DATAFIELD -- which field of the record defines
# the current round of comments for which collection. Use collection
# names as keys, and one tag as value.
CFG_WEBCOMMENT_ROUND_DATAFIELD = {
    'Articles': '562__c',
    'Pictures': '562__c',
    }

# CFG_WEBCOMMENT_MAX_ATTACHMENT_SIZE -- max file size per attached
# file, in bytes. Choose 0 if you don't want to limit the size.
CFG_WEBCOMMENT_MAX_ATTACHMENT_SIZE = 5242880

# CFG_WEBCOMMENT_MAX_ATTACHED_FILES -- maximum number of files that can
# be attached per comment. Choose 0 if you don't want to limit the
# number of files. File uploads can be restricted with action
# "attachcommentfile".
CFG_WEBCOMMENT_MAX_ATTACHED_FILES = 5

# CFG_WEBCOMMENT_MAX_COMMENT_THREAD_DEPTH -- how many levels of
# indentation discussions can reach. This can be used to ensure that
# discussions will not go into deep levels of nesting if users don't
# understand the difference between "reply to comment" and "add
# comment". When the depth is reached, any "reply to comment" is
# conceptually converted to a "reply to thread" (i.e. a reply to this
# parent's comment). Use -1 for no limit, 0 for unthreaded (flat)
# discussions.
CFG_WEBCOMMENT_MAX_COMMENT_THREAD_DEPTH = 1

##################################
## Part 11: BibSched parameters ##
##################################

## This section contains some configuration parameters for the
## bibliographic task scheduler.

## CFG_BIBSCHED_REFRESHTIME -- how often do we want to refresh the
## bibsched monitor? (in seconds)
CFG_BIBSCHED_REFRESHTIME = 5

## CFG_BIBSCHED_LOG_PAGER -- what pager to use to view bibsched task
## logs?
CFG_BIBSCHED_LOG_PAGER = /bin/more

## CFG_BIBSCHED_GC_TASKS_OLDER_THAN -- after how many days to perform the
## garbage collection of the BibSched queue (i.e. removing/moving tasks
## to the archive).
CFG_BIBSCHED_GC_TASKS_OLDER_THAN = 30

## CFG_BIBSCHED_GC_TASKS_TO_REMOVE -- list of BibTasks that can be safely
## removed from the BibSched queue once they are DONE.
CFG_BIBSCHED_GC_TASKS_TO_REMOVE = bibindex,bibreformat,webcoll,bibrank,inveniogc

## CFG_BIBSCHED_GC_TASKS_TO_ARCHIVE -- list of BibTasks that should be safely
## archived out of the BibSched queue once they are DONE.
CFG_BIBSCHED_GC_TASKS_TO_ARCHIVE = bibupload,oaiarchive

## CFG_BIBSCHED_MAX_NUMBER_CONCURRENT_TASKS -- maximum number of BibTasks
## that can run concurrently.
## NOTE: concurrent tasks are still considered as an experimental
## feature. Please keep this value set to 1 on production environments.
CFG_BIBSCHED_MAX_NUMBER_CONCURRENT_TASKS = 1
## CFG_BIBSCHED_PROCESS_USER -- bibsched and bibtask processes must
## usually run under the same identity as the Apache web server
## process in order to share proper file read/write privileges. If
## you want to force some other bibsched/bibtask user, e.g. because
## you are using a local `invenio' user that belongs to your
## `www-data' Apache user group and so shares writing rights with your
## Apache web server process in this way, then please set its username
## identity here. Otherwise we shall check whether your
## bibsched/bibtask processes are run under the same identity as your
## Apache web server process (in which case you can leave the default
## empty value here).
CFG_BIBSCHED_PROCESS_USER =

## CFG_BIBSCHED_NODE_TASKS -- specific nodes may be configured to
## run only specific tasks; if you want this, then this variable is a
## dictionary of the form {'hostname1': ['task1', 'task2']}. The
## default is that any node can run any task.
CFG_BIBSCHED_NODE_TASKS = {}
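## For example, to dedicate one node to indexing and formatting only,
## you would put (hypothetical hostname):
## CFG_BIBSCHED_NODE_TASKS = {'node1.example.org': ['bibindex', 'bibreformat']}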
###################################
## Part 12: WebBasket parameters ##
###################################

## CFG_WEBBASKET_MAX_NUMBER_OF_DISPLAYED_BASKETS -- a safety limit for
## the maximum number of displayed baskets.
CFG_WEBBASKET_MAX_NUMBER_OF_DISPLAYED_BASKETS = 20

## CFG_WEBBASKET_USE_RICH_TEXT_EDITOR -- enable the WYSIWYG
## JavaScript-based editor when the user edits comments in WebBasket?
CFG_WEBBASKET_USE_RICH_TEXT_EDITOR = False

##################################
## Part 13: WebAlert parameters ##
##################################

## This section contains some configuration parameters for the
## automatic email notification alert system.

## CFG_WEBALERT_ALERT_ENGINE_EMAIL -- the email address from which the
## alert emails will appear to be sent:
CFG_WEBALERT_ALERT_ENGINE_EMAIL = info@invenio-software.org

## CFG_WEBALERT_MAX_NUM_OF_RECORDS_IN_ALERT_EMAIL -- how many records
## at most do we send in an outgoing alert email?
CFG_WEBALERT_MAX_NUM_OF_RECORDS_IN_ALERT_EMAIL = 20

## CFG_WEBALERT_MAX_NUM_OF_CHARS_PER_LINE_IN_ALERT_EMAIL -- number of
## chars per line in an outgoing alert email?
CFG_WEBALERT_MAX_NUM_OF_CHARS_PER_LINE_IN_ALERT_EMAIL = 72

## CFG_WEBALERT_SEND_EMAIL_NUMBER_OF_TRIES -- when sending alert
## emails fails, how many times do we retry?
CFG_WEBALERT_SEND_EMAIL_NUMBER_OF_TRIES = 3

## CFG_WEBALERT_SEND_EMAIL_SLEEPTIME_BETWEEN_TRIES -- when sending
## alert emails fails, what is the sleeptime between tries? (in
## seconds)
CFG_WEBALERT_SEND_EMAIL_SLEEPTIME_BETWEEN_TRIES = 300

####################################
## Part 14: WebMessage parameters ##
####################################

## CFG_WEBMESSAGE_MAX_SIZE_OF_MESSAGE -- how large web messages do we
## allow?
CFG_WEBMESSAGE_MAX_SIZE_OF_MESSAGE = 20000

## CFG_WEBMESSAGE_MAX_NB_OF_MESSAGES -- how many messages do we allow
## in a regular user's inbox?
CFG_WEBMESSAGE_MAX_NB_OF_MESSAGES = 30

## CFG_WEBMESSAGE_DAYS_BEFORE_DELETE_ORPHANS -- how many days before
## we delete orphaned messages?
CFG_WEBMESSAGE_DAYS_BEFORE_DELETE_ORPHANS = 60

##################################
## Part 15: MiscUtil parameters ##
##################################

## CFG_MISCUTIL_SQL_USE_SQLALCHEMY -- whether to use SQLAlchemy.pool
## in the DB engine of Invenio. It is okay to enable this flag
## even if you have not installed SQLAlchemy. Note that Invenio will
## lose some performance if this option is enabled.
CFG_MISCUTIL_SQL_USE_SQLALCHEMY = False

## CFG_MISCUTIL_SQL_RUN_SQL_MANY_LIMIT -- how many queries can we run
## inside run_sql_many() in one SQL statement? The limit value
## depends on MySQL's max_allowed_packet configuration.
CFG_MISCUTIL_SQL_RUN_SQL_MANY_LIMIT = 10000

## CFG_MISCUTIL_SMTP_HOST -- which server to use as outgoing mail server to
## send outgoing emails generated by the system, for example concerning
## submissions or email notification alerts.
CFG_MISCUTIL_SMTP_HOST = localhost

## CFG_MISCUTIL_SMTP_PORT -- which port to use on the outgoing mail server
## defined in the previous step.
CFG_MISCUTIL_SMTP_PORT = 25

## CFG_MISCUTIL_DEFAULT_PROCESS_TIMEOUT -- the default number of seconds after
## which a process launched through shellutils.run_process_with_timeout will
## be killed. This is useful to catch runaway processes.
CFG_MISCUTIL_DEFAULT_PROCESS_TIMEOUT = 300

## CFG_MATHJAX_HOSTING -- if you plan to use MathJax to display TeX
## formulas on HTML web pages, you can specify whether you wish to use
## 'local' hosting or 'cdn' hosting of MathJax libraries. (If set to
## 'local', you have to run 'make install-mathjax-plugin' as described
## in the INSTALL guide.) If set to 'local', users will use your site
## to download MathJax sources. If set to 'cdn', users will use
## centralized MathJax CDN servers instead. Please note that using
## CDN is suitable only for small institutions or for MathJax
## sponsors; see the MathJax website for more details. (Also, please
## note that if you plan to use MathJax on your site, you have to
## adapt the CFG_WEBSEARCH_USE_MATHJAX_FOR_FORMATS and
## CFG_WEBCOMMENT_USE_MATHJAX_IN_COMMENTS configuration variables
## elsewhere in this file.)
CFG_MATHJAX_HOSTING = local

#################################
## Part 16: BibEdit parameters ##
#################################

## CFG_BIBEDIT_TIMEOUT -- when a user edits a record, this record is
## locked to prevent other users from editing it at the same time.
## How many seconds of inactivity before the locked record is again free
## for other people to edit?
CFG_BIBEDIT_TIMEOUT = 3600

## CFG_BIBEDIT_LOCKLEVEL -- when a user tries to edit a record for which there
## is a pending bibupload task in the queue, this shouldn't be permitted.
## The lock level determines how thoroughly the queue should be investigated
## to determine if this is the case.
## Level 0 - always permits editing, doesn't look at the queue
##           (unsafe, use only if you know what you are doing)
## Level 1 - permits editing if there are no queued bibedit tasks for this record
##           (safe with respect to bibedit, but not for other bibupload maintenance jobs)
## Level 2 - permits editing if there are no queued bibupload tasks of any sort
##           (safe, but may lock more than necessary if many cataloguers are around)
## Level 3 - permits editing if no queued bibupload task concerns the given record
##           (safe, most precise locking, but slow,
##           checks for 001/EXTERNAL_SYSNO_TAG/EXTERNAL_OAIID_TAG)
## The recommended level is 3 (default) or 2 (if you use maintenance jobs often).
CFG_BIBEDIT_LOCKLEVEL = 3

## CFG_BIBEDIT_PROTECTED_FIELDS -- a comma-separated list of fields that BibEdit
## will not allow to be added, edited or deleted. Wildcards are not supported,
## but conceptually a wildcard is added at the end of every field specification.
## Examples:
## 500A   - protect all MARC fields with tag 500 and first indicator A
## 5      - protect all MARC fields in the 500-series
## 909C_a - protect subfield a in tag 909 with first indicator C and empty
##          second indicator
## Note that 001 is protected by default, but if protection of other
## identifiers or automated fields is a requirement, they should be added to
## this list.
CFG_BIBEDIT_PROTECTED_FIELDS =
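## For example, to protect the whole 500-series as well as subfield a
## of 909C_, you would put (illustrative):
## CFG_BIBEDIT_PROTECTED_FIELDS = 5,909C_a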
## CFG_BIBEDIT_QUEUE_CHECK_METHOD -- how do we want to check for
## possible queue locking situations to prevent cataloguers from
## editing a record that may be waiting in the queue? Use 'bibrecord'
## for exact checking (always works, but may be slow), use 'regexp'
## for regular expression based checking (very fast, but may be
## inaccurate). When unsure, use 'bibrecord'.
CFG_BIBEDIT_QUEUE_CHECK_METHOD = bibrecord

## CFG_BIBEDIT_EXTEND_RECORD_WITH_COLLECTION_TEMPLATE -- a dictionary
## containing which collections will be extended with a given template
## while being displayed in the BibEdit UI.
CFG_BIBEDIT_EXTEND_RECORD_WITH_COLLECTION_TEMPLATE = { 'Poetry' : 'poem'}

## CFG_BIBEDITMULTI_LIMIT_INSTANT_PROCESSING -- maximum number of records
## that can be modified instantly using the multi-record editor. Above
## this limit, modifications will only be executed in limited hours.
CFG_BIBEDITMULTI_LIMIT_INSTANT_PROCESSING = 2000

## CFG_BIBEDITMULTI_LIMIT_DELAYED_PROCESSING -- maximum number of records
## that can be sent for modification without having a superadmin role.
## If the number of records is between CFG_BIBEDITMULTI_LIMIT_INSTANT_PROCESSING
## and this number, the modifications will take place only in limited hours.
CFG_BIBEDITMULTI_LIMIT_DELAYED_PROCESSING = 20000

## CFG_BIBEDITMULTI_LIMIT_DELAYED_PROCESSING_TIME -- allowed time to
## execute modifications on records, when the number exceeds
## CFG_BIBEDITMULTI_LIMIT_INSTANT_PROCESSING.
CFG_BIBEDITMULTI_LIMIT_DELAYED_PROCESSING_TIME = 22:00-05:00

###################################
## Part 17: BibUpload parameters ##
###################################

## CFG_BIBUPLOAD_REFERENCE_TAG -- where do we store references?
CFG_BIBUPLOAD_REFERENCE_TAG = 999

## CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG -- where do we store external
## system numbers? Useful for matching when our records come from an
## external digital library system.
CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG = 970__a

## CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG -- where do we store OAI ID tags
## of harvested records? Useful for matching when we harvest stuff
## via OAI that we do not want to reexport via Invenio OAI; so records
## may have only the source OAI ID stored in this tag (kind of like an
## external system number too).
CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG = 035__a

## CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG -- where do we store OAI SRC
## tags of harvested records? Useful for matching when we harvest stuff
## via OAI that we do not want to reexport via Invenio OAI; so records
## may have only the source OAI SRC stored in this tag (kind of like an
## external system number too). Note that the field should be the same as
## CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG.
CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG = 035__9

## CFG_BIBUPLOAD_STRONG_TAGS -- a comma-separated list of tags that
## are strong enough to resist the replace mode. Useful for tags that
## might be created from an external non-metadata-like source,
## e.g. the information about the number of copies left.
CFG_BIBUPLOAD_STRONG_TAGS = 964

## CFG_BIBUPLOAD_CONTROLLED_PROVENANCE_TAGS -- a comma-separated list
## of tags that contain provenance information that should be checked
## in the bibupload correct mode via matching provenance codes. (Only
## field instances of the same provenance information would be acted
## upon.) Please specify the whole tag info up to subfield codes.
CFG_BIBUPLOAD_CONTROLLED_PROVENANCE_TAGS = 6531_9

## CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS -- a comma-separated list of system
## paths from which it is allowed to take fulltexts that will be uploaded via
## FFT (CFG_TMPDIR is included by default).
CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS = /tmp,/home
## CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS -- a dictionary containing external
## URLs that can be accessed by Invenio and specific HTTP headers that will be
## used for each URL.
## The keys of the dictionary are regular expressions matching a set of URLs,
## the values are dictionaries of headers as consumed by urllib2.Request. If a
## regular expression matching all URLs is created at the end of the list, it
## means that Invenio will download all URLs. Otherwise Invenio will just
## download authorized URLs.
## CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS = [
##     ('http://myurl.com/.*', {'User-Agent': 'Me'}),
##     ('http://yoururl.com/.*', {'User-Agent': 'You', 'Accept': 'text/plain'}),
##     ('http://.*', {'User-Agent': 'Invenio'}),
## ]
CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS = [
    ('http://.*', {'User-Agent': 'Invenio'}),
    ]

## CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE -- do we want to serialize the
## internal representation of records (Pythonic record structure) into
## the database? This can improve the internal processing speed of some
## operations at the price of somewhat bigger disk space usage.
## If you change this value after some records have already been added
## to your installation, you may want to run:
## $ /opt/invenio/bin/inveniocfg --reset-recstruct-cache
## in order to either erase the cache thus freeing database space,
## or to fill the cache for all records that have not been cached yet.
CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE = 1

## CFG_BATCHUPLOADER_FILENAME_MATCHING_POLICY -- a comma-separated list
## indicating which fields match the file names of the documents to be
## uploaded.
## The matching will be done in the same order as the list provided.
CFG_BATCHUPLOADER_FILENAME_MATCHING_POLICY = reportnumber,recid

## CFG_BATCHUPLOADER_DAEMON_DIR -- directory where the batchuploader daemon
## will look for the subfolders metadata and document by default.
## If the path is relative, CFG_PREFIX will be joined as a prefix.
CFG_BATCHUPLOADER_DAEMON_DIR = var/batchupload

## CFG_BATCHUPLOADER_WEB_ROBOT_AGENT -- comma-separated list specifying the
## user agents permitted to call the batch uploader web interface
## cdsweb.cern.ch/batchuploader/robotupload,
## e.g. when using curl: curl xxx -A invenio_webupload
CFG_BATCHUPLOADER_WEB_ROBOT_AGENT = invenio_webupload

## CFG_BATCHUPLOADER_WEB_ROBOT_RIGHTS -- access list specifying, for each
## IP address, which collections are allowed using the batch uploader robot
## interface.
CFG_BATCHUPLOADER_WEB_ROBOT_RIGHTS = {
    '10.0.0.1': ['BOOK', 'REPORT'], # Example 1
    '10.0.0.2': ['POETRY', 'PREPRINT'], # Example 2
    }
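## For illustration, a permitted robot client could then upload a
## MARCXML file with something like (the form field name and file are
## hypothetical):
##   $ curl -F 'file=@records.xml' -A invenio_webupload \
##       http://cdsweb.cern.ch/batchuploader/robotupload
## where -A sets the user agent to one of the accepted values above.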
####################################
## Part 18: BibCatalog parameters ##
####################################

## CFG_BIBCATALOG_SYSTEM -- set the desired catalog system. For example, RT.
CFG_BIBCATALOG_SYSTEM =

## RT CONFIGURATION
## CFG_BIBCATALOG_SYSTEM_RT_CLI -- path to the RT CLI client
CFG_BIBCATALOG_SYSTEM_RT_CLI = /usr/bin/rt

## CFG_BIBCATALOG_SYSTEM_RT_URL -- base URL of the remote RT system
CFG_BIBCATALOG_SYSTEM_RT_URL = http://localhost/rt3

## CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_USER -- set the username for a default RT account
## on the remote system, with limited privileges, in order to only create and modify own tickets.
CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_USER =

## CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_PWD -- set the password for the default RT account
## on the remote system.
CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_PWD =

####################################
## Part 19: BibFormat parameters  ##
####################################

## CFG_BIBFORMAT_HIDDEN_TAGS -- comma-separated list of MARC tags that
## are not shown to users not having cataloging authorizations.
CFG_BIBFORMAT_HIDDEN_TAGS = 595

## CFG_BIBFORMAT_ADDTHIS_ID -- if you want to use the AddThis service from
## http://www.addthis.com/, set this value to the pubid parameter as
## provided by the service (e.g. ra-4ff80aae118f4dad), and add a call to
## the BFE_ADDTHIS formatting element in your formats, for example
## Default_HTML_detailed.bft.
CFG_BIBFORMAT_ADDTHIS_ID =

####################################
## Part 20: BibMatch parameters   ##
####################################

## CFG_BIBMATCH_LOCAL_SLEEPTIME -- determines the amount of seconds to sleep
## between search queries on the LOCAL system.
CFG_BIBMATCH_LOCAL_SLEEPTIME = 0.0

## CFG_BIBMATCH_REMOTE_SLEEPTIME -- determines the amount of seconds to sleep
## between search queries on REMOTE systems.
CFG_BIBMATCH_REMOTE_SLEEPTIME = 2.0

## CFG_BIBMATCH_FUZZY_WORDLIMITS -- determines the number of words to extract
## from a certain field's value during fuzzy matching mode. Add/change the
## field and the appropriate number in the dictionary to configure this.
CFG_BIBMATCH_FUZZY_WORDLIMITS = {
    '100__a': 2,
    '245__a': 4
    }

## CFG_BIBMATCH_FUZZY_EMPTY_RESULT_LIMIT -- determines the number of empty
## results to accept during fuzzy matching mode.
CFG_BIBMATCH_FUZZY_EMPTY_RESULT_LIMIT = 1

## CFG_BIBMATCH_QUERY_TEMPLATES -- here you can set the various predefined querystrings
## used to standardize common matching queries. By default the following templates
## are given:
## title        - standard title search. Taken from 245__a (default)
## title-author - title and author search (i.e. a title AND an author search).
##                Taken from 245__a and 100__a
## reportnumber - reportnumber search (i.e. reportnumber:REP-NO-123).
CFG_BIBMATCH_QUERY_TEMPLATES = {
    'title' : '[title]',
    'title-author' : '[title] [author]',
    'reportnumber' : 'reportnumber:[reportnumber]'
    }
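## For illustration: when matching a record whose 245__a is 'Higgs
## boson searches' and whose 100__a is 'Ellis, J' (hypothetical
## values), the 'title-author' template above expands to the query
## 'Higgs boson searches Ellis, J'; in fuzzy matching mode, the word
## limits above would additionally cap how many words are taken from
## each field.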
"external_arxivids" for arXiv SSO CFG_BIBAUTHORID_EXTERNAL_CLAIMED_RECORDS_KEY = # CFG_BIBAUTHORID_ATTACH_VA_TO_MULTIPLE_RAS determines if the authorid # algorithm is allowed to attach a virtual author to multiple # real authors in the last run of the orphan processing. # Comma separated list of values. CFG_BIBAUTHORID_ATTACH_VA_TO_MULTIPLE_RAS = False # CFG_BIBAUTHORID_AID_ENABLED # Globally enable AuthorID Interfaces. # If False: No guest, user or operator will have access to the system. CFG_BIBAUTHORID_ENABLED = True # CFG_BIBAUTHORID_AID_ON_AUTHORPAGES # Enable AuthorID information on the author pages. CFG_BIBAUTHORID_ON_AUTHORPAGES = True # CFG_BIBAUTHORID_AUTHOR_TICKET_ADMIN_EMAIL defines the eMail address # all ticket requests concerning authors will be sent to. CFG_BIBAUTHORID_AUTHOR_TICKET_ADMIN_EMAIL = info@invenio-software.org #CFG_BIBAUTHORID_UI_SKIP_ARXIV_STUB_PAGE defines if the optional arXive stub page is skipped CFG_BIBAUTHORID_UI_SKIP_ARXIV_STUB_PAGE = False ###################################### ## Part 22: BibClassify parameters ## ###################################### # CFG_BIBCLASSIFY_WEB_MAXKW -- maximum number of keywords to display # in the Keywords tab web page. CFG_BIBCLASSIFY_WEB_MAXKW = 100 ######################################## ## Part 23: Plotextractor parameters ## ######################################## ## CFG_PLOTEXTRACTOR_SOURCE_BASE_URL -- for acquiring source tarballs for plot ## extraction, where should we look? If nothing is set, we'll just go ## to arXiv, but this can be a filesystem location, too CFG_PLOTEXTRACTOR_SOURCE_BASE_URL = http://arxiv.org/ ## CFG_PLOTEXTRACTOR_SOURCE_TARBALL_FOLDER -- for acquiring source tarballs for plot ## extraction, subfolder where the tarballs sit CFG_PLOTEXTRACTOR_SOURCE_TARBALL_FOLDER = e-print/ ## CFG_PLOTEXTRACTOR_SOURCE_PDF_FOLDER -- for acquiring source tarballs for plot ## extraction, subfolder where the pdf sit CFG_PLOTEXTRACTOR_SOURCE_PDF_FOLDER = pdf/ ## CFG_PLOTEXTRACTOR_DOWNLOAD_TIMEOUT -- a float representing the number of seconds ## to wait between each download of pdf and/or tarball from source URL. CFG_PLOTEXTRACTOR_DOWNLOAD_TIMEOUT = 2.0 ## CFG_PLOTEXTRACTOR_CONTEXT_LIMIT -- when extracting context of plots from ## TeX sources, this is the limitation of characters in each direction to extract ## context from. Default 750. CFG_PLOTEXTRACTOR_CONTEXT_EXTRACT_LIMIT = 750 ## CFG_PLOTEXTRACTOR_DISALLOWED_TEX -- when extracting context of plots from TeX ## sources, this is the list of TeX tags that will trigger 'end of context'. CFG_PLOTEXTRACTOR_DISALLOWED_TEX = begin,end,section,includegraphics,caption,acknowledgements ## CFG_PLOTEXTRACTOR_CONTEXT_WORD_LIMIT -- when extracting context of plots from ## TeX sources, this is the limitation of words in each direction. Default 75. CFG_PLOTEXTRACTOR_CONTEXT_WORD_LIMIT = 75 ## CFG_PLOTEXTRACTOR_CONTEXT_SENTENCE_LIMIT -- when extracting context of plots from ## TeX sources, this is the limitation of sentences in each direction. Default 2. CFG_PLOTEXTRACTOR_CONTEXT_SENTENCE_LIMIT = 2 ###################################### ## Part 24: WebStat parameters ## ###################################### # CFG_WEBSTAT_BIBCIRCULATION_START_YEAR defines the start date of the BibCirculation # statistics. Value should have the format 'yyyy'. If empty, take all existing data. CFG_WEBSTAT_BIBCIRCULATION_START_YEAR = ########################## ## THAT's ALL, FOLKS! 
## ########################## diff --git a/configure.ac b/configure.ac index 0de604829..7be5c1417 100644 --- a/configure.ac +++ b/configure.ac @@ -1,881 +1,882 @@ ## This file is part of Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. ## This is Invenio's main configure.ac file. If you change this ## file, then please run "autoreconf" to regenerate the "configure" ## script. ## Initialize autoconf and automake: AC_INIT([invenio], m4_esyscmd([./git-version-gen .tarball-version]), [info@invenio-software.org]) AM_INIT_AUTOMAKE([tar-ustar]) ## By default we shall install into /opt/invenio. (Do not use ## AC_PREFIX_DEFAULT for this, because it would not work well with ## the localstatedir hack below.) test "${prefix}" = NONE && prefix=/opt/invenio ## Remove any trailing slashes from the prefix value: test "${prefix%/}" != "" && prefix=${prefix%/} ## Check for install: AC_PROG_INSTALL ## Check for gettext support: AM_GNU_GETTEXT(external) AM_GNU_GETTEXT_VERSION(0.14.4) ## Check for MySQL client: AC_MSG_CHECKING(for mysql) AC_ARG_WITH(mysql, AC_HELP_STRING([--with-mysql], [path to a specific MySQL binary (optional)]), MYSQL=${withval}) if test -n "$MYSQL"; then AC_MSG_RESULT($MYSQL) else AC_PATH_PROG(MYSQL, mysql) if test -z "$MYSQL"; then AC_MSG_ERROR([ MySQL command-line client was not found in your PATH. Please install it first. Available from .]) fi fi ## Check for Python: AC_MSG_CHECKING(for python) AC_ARG_WITH(python, AC_HELP_STRING([--with-python], [path to a specific Python binary (optional)]), PYTHON=${withval}) if test -n "$PYTHON"; then AC_MSG_RESULT($PYTHON) else AC_PATH_PROG(PYTHON, python) if test -z "$PYTHON"; then AC_MSG_ERROR([ Python was not found in your PATH. Please either install it in your PATH or specify the --with-python configure option. Python is available from .]) fi fi ## Check for OpenOffice.org Python binary: AC_MSG_CHECKING(for OpenOffice.org Python binary) AC_ARG_WITH(openoffice-python, AC_HELP_STRING([--with-openoffice-python], [path to a specific OpenOffice.org Python binary (optional)]), OPENOFFICE_PYTHON=`which ${withval}`) if test -z "$OPENOFFICE_PYTHON"; then OPENOFFICE_PYTHON=`locate -l 1 -r "o.*office/program/python$"` OPENOFFICE_PYTHON="$PYTHON $OPENOFFICE_PYTHON" if test -n "$OPENOFFICE_PYTHON" && ($OPENOFFICE_PYTHON -c "import uno" 2> /dev/null); then AC_MSG_RESULT($OPENOFFICE_PYTHON) else AC_MSG_WARN([ You have not specified the path to the OpenOffice.org Python binary. OpenOffice.org and Microsoft Office document conversion and fulltext indexing will not be available. We recommend you to install OpenOffice.org first and to rerun the configure script.
OpenOffice.org is available from .]) fi elif ($OPENOFFICE_PYTHON -c "import uno" 2> /dev/null); then AC_MSG_RESULT($OPENOFFICE_PYTHON) else AC_MSG_ERROR([ The specified OpenOffice.org Python binary is not correctly configured. Please specify the correct path to the specific OpenOffice Python binary (OpenOffice.org is available from ).]) fi ## Check for Python version and modules: AC_MSG_CHECKING(for required Python modules) $PYTHON ${srcdir}/configure-tests.py if test $? -ne 0; then AC_MSG_ERROR([Please fix the above Python problem before continuing.]) fi AC_MSG_RESULT(found) ## Check for PHP: AC_PATH_PROG(PHP, php) ## Check for gzip: AC_PATH_PROG(GZIP, gzip) if test -z "$GZIP"; then AC_MSG_WARN([ Gzip was not found in your PATH. It is used in the WebSubmit module to compress the data submitted in an archive. You can continue without it but you will miss some Invenio functionality. We recommend you to install it first and to rerun the configure script. Gzip is available from .]) fi ## Check for gunzip: AC_PATH_PROG(GUNZIP, gunzip) if test -z "$GUNZIP"; then AC_MSG_WARN([ Gunzip was not found in your PATH. It is used in the WebSubmit module to correctly deal with submitted compressed files. You can continue without it but you will miss some Invenio functionality. We recommend you to install it first and to rerun the configure script. Gunzip is available from .]) fi ## Check for tar: AC_PATH_PROG(TAR, tar) if test -z "$TAR"; then AC_MSG_WARN([ Tar was not found in your PATH. It is used in the WebSubmit module to pack the submitted data into an archive. You can continue without it but you will miss some Invenio functionality. We recommend you to install it first and to rerun the configure script. Tar is available from .]) fi ## Check for wget: AC_PATH_PROG(WGET, wget) if test -z "$WGET"; then AC_MSG_WARN([ wget was not found in your PATH. It is used for the fulltext file retrieval. You can continue without it but we recommend you to install it first and to rerun the configure script. wget is available from .]) fi ## Check for md5sum: AC_PATH_PROG(MD5SUM, md5sum) if test -z "$MD5SUM"; then AC_MSG_WARN([ md5sum was not found in your PATH. It is used for the fulltext file checksum verification. You can continue without it but we recommend you to install it first and to rerun the configure script. md5sum is available from .]) fi ## Check for ps2pdf: AC_PATH_PROG(PS2PDF, ps2pdf) if test -z "$PS2PDF"; then AC_MSG_WARN([ ps2pdf was not found in your PATH. It is used in the WebSubmit module to convert submitted PostScripts into PDF. You can continue without it but you will miss some Invenio functionality. We recommend you to install it first and to rerun the configure script. ps2pdf is available from .]) fi ## Check for pdflatex: AC_PATH_PROG(PDFLATEX, pdflatex) if test -z "$PDFLATEX"; then AC_MSG_WARN([ pdflatex was not found in your PATH. It is used in the WebSubmit module to stamp PDF files. You can continue without it but you will miss some Invenio functionality. We recommend you to install it first and to rerun the configure script.]) fi ## Check for tiff2pdf: AC_PATH_PROG(TIFF2PDF, tiff2pdf) if test -z "$TIFF2PDF"; then AC_MSG_WARN([ tiff2pdf was not found in your PATH. It is used in the WebSubmit module to convert submitted TIFF files into PDF. You can continue without it but you will miss some Invenio functionality. We recommend you to install it first and to rerun the configure script.
tiff2pdf is available from .]) fi ## Check for gs: AC_PATH_PROG(GS, gs) if test -z "$GS"; then AC_MSG_WARN([ gs was not found in your PATH. It is used in the WebSubmit module to convert submitted PostScripts into PDF. You can continue without it but you will miss some Invenio functionality. We recommend you to install it first and to rerun the configure script. gs is available from .]) fi ## Check for pdftotext: AC_PATH_PROG(PDFTOTEXT, pdftotext) if test -z "$PDFTOTEXT"; then AC_MSG_WARN([ pdftotext was not found in your PATH. It is used for the fulltext indexation of PDF files. You can continue without it but you may miss the fulltext searching capability of Invenio. We recommend you to install it first and to rerun the configure script. pdftotext is available from . ]) fi ## Check for pdfinfo: AC_PATH_PROG(PDFINFO, pdfinfo) if test -z "$PDFINFO"; then AC_MSG_WARN([ pdfinfo was not found in your PATH. It is used for gathering information on PDF files. You can continue without it but you may miss this feature of Invenio. We recommend you to install it first and to rerun the configure script. pdfinfo is available from . ]) fi ## Check for pdftk: AC_PATH_PROG(PDFTK, pdftk) if test -z "$PDFTK"; then AC_MSG_WARN([ pdftk was not found in your PATH. It is used for the fulltext file stamping. You can continue without it but you may miss this feature of Invenio. We recommend you to install it first and to rerun the configure script. pdftk is available from . ]) fi ## Check for pdf2ps: AC_PATH_PROG(PDF2PS, pdf2ps) if test -z "$PDF2PS"; then AC_MSG_WARN([ pdf2ps was not found in your PATH. It is used in the WebSubmit module to convert submitted PDFs into PostScript. You can continue without it but you will miss some Invenio functionality. We recommend you to install it first and to rerun the configure script. pdf2ps is available from .]) fi ## Check for pdftops: AC_PATH_PROG(PDFTOPS, pdftops) if test -z "$PDFTOPS"; then AC_MSG_WARN([ pdftops was not found in your PATH. It is used in the WebSubmit module to convert submitted PDFs into PostScript. You can continue without it but you will miss some Invenio functionality. We recommend you to install it first and to rerun the configure script. pdftops is available from .]) fi ## Check for pdfopt: AC_PATH_PROG(PDFOPT, pdfopt) if test -z "$PDFOPT"; then AC_MSG_WARN([ pdfopt was not found in your PATH. It is used in the WebSubmit module to linearize submitted PDFs. You can continue without it but you will miss some Invenio functionality. We recommend you to install it first and to rerun the configure script. pdfopt is available from .]) fi ## Check for pdftoppm: AC_PATH_PROG(PDFTOPPM, pdftoppm) if test -z "$PDFTOPPM"; then AC_MSG_WARN([ pdftoppm was not found in your PATH. It is used in the WebSubmit module to extract images from PDFs for OCR. You can continue without it but you will miss some Invenio functionality. We recommend you to install it first and to rerun the configure script. pdftoppm is available from .]) fi ## Check for pamfile: AC_PATH_PROG(PAMFILE, pamfile) if test -z "$PAMFILE"; then AC_MSG_WARN([ pamfile was not found in your PATH. It is used in the WebSubmit module to retrieve the size of images extracted from PDFs for OCR. You can continue without it but you will miss some Invenio functionality. We recommend you to install it first and to rerun the configure script.
pamfile is available as part of the netpbm utilities from: .]) fi ## Check for ocroscript: AC_PATH_PROG(OCROSCRIPT, ocroscript) if test -z "$OCROSCRIPT"; then AC_MSG_WARN([ If you plan to run OCR on your PDFs, then please install ocroscript now. Otherwise you can safely continue. You also have the option to install ocroscript later and edit invenio-local.conf to let Invenio know the path to ocroscript. ocroscript is available as part of OCROpus from . NOTE: Since OCROpus is being actively developed and its API is continuously changing, please install release 0.3.1]) fi ## Check for pstotext: AC_PATH_PROG(PSTOTEXT, pstotext) if test -z "$PSTOTEXT"; then AC_MSG_WARN([ pstotext was not found in your PATH. It is used for the fulltext indexation of PDF and PostScript files. Please install pstotext. Otherwise you can safely continue. You also have the option to install pstotext later and edit invenio-local.conf to let Invenio know the path to pstotext. pstotext is available from . ]) fi ## Check for ps2ascii: AC_PATH_PROG(PSTOASCII, ps2ascii) if test -z "$PSTOASCII"; then AC_MSG_WARN([ ps2ascii was not found in your PATH. It is used for the fulltext indexation of PostScript files. Please install ps2ascii. Otherwise you can safely continue. You also have the option to install ps2ascii later and edit invenio-local.conf to let Invenio know the path to ps2ascii. ps2ascii is available from . ]) fi ## Check for any2djvu: AC_PATH_PROG(ANY2DJVU, any2djvu) if test -z "$ANY2DJVU"; then AC_MSG_WARN([ any2djvu was not found in your PATH. It is used in the WebSubmit module to convert documents to DJVU. Please install any2djvu. Otherwise you can safely continue. You also have the option to install any2djvu later and edit invenio-local.conf to let Invenio know the path to any2djvu. any2djvu is available from .]) fi ## Check for DJVUPS: AC_PATH_PROG(DJVUPS, djvups) if test -z "$DJVUPS"; then AC_MSG_WARN([ djvups was not found in your PATH. It is used in the WebSubmit module to convert documents from DJVU. Please install djvups. Otherwise you can safely continue. You also have the option to install djvups later and edit invenio-local.conf to let Invenio know the path to djvups. djvups is available from .]) fi ## Check for DJVUTXT: AC_PATH_PROG(DJVUTXT, djvutxt) if test -z "$DJVUTXT"; then AC_MSG_WARN([ djvutxt was not found in your PATH. It is used in the WebSubmit module to extract text from DJVU documents. You can continue without it but you will miss some Invenio functionality. We recommend you to install it first and to rerun the configure script. djvutxt is available from .]) fi ## Check for file: AC_PATH_PROG(FILE, file) if test -z "$FILE"; then AC_MSG_WARN([ File was not found in your PATH. It is used in the WebSubmit module to check the validity of the submitted files. You can continue without it but you will miss some Invenio functionality. We recommend you to install it first and to rerun the configure script. File is available from .]) fi ## Check for convert: AC_PATH_PROG(CONVERT, convert) if test -z "$CONVERT"; then AC_MSG_WARN([ Convert was not found in your PATH. It is used in the WebSubmit module to create an icon from a submitted picture. You can continue without it but you will miss some Invenio functionality. We recommend you to install it first and to rerun the configure script.
Convert is available from .]) fi ## Check for CLISP: AC_MSG_CHECKING(for clisp) AC_ARG_WITH(clisp, AC_HELP_STRING([--with-clisp], [path to a specific CLISP binary (optional)]), CLISP=${withval}) if test -n "$CLISP"; then AC_MSG_RESULT($CLISP) else AC_PATH_PROG(CLISP, clisp) if test -z "$CLISP"; then AC_MSG_WARN([ GNU CLISP was not found in your PATH. It is used by the WebStat module to produce statistics about Invenio usage. (Alternatively, SBCL or CMUCL can be used instead of CLISP.) You can continue without it but you will miss this feature. We recommend you to install it first (if you have neither CMUCL nor SBCL) and to rerun the configure script. GNU CLISP is available from .]) fi fi ## Check for CMUCL: AC_MSG_CHECKING(for cmucl) AC_ARG_WITH(cmucl, AC_HELP_STRING([--with-cmucl], [path to a specific CMUCL binary (optional)]), CMUCL=${withval}) if test -n "$CMUCL"; then AC_MSG_RESULT($CMUCL) else AC_PATH_PROG(CMUCL, cmucl) if test -z "$CMUCL"; then AC_MSG_CHECKING(for lisp) # CMUCL can also be installed under `lisp' exec name AC_PATH_PROG(CMUCL, lisp) fi if test -z "$CMUCL"; then AC_MSG_WARN([ CMUCL was not found in your PATH. It is used by the WebStat module to produce statistics about Invenio usage. (Alternatively, CLISP or SBCL can be used instead of CMUCL.) You can continue without it but you will miss this feature. We recommend you to install it first (if you have neither CLISP nor SBCL) and to rerun the configure script. CMUCL is available from .]) fi fi ## Check for SBCL: AC_MSG_CHECKING(for sbcl) AC_ARG_WITH(sbcl, AC_HELP_STRING([--with-sbcl], [path to a specific SBCL binary (optional)]), SBCL=${withval}) if test -n "$SBCL"; then AC_MSG_RESULT($SBCL) else AC_PATH_PROG(SBCL, sbcl) if test -z "$SBCL"; then AC_MSG_WARN([ SBCL was not found in your PATH. It is used by the WebStat module to produce statistics about Invenio usage. (Alternatively, CLISP or CMUCL can be used instead of SBCL.) You can continue without it but you will miss this feature. We recommend you to install it first (if you have neither CLISP nor CMUCL) and to rerun the configure script. SBCL is available from .]) fi fi ## Check for gnuplot: AC_PATH_PROG(GNUPLOT, gnuplot) if test -z "$GNUPLOT"; then AC_MSG_WARN([ Gnuplot was not found in your PATH. It is used by the BibRank module to produce graphs about download and citation history. You can continue without it but you will miss these graphs. We recommend you to install it first and to rerun the configure script. Gnuplot is available from .]) fi ## Check for ffmpeg: AC_PATH_PROG(FFMPEG, ffmpeg) AC_PATH_PROG(FFPROBE, ffprobe) if test -z "$FFMPEG"; then AC_MSG_WARN([ FFmpeg was not found in your PATH. It is used by the BibEncode module for video encoding. You can continue without it, but you will not be able to use BibEncode, and no video submission workflows will be possible. We recommend you to install it first if you would like to support video submissions and to rerun the configure script. FFmpeg is available from .]) fi ## Check for mediainfo: AC_PATH_PROG(MEDIAINFO, mediainfo) if test -z "$MEDIAINFO"; then AC_MSG_WARN([ Mediainfo was not found in your PATH. It is used by the BibEncode module for video encoding and media metadata handling. You can continue without it, but you will not be able to use BibEncode, and no video submission workflows will be possible. We recommend you to install it first if you would like to support video submissions and to rerun the configure script.
Mediainfo is available from .]) fi ## Check for ffmpeg ## Substitute variables: AC_SUBST(VERSION) AC_SUBST(OPENOFFICE_PYTHON) AC_SUBST(MYSQL) AC_SUBST(PYTHON) AC_SUBST(GZIP) AC_SUBST(GUNZIP) AC_SUBST(TAR) AC_SUBST(WGET) AC_SUBST(MD5SUM) AC_SUBST(PS2PDF) AC_SUBST(GS) AC_SUBST(PDFTOTEXT) AC_SUBST(PDFTK) AC_SUBST(PDF2PS) AC_SUBST(PDFTOPS) AC_SUBST(PDFOPT) AC_SUBST(PDFTOPPM) AC_SUBST(OCROSCRIPT) AC_SUBST(PSTOTEXT) AC_SUBST(PSTOASCII) AC_SUBST(ANY2DJVU) AC_SUBST(DJVUPS) AC_SUBST(DJVUTXT) AC_SUBST(FILE) AC_SUBST(CONVERT) AC_SUBST(GNUPLOT) AC_SUBST(CLISP) AC_SUBST(CMUCL) AC_SUBST(SBCL) AC_SUBST(CACHEDIR) AC_SUBST(FFMPEG) AC_SUBST(MEDIAINFO) AC_SUBST(FFPROBE) AC_SUBST(localstatedir, `eval echo "${localstatedir}"`) ## Define output files: AC_CONFIG_FILES([config.nice \ Makefile \ po/Makefile.in \ config/Makefile \ config/invenio-autotools.conf \ modules/Makefile \ modules/bibauthorid/Makefile \ modules/bibauthorid/bin/Makefile \ modules/bibauthorid/bin/bibauthorid \ modules/bibauthorid/doc/Makefile \ modules/bibauthorid/doc/admin/Makefile \ modules/bibauthorid/doc/hacking/Makefile \ modules/bibauthorid/lib/Makefile \ modules/bibauthorid/etc/Makefile \ modules/bibauthorid/etc/name_authority_files/Makefile \ modules/bibauthorid/web/Makefile \ modules/bibcatalog/Makefile \ modules/bibcatalog/doc/Makefile \ modules/bibcatalog/doc/admin/Makefile \ modules/bibcatalog/doc/hacking/Makefile modules/bibcatalog/lib/Makefile \ modules/bibcheck/Makefile \ modules/bibcheck/doc/Makefile \ modules/bibcheck/doc/admin/Makefile \ modules/bibcheck/doc/hacking/Makefile \ modules/bibcheck/etc/Makefile \ modules/bibcheck/web/Makefile \ modules/bibcheck/web/admin/Makefile \ modules/bibcirculation/Makefile \ modules/bibcirculation/bin/Makefile \ modules/bibcirculation/doc/Makefile \ modules/bibcirculation/doc/admin/Makefile \ modules/bibcirculation/doc/hacking/Makefile modules/bibcirculation/lib/Makefile \ modules/bibcirculation/web/Makefile \ modules/bibcirculation/web/admin/Makefile \ modules/bibclassify/Makefile \ modules/bibclassify/bin/Makefile \ modules/bibclassify/bin/bibclassify \ modules/bibclassify/doc/Makefile \ modules/bibclassify/doc/admin/Makefile \ modules/bibclassify/doc/hacking/Makefile \ modules/bibclassify/etc/Makefile \ modules/bibclassify/lib/Makefile \ modules/bibconvert/Makefile \ modules/bibconvert/bin/Makefile \ modules/bibconvert/bin/bibconvert \ modules/bibconvert/doc/Makefile \ modules/bibconvert/doc/admin/Makefile \ modules/bibconvert/doc/hacking/Makefile \ modules/bibconvert/etc/Makefile \ modules/bibconvert/lib/Makefile \ modules/bibedit/Makefile \ modules/bibedit/bin/Makefile \ modules/bibedit/bin/bibedit \ modules/bibedit/bin/refextract \ modules/bibedit/bin/xmlmarc2textmarc \ modules/bibedit/bin/textmarc2xmlmarc \ modules/bibedit/bin/xmlmarclint \ modules/bibedit/doc/Makefile \ modules/bibedit/doc/admin/Makefile \ modules/bibedit/doc/hacking/Makefile \ modules/bibedit/etc/Makefile \ modules/bibedit/lib/Makefile \ modules/bibedit/web/Makefile \ modules/bibencode/Makefile \ modules/bibencode/bin/Makefile \ modules/bibencode/bin/bibencode \ modules/bibencode/lib/Makefile \ modules/bibencode/etc/Makefile \ modules/bibencode/www/Makefile \ modules/bibexport/Makefile \ modules/bibexport/bin/Makefile \ modules/bibexport/bin/bibexport \ modules/bibexport/doc/Makefile \ modules/bibexport/doc/admin/Makefile \ modules/bibexport/doc/hacking/Makefile modules/bibexport/etc/Makefile \ modules/bibexport/lib/Makefile \ modules/bibexport/web/Makefile \ modules/bibexport/web/admin/Makefile \ 
modules/bibformat/Makefile \ modules/bibformat/bin/Makefile \ modules/bibformat/bin/bibreformat \ modules/bibformat/doc/Makefile \ modules/bibformat/doc/admin/Makefile \ modules/bibformat/doc/hacking/Makefile \ modules/bibformat/etc/Makefile \ modules/bibformat/etc/format_templates/Makefile \ modules/bibformat/etc/output_formats/Makefile \ modules/bibformat/lib/Makefile \ modules/bibformat/lib/elements/Makefile \ modules/bibformat/web/Makefile \ modules/bibformat/web/admin/Makefile \ modules/bibharvest/Makefile \ modules/bibharvest/bin/Makefile \ modules/bibharvest/bin/oairepositoryupdater \ modules/bibharvest/bin/oaiharvest \ modules/bibharvest/doc/Makefile \ modules/bibharvest/doc/admin/Makefile \ modules/bibharvest/doc/hacking/Makefile \ + modules/bibharvest/etc/Makefile \ modules/bibharvest/lib/Makefile \ modules/bibharvest/web/Makefile \ modules/bibharvest/web/admin/Makefile \ modules/bibindex/Makefile \ modules/bibindex/bin/Makefile \ modules/bibindex/bin/bibindex \ modules/bibindex/bin/bibstat \ modules/bibindex/doc/Makefile \ modules/bibindex/doc/admin/Makefile \ modules/bibindex/doc/hacking/Makefile \ modules/bibindex/lib/Makefile \ modules/bibindex/web/Makefile \ modules/bibindex/web/admin/Makefile \ modules/bibknowledge/Makefile \ modules/bibknowledge/lib/Makefile \ modules/bibknowledge/doc/Makefile \ modules/bibknowledge/doc/admin/Makefile \ modules/bibknowledge/doc/hacking/Makefile \ modules/bibmatch/Makefile \ modules/bibmatch/bin/Makefile \ modules/bibmatch/bin/bibmatch \ modules/bibmatch/doc/Makefile \ modules/bibmatch/doc/admin/Makefile \ modules/bibmatch/etc/Makefile \ modules/bibmatch/lib/Makefile \ modules/bibmerge/Makefile \ modules/bibmerge/bin/Makefile \ modules/bibmerge/doc/Makefile \ modules/bibmerge/doc/admin/Makefile \ modules/bibmerge/doc/hacking/Makefile \ modules/bibmerge/lib/Makefile \ modules/bibmerge/web/Makefile \ modules/bibmerge/web/admin/Makefile \ modules/bibrank/Makefile \ modules/bibrank/bin/Makefile \ modules/bibrank/bin/bibrank \ modules/bibrank/bin/bibrankgkb \ modules/bibrank/doc/Makefile \ modules/bibrank/doc/admin/Makefile \ modules/bibrank/doc/hacking/Makefile \ modules/bibrank/etc/Makefile \ modules/bibrank/etc/bibrankgkb.cfg \ modules/bibrank/etc/demo_jif.cfg \ modules/bibrank/etc/template_single_tag_rank_method.cfg \ modules/bibrank/lib/Makefile \ modules/bibrank/web/Makefile \ modules/bibrank/web/admin/Makefile \ modules/bibsched/Makefile \ modules/bibsched/bin/Makefile \ modules/bibsched/bin/bibsched \ modules/bibsched/bin/bibtaskex \ modules/bibsched/bin/bibtasklet \ modules/bibsched/doc/Makefile \ modules/bibsched/doc/admin/Makefile \ modules/bibsched/doc/hacking/Makefile \ modules/bibsched/lib/Makefile \ modules/bibsched/lib/tasklets/Makefile \ modules/bibupload/Makefile \ modules/bibsword/Makefile \ modules/bibsword/bin/Makefile \ modules/bibsword/bin/bibsword \ modules/bibsword/doc/Makefile \ modules/bibsword/doc/admin/Makefile \ modules/bibsword/doc/hacking/Makefile \ modules/bibsword/lib/Makefile \ modules/bibsword/etc/Makefile \ modules/bibupload/bin/Makefile \ modules/bibupload/bin/bibupload \ modules/bibupload/bin/batchuploader \ modules/bibupload/doc/Makefile \ modules/bibupload/doc/admin/Makefile \ modules/bibupload/doc/hacking/Makefile \ modules/bibupload/lib/Makefile \ modules/elmsubmit/Makefile \ modules/elmsubmit/bin/Makefile \ modules/elmsubmit/bin/elmsubmit \ modules/elmsubmit/doc/Makefile \ modules/elmsubmit/doc/admin/Makefile \ modules/elmsubmit/doc/hacking/Makefile \ modules/elmsubmit/etc/Makefile \ 
modules/elmsubmit/etc/elmsubmit.cfg \ modules/elmsubmit/lib/Makefile \ modules/miscutil/Makefile \ modules/miscutil/bin/Makefile \ modules/miscutil/bin/dbdump \ modules/miscutil/bin/dbexec \ modules/miscutil/bin/inveniocfg \ modules/miscutil/bin/plotextractor \ modules/miscutil/demo/Makefile \ modules/miscutil/doc/Makefile \ modules/miscutil/doc/hacking/Makefile \ modules/miscutil/etc/Makefile \ modules/miscutil/etc/bash_completion.d/Makefile \ modules/miscutil/etc/bash_completion.d/inveniocfg \ modules/miscutil/etc/ckeditor_scientificchar/Makefile \ modules/miscutil/etc/ckeditor_scientificchar/dialogs/Makefile \ modules/miscutil/etc/ckeditor_scientificchar/lang/Makefile \ modules/miscutil/lib/Makefile \ modules/miscutil/sql/Makefile \ modules/miscutil/web/Makefile \ modules/webaccess/Makefile \ modules/webaccess/bin/Makefile \ modules/webaccess/bin/authaction \ modules/webaccess/bin/webaccessadmin \ modules/webaccess/doc/Makefile \ modules/webaccess/doc/admin/Makefile \ modules/webaccess/doc/hacking/Makefile \ modules/webaccess/lib/Makefile \ modules/webaccess/web/Makefile \ modules/webaccess/web/admin/Makefile \ modules/webalert/Makefile \ modules/webalert/bin/Makefile \ modules/webalert/bin/alertengine \ modules/webalert/doc/Makefile \ modules/webalert/doc/admin/Makefile \ modules/webalert/doc/hacking/Makefile \ modules/webalert/lib/Makefile \ modules/webalert/web/Makefile \ modules/webbasket/Makefile \ modules/webbasket/doc/Makefile \ modules/webbasket/doc/admin/Makefile \ modules/webbasket/doc/hacking/Makefile \ modules/webbasket/lib/Makefile \ modules/webbasket/web/Makefile \ modules/webcomment/Makefile \ modules/webcomment/doc/Makefile \ modules/webcomment/doc/admin/Makefile \ modules/webcomment/doc/hacking/Makefile \ modules/webcomment/lib/Makefile \ modules/webcomment/web/Makefile \ modules/webcomment/web/admin/Makefile \ modules/webhelp/Makefile \ modules/webhelp/web/Makefile \ modules/webhelp/web/admin/Makefile \ modules/webhelp/web/admin/howto/Makefile \ modules/webhelp/web/hacking/Makefile \ modules/webjournal/Makefile \ modules/webjournal/etc/Makefile \ modules/webjournal/doc/Makefile \ modules/webjournal/doc/admin/Makefile \ modules/webjournal/doc/hacking/Makefile \ modules/webjournal/lib/Makefile \ modules/webjournal/lib/elements/Makefile \ modules/webjournal/lib/widgets/Makefile \ modules/webjournal/web/Makefile \ modules/webjournal/web/admin/Makefile \ modules/webmessage/Makefile \ modules/webmessage/bin/Makefile \ modules/webmessage/bin/webmessageadmin \ modules/webmessage/doc/Makefile \ modules/webmessage/doc/admin/Makefile \ modules/webmessage/doc/hacking/Makefile \ modules/webmessage/lib/Makefile \ modules/webmessage/web/Makefile \ modules/websearch/Makefile \ modules/websearch/bin/Makefile \ modules/websearch/bin/webcoll \ modules/websearch/doc/Makefile \ modules/websearch/doc/admin/Makefile \ modules/websearch/doc/hacking/Makefile \ modules/websearch/lib/Makefile \ modules/websearch/web/Makefile \ modules/websearch/web/admin/Makefile \ modules/websession/Makefile \ modules/websession/bin/Makefile \ modules/websession/bin/inveniogc \ modules/websession/doc/Makefile \ modules/websession/doc/admin/Makefile \ modules/websession/doc/hacking/Makefile \ modules/websession/lib/Makefile \ modules/websession/web/Makefile \ modules/webstat/Makefile \ modules/webstat/bin/Makefile \ modules/webstat/bin/webstat \ modules/webstat/bin/webstatadmin \ modules/webstat/doc/Makefile \ modules/webstat/doc/admin/Makefile \ modules/webstat/doc/hacking/Makefile \ 
modules/webstat/etc/Makefile \ modules/webstat/lib/Makefile \ modules/webstyle/Makefile \ modules/webstyle/bin/Makefile \ modules/webstyle/bin/webdoc \ modules/webstyle/css/Makefile \ modules/webstyle/doc/Makefile \ modules/webstyle/doc/admin/Makefile \ modules/webstyle/doc/hacking/Makefile \ modules/webstyle/etc/Makefile \ modules/webstyle/img/Makefile \ modules/webstyle/lib/Makefile \ modules/websubmit/Makefile \ modules/websubmit/bin/Makefile \ modules/websubmit/bin/bibdocfile \ modules/websubmit/bin/inveniounoconv \ modules/websubmit/doc/Makefile \ modules/websubmit/doc/admin/Makefile \ modules/websubmit/doc/hacking/Makefile \ modules/websubmit/etc/Makefile \ modules/websubmit/lib/Makefile \ modules/websubmit/lib/functions/Makefile \ modules/websubmit/web/Makefile \ modules/websubmit/web/admin/Makefile \ ]) ## Finally, write output files: AC_OUTPUT ## Write help: AC_MSG_RESULT([****************************************************************************]) AC_MSG_RESULT([** Your Invenio installation is now ready for building. **]) AC_MSG_RESULT([** You have entered the following parameters: **]) AC_MSG_RESULT([** - Invenio main install directory: ${prefix}]) AC_MSG_RESULT([** - Python executable: $PYTHON]) AC_MSG_RESULT([** - MySQL client executable: $MYSQL]) AC_MSG_RESULT([** - CLISP executable: $CLISP]) AC_MSG_RESULT([** - CMUCL executable: $CMUCL]) AC_MSG_RESULT([** - SBCL executable: $SBCL]) AC_MSG_RESULT([** Here are the steps to continue the building process: **]) AC_MSG_RESULT([** 1) Type 'make' to build your Invenio system. **]) AC_MSG_RESULT([** 2) Type 'make install' to install your Invenio system. **]) AC_MSG_RESULT([** After that you can start customizing your installation as documented **]) AC_MSG_RESULT([** in the INSTALL file (i.e. edit invenio.conf, run inveniocfg, etc). **]) AC_MSG_RESULT([** Good luck, and thanks for choosing Invenio. **]) AC_MSG_RESULT([** -- Invenio Development Team **]) AC_MSG_RESULT([****************************************************************************]) ## end of file diff --git a/modules/bibconvert/etc/oaiarxiv2marcxml.xsl b/modules/bibconvert/etc/oaiarxiv2marcxml.xsl index 2d3f6c61e..58ccebdee 100644 --- a/modules/bibconvert/etc/oaiarxiv2marcxml.xsl +++ b/modules/bibconvert/etc/oaiarxiv2marcxml.xsl @@ -1,1033 +1,1049 @@ abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ LANL EDS arXiv arXiv CERN ; CERN , , , giva a faire accepted@appear@press@publ@review@submitted"> - - - - + + + + + + + + + + false + arXiv + DELETED SzGeCERN - arXiv - + + + + + + + + + false + arXiv eng TH- CERN Library PH-TH PH-EP- CERN Library PH-EP http://arxiv.org/pdf/.pdf p mult. 
p Comments: LANL EDS ARTICLE ARTICLE 13 Thesis THESIS THESIS 14 PREPRINT PREPRINT 11 diff --git a/modules/bibconvert/etc/oaidc2marcxml.xsl b/modules/bibconvert/etc/oaidc2marcxml.xsl index 25e4552f6..12bbd512d 100644 --- a/modules/bibconvert/etc/oaidc2marcxml.xsl +++ b/modules/bibconvert/etc/oaidc2marcxml.xsl @@ -1,191 +1,221 @@ - + - - - - - - - - - - - - - - - - DELETED - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + false + + + + DELETED + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + false + + + + - PREPRINT - - - - - - - - - - + the right source data. --> + PREPRINT + + + + + + + diff --git a/modules/bibconvert/etc/oaimarc2marcxml.xsl b/modules/bibconvert/etc/oaimarc2marcxml.xsl index 6536173d9..f80d0f898 100644 --- a/modules/bibconvert/etc/oaimarc2marcxml.xsl +++ b/modules/bibconvert/etc/oaimarc2marcxml.xsl @@ -1,116 +1,136 @@ - + - - - - - - - - - - - - - - - - - DELETED - - - - - - - + + + + + + + + + + + + + false + + + + DELETED + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + --> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + false + + + + + + + + diff --git a/modules/bibconvert/lib/bibconvert_xslt_engine.py b/modules/bibconvert/lib/bibconvert_xslt_engine.py index c9e15db11..81f33eff3 100644 --- a/modules/bibconvert/lib/bibconvert_xslt_engine.py +++ b/modules/bibconvert/lib/bibconvert_xslt_engine.py @@ -1,279 +1,317 @@ # -*- coding: utf-8 -*- ## ## This file is part of Invenio. ## Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ bibconvert_xslt_engine - Wrapper for an XSLT engine. Customized to support BibConvert functions through the use of XPath 'format' function. 
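A typical invocation is sketched below (it assumes that ``xmltext`` holds a MARCXML string and that the oaidc2marcxml.xsl template shipped with Invenio is installed): >>> from invenio.bibconvert_xslt_engine import convert >>> result = convert(xmltext, 'oaidc2marcxml.xsl')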
Dependencies: Need one of the following XSLT processors: - libxml2 & libxslt - 4suite Used by: bibconvert.in FIXME: - Find better namespace for functions - Find less bogus URI (given as param to processor) for source and template - Implement command-line options - Think about better handling of 'value' parameter in bibconvert_function_* """ __revision__ = "$Id$" import sys import os +from warnings import warn + from invenio.config import \ CFG_ETCDIR, \ CFG_SITE_URL from invenio.bibconvert import FormatField +from invenio.textutils import encode_for_xml # The namespace used for BibConvert functions CFG_BIBCONVERT_FUNCTION_NS = "http://cdsweb.cern.ch/bibconvert/fn" # Import one XSLT processor # # processor_type: # -1 : No processor found # 0 : libxslt # 1 : 4suite processor_type = -1 try: # libxml2 & libxslt import libxml2 import libxslt processor_type = 0 except ImportError: pass if processor_type == -1: try: # 4suite from Ft.Xml.Xslt import Processor, XsltException from Ft.Xml import InputSource from xml.dom import Node processor_type = 1 except ImportError: pass CFG_BIBCONVERT_XSL_PATH = "%s%sbibconvert%sconfig" % (CFG_ETCDIR, os.sep, os.sep) -def bibconvert_function_libxslt(ctx, value, func): +def bibconvert_function_libxslt(dummy_ctx, value, func): """ libxslt extension function: Bridge between BibConvert formatting functions and XSL stylesheets. Can be used in that way in XSL stylesheet (provided xmlns:fn="http://cdsweb.cern.ch/bibconvert/fn" has been declared): (Adds strings 'mypref' and 'mysuff' as prefix/suffix to current node value, using BibConvert ADD function) if value is int, value is converted to string if value is Node (PyCObj), first child node (text node) is taken as value """ try: if isinstance(value, str): string_value = value elif isinstance(value, (int, long)): string_value = str(value) else: string_value = libxml2.xmlNode(_obj=value[0]).children.content return FormatField(string_value, func).rstrip('\n') except Exception, err: sys.stderr.write("Error during formatting function evaluation: " + \ str(err) + \ '\n') return '' +def bibconvert_escape_libxslt(dummy_ctx, value): + """ + Bridge to libxslt to escape the provided value. + """ + try: + if isinstance(value, str): + string_value = value + elif isinstance(value, (int, long)): + string_value = str(value) + else: + string_value = libxml2.xmlNode(_obj=value[0]).serialize('utf8') + + return encode_for_xml(string_value) -def bibconvert_function_4suite(ctx, value, func): + except Exception, err: + sys.stderr.write("Error during formatting function evaluation: " + \ + str(err) + \ + '\n') + + return '' + + +def bibconvert_function_4suite(dummy_ctx, value, func): """ 4suite extension function: Bridge between BibConvert formatting functions and XSL stylesheets. 
Can be used in that way in XSL stylesheet (provided xmlns:fn="http://cdsweb.cern.ch/bibconvert/fn" has been declared): (Adds strings 'mypref' and 'mysuff' as prefix/suffix to current node value, using BibConvert ADD function) if value is int, value is converted to string if value is Node, first child node (text node) is taken as value """ try: if len(value) > 0 and isinstance(value[0], Node): string_value = value[0].firstChild.nodeValue if string_value is None: string_value = '' else: string_value = str(value) return FormatField(string_value, func).rstrip('\n') except Exception, err: sys.stderr.write("Error during formatting function evaluation: " + \ str(err) + \ '\n') return '' +def bibconvert_escape_4suite(dummy_ctx, value): + """ + Bridge to 4suite to escape the provided value. + """ + ##FIXME: this does not work with 4suite. How one does serialize a Node? + warn("Invenio fn:escape is currently broken, when using 4suite") + return value + def convert(xmltext, template_filename=None, template_source=None): """ Processes an XML text according to a template, and returns the result. The template can be given either by name (or by path) or by source. If source is given, name is ignored. bibconvert_xslt_engine will look for template_filename in standard directories for templates. If not found, template_filename will be assumed to be a path to a template. If none can be found, return None. Raises an exception if cannot find an appropriate XSLT processor. @param xmltext: The string representation of the XML to process @param template_filename: The name of the template to use for the processing @param template_source: The configuration describing the processing. @return: the transformed XML text, or None if an error occured """ if processor_type == -1: # No XSLT processor found raise "No XSLT processor could be found" # Retrieve template and read it if template_source: template = template_source elif template_filename: try: path_to_templates = (CFG_BIBCONVERT_XSL_PATH + os.sep + template_filename) if os.path.exists(path_to_templates): template = file(path_to_templates).read() elif os.path.exists(template_filename): template = file(template_filename).read() else: sys.stderr.write(template_filename +' does not exist.') return None except IOError: sys.stderr.write(template_filename +' could not be read.') return None else: sys.stderr.write(template_filename +' was not given.') return None result = "" if processor_type == 0: # libxml2 & libxslt # Register BibConvert functions for use in XSL libxslt.registerExtModuleFunction("format", CFG_BIBCONVERT_FUNCTION_NS, bibconvert_function_libxslt) + libxslt.registerExtModuleFunction("escape", + CFG_BIBCONVERT_FUNCTION_NS, + bibconvert_escape_libxslt) # Load template and source try: template_xml = libxml2.parseDoc(template) except libxml2.parserError, e: sys.stderr.write('Parsing XSL template failed:\n ' + \ str(e) + '\n') return None processor = libxslt.parseStylesheetDoc(template_xml) try: source = libxml2.parseDoc(xmltext) except libxml2.parserError, e: sys.stderr.write('Parsing XML source failed:\n ' + \ str(e) + '\n') return None # Transform result_object = processor.applyStylesheet(source, None) result = processor.saveResultToString(result_object) # Deallocate processor.freeStylesheet() source.freeDoc() result_object.freeDoc() elif processor_type == 1: # 4suite # Init processor = Processor.Processor() # Register BibConvert functions for use in XSL processor.registerExtensionFunction(CFG_BIBCONVERT_FUNCTION_NS, "format", bibconvert_function_4suite) + 
processor.registerExtensionFunction(CFG_BIBCONVERT_FUNCTION_NS, + "escape", + bibconvert_escape_4suite) # Load template and source transform = InputSource.DefaultFactory.fromString(template, uri=CFG_SITE_URL) source = InputSource.DefaultFactory.fromString(xmltext, uri=CFG_SITE_URL) try: processor.appendStylesheet(transform) except XsltException, e: sys.stderr.write('Parsing XSL template failed:\n' + str(e)) return None # Transform try: result = processor.run(source) except XsltException, e: sys.stderr.write('Conversion failed:\n' + str(e)) return None else: sys.stderr.write("No XSLT processor could be found") return result ## def bc_profile(): ## """ ## Runs a benchmark ## """ ## global xmltext ## convert(xmltext, 'oaidc2marcxml.xsl') ## return ## def benchmark(): ## """ ## Benchmark the module, using profile and pstats ## """ ## import profile ## import pstats ## from invenio.bibformat import record_get_xml ## global xmltext ## xmltext = record_get_xml(10, 'oai_dc') ## profile.run('bc_profile()', "bibconvert_xslt_profile") ## p = pstats.Stats("bibconvert_xslt_profile") ## p.strip_dirs().sort_stats("cumulative").print_stats() if __name__ == "__main__": pass diff --git a/modules/bibformat/etc/format_templates/Makefile.am b/modules/bibformat/etc/format_templates/Makefile.am index cdf7dd94f..71e964eb9 100644 --- a/modules/bibformat/etc/format_templates/Makefile.am +++ b/modules/bibformat/etc/format_templates/Makefile.am @@ -1,40 +1,40 @@ ## This file is part of Invenio. ## Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
etcdir = $(sysconfdir)/bibformat/format_templates etc_DATA = Default_HTML_captions.bft Picture_HTML_brief.bft \ Default_HTML_detailed.bft Default_HTML_portfolio.bft \ Picture_HTML_detailed.bft Default_HTML_brief.bft \ BibTeX.bft MARCXML.bft Excel.bft \ Default_HTML_similarity.bft NLM.xsl \ - OAI_DC.xsl DC.xsl EndNote.xsl RSS.xsl \ + OAI_DC.xsl OAI_MARC.bft DC.xsl EndNote.xsl RSS.xsl \ RefWorks.xsl MODS.xsl \ Default_HTML_references.bft Default_HTML_files.bft \ Default_HTML_actions.bft Journal_HTML_detailed.bft \ Journal_HTML_brief.bft \ - Poetry_HTML_brief.bft Poetry_HTML_detailed.bft \ - AID_HTML_very_brief.bft Podcast.xsl \ - Video_HTML_brief.bft Video_HTML_detailed.bft + Poetry_HTML_brief.bft Poetry_HTML_detailed.bft \ + AID_HTML_very_brief.bft Podcast.xsl \ + Video_HTML_brief.bft Video_HTML_detailed.bft tmpdir = $(prefix)/var/tmp tmp_DATA = Test1.bft Test3.bft Test_2.bft Test_no_template.test EXTRA_DIST = $(etc_DATA) $(tmp_DATA) CLEANFILES = *.tmp diff --git a/modules/bibformat/etc/format_templates/OAI_MARC.bft b/modules/bibformat/etc/format_templates/OAI_MARC.bft new file mode 100644 index 000000000..1d7d1fe57 --- /dev/null +++ b/modules/bibformat/etc/format_templates/OAI_MARC.bft @@ -0,0 +1,3 @@ +OAI MARC +Standard MARC XML output suitable for embed in OAI-PMH responses + diff --git a/modules/bibformat/etc/output_formats/Makefile.am b/modules/bibformat/etc/output_formats/Makefile.am index 3864bd3c3..01fd35d8a 100644 --- a/modules/bibformat/etc/output_formats/Makefile.am +++ b/modules/bibformat/etc/output_formats/Makefile.am @@ -1,32 +1,32 @@ ## This file is part of Invenio. ## Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. etcdir = $(sysconfdir)/bibformat/output_formats etc_DATA = HB.bfo HC.bfo HD.bfo HP.bfo HX.bfo XM.bfo EXCEL.bfo \ XD.bfo HS.bfo HA.bfo \ XE.bfo XN.bfo XR.bfo XW.bfo \ - XOAIDC.bfo XO.bfo \ + XOAIDC.bfo XO.bfo XOAIMARC.bfo \ HDREF.bfo HDFILE.bfo HDACT.bfo XP.bfo tmpdir = $(prefix)/var/tmp tmp_DATA = TEST1.bfo TEST2.bfo TEST3.bfo EXTRA_DIST = $(etc_DATA) $(tmp_DATA) CLEANFILES = *.tmp diff --git a/modules/bibformat/etc/output_formats/XOAIMARC.bfo b/modules/bibformat/etc/output_formats/XOAIMARC.bfo new file mode 100644 index 000000000..f3cd4655f --- /dev/null +++ b/modules/bibformat/etc/output_formats/XOAIMARC.bfo @@ -0,0 +1 @@ +default: OAI_MARC.bft diff --git a/modules/bibformat/lib/bibformat_engine.py b/modules/bibformat/lib/bibformat_engine.py index bd8f17d0d..6f530033f 100644 --- a/modules/bibformat/lib/bibformat_engine.py +++ b/modules/bibformat/lib/bibformat_engine.py @@ -1,2111 +1,2111 @@ # -*- coding: utf-8 -*- ## ## This file is part of Invenio. ## Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 CERN. 
## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ Formats a single XML Marc record using specified format. There is no API for the engine. Instead use module L{bibformat}. You can have a look at the various escaping modes available in X{BibFormatObject} in function L{escape_field} Still it is useful sometimes for debugging purpose to use the L{BibFormatObject} class directly. For eg: >>> from invenio.bibformat_engine import BibFormatObject >>> bfo = BibFormatObject(102) >>> bfo.field('245__a') The order Rodentia in South America >>> from invenio.bibformat_elements import bfe_title >>> bfe_title.format_element(bfo) The order Rodentia in South America @see: bibformat.py, bibformat_utils.py """ __revision__ = "$Id$" import re import sys import os import inspect import traceback import zlib import cgi from invenio.config import \ CFG_PATH_PHP, \ CFG_BINDIR, \ CFG_SITE_LANG from invenio.errorlib import \ register_errors, \ get_msgs_for_code_list from invenio.bibrecord import \ create_record, \ record_get_field_instances, \ record_get_field_value, \ record_get_field_values, \ record_xml_output from invenio.bibformat_xslt_engine import format from invenio.dbquery import run_sql from invenio.messages import \ language_list_long, \ wash_language, \ gettext_set_language from invenio import bibformat_dblayer from invenio.bibformat_config import \ CFG_BIBFORMAT_FORMAT_TEMPLATE_EXTENSION, \ CFG_BIBFORMAT_FORMAT_OUTPUT_EXTENSION, \ CFG_BIBFORMAT_TEMPLATES_PATH, \ CFG_BIBFORMAT_ELEMENTS_PATH, \ CFG_BIBFORMAT_OUTPUTS_PATH, \ CFG_BIBFORMAT_ELEMENTS_IMPORT_PATH from invenio.bibformat_utils import \ record_get_xml, \ parse_tag from invenio.htmlutils import \ HTMLWasher, \ - cfg_html_buffer_allowed_tag_whitelist, \ - cfg_html_buffer_allowed_attribute_whitelist + CFG_HTML_BUFFER_ALLOWED_TAG_WHITELIST, \ + CFG_HTML_BUFFER_ALLOWED_ATTRIBUTE_WHITELIST from invenio.webuser import collect_user_info from invenio.bibknowledge import get_kbr_values from HTMLParser import HTMLParseError if CFG_PATH_PHP: #Remove when call_old_bibformat is removed from xml.dom import minidom import tempfile # Cache for data we have already read and parsed format_templates_cache = {} format_elements_cache = {} format_outputs_cache = {} html_field = '' # String indicating that field should be # treated as HTML (and therefore no escaping of # HTML tags should occur. # Appears in some field values. washer = HTMLWasher() # Used to remove dangerous tags from HTML # sources # Regular expression for finding ... tag in format templates pattern_lang = re.compile(r''' #closing start tag (?P.*?) 
#anything but the next group (greedy) () #end tag ''', re.IGNORECASE | re.DOTALL | re.VERBOSE) # Builds regular expression for finding each known language in tags ln_pattern_text = r"<(" for lang in language_list_long(enabled_langs_only=False): ln_pattern_text += lang[0] +r"|" ln_pattern_text = ln_pattern_text.rstrip(r"|") ln_pattern_text += r")>(.*?)" ln_pattern = re.compile(ln_pattern_text, re.IGNORECASE | re.DOTALL) # Regular expression for finding text to be translated translation_pattern = re.compile(r'_\((?P.*?)\)_', \ re.IGNORECASE | re.DOTALL | re.VERBOSE) # Regular expression for finding tag in format templates pattern_format_template_name = re.compile(r''' #closing start tag (?P.*?) #name value. any char that is not end tag ()(\n)? #end tag ''', re.IGNORECASE | re.DOTALL | re.VERBOSE) # Regular expression for finding tag in format templates pattern_format_template_desc = re.compile(r''' #closing start tag (?P.*?) #description value. any char that is not end tag (\n)? #end tag ''', re.IGNORECASE | re.DOTALL | re.VERBOSE) # Regular expression for finding tags in format templates pattern_tag = re.compile(r''' [^/\s]+) #any char but a space or slash \s* #any number of spaces (?P(\s* #params here (?P([^=\s])*)\s* #param name: any chars that is not a white space or equality. Followed by space(s) =\s* #equality: = followed by any number of spaces (?P[\'"]) #one of the separators (?P.*?) #param value: any chars that is not a separator like previous one (?P=sep) #same separator as starting one )*) #many params \s* #any number of spaces (/)?> #end of the tag ''', re.IGNORECASE | re.DOTALL | re.VERBOSE) # Regular expression for finding params inside tags in format templates pattern_function_params = re.compile(''' (?P([^=\s])*)\s* # Param name: any chars that is not a white space or equality. Followed by space(s) =\s* # Equality: = followed by any number of spaces (?P[\'"]) # One of the separators (?P.*?) # Param value: any chars that is not a separator like previous one (?P=sep) # Same separator as starting one ''', re.VERBOSE | re.DOTALL ) # Regular expression for finding format elements "params" attributes # (defined by @param) pattern_format_element_params = re.compile(''' @param\s* # Begins with AT param keyword followed by space(s) (?P[^\s=]*):\s* # A single keyword and comma, then space(s) #(=\s*(?P[\'"]) # Equality, space(s) and then one of the separators #(?P.*?) # Default value: any chars that is not a separator like previous one #(?P=sep) # Same separator as starting one #)?\s* # Default value for param is optional. Followed by space(s) (?P.*) # Any text that is not end of line (thanks to MULTILINE parameter) ''', re.VERBOSE | re.MULTILINE) # Regular expression for finding format elements "see also" attribute # (defined by @see) pattern_format_element_seealso = re.compile('''@see:\s*(?P.*)''', re.VERBOSE | re.MULTILINE) #Regular expression for finding 2 expressions in quotes, separated by #comma (as in template("1st","2nd") ) #Used when parsing output formats ## pattern_parse_tuple_in_quotes = re.compile(''' ## (?P[\'"]) ## (?P.*) ## (?P=sep1) ## \s*,\s* ## (?P[\'"]) ## (?P.*) ## (?P=sep2) ## ''', re.VERBOSE | re.MULTILINE) def call_old_bibformat(recID, of="HD", on_the_fly=False, verbose=0): """ FIXME: REMOVE FUNCTION WHEN MIGRATION IS DONE Calls BibFormat for the record RECID in the desired output format 'of'. 
Note: this function always tries to return HTML, so when bibformat returns XML with embedded HTML format inside the tag FMT $g, as is suitable for prestoring output formats, we perform un-XML-izing here in order to return the HTML body only. @param recID: record ID to format @param of: output format to be used for formatting @param on_the_fly: if False, try to return an already preformatted version of the record in the database @param verbose: verbosity @return: a formatted output using old BibFormat """ out = "" res = [] if not on_the_fly: # look for formatted record existence: query = "SELECT value, last_updated FROM bibfmt WHERE "\ "id_bibrec='%s' AND format='%s'" % (recID, of) res = run_sql(query, None, 1) if res: # record 'recID' is formatted in 'of', so print it if verbose == 9: last_updated = res[0][1] out += """\n
Found preformatted output for record %i (cache updated on %s). """ % (recID, last_updated) decompress = zlib.decompress return "%s" % decompress(res[0][0]) else: # record 'recID' is not formatted in 'of', # so try to call BibFormat on the fly or use default format: if verbose == 9: out += """\n
Formatting record %i on-the-fly with old BibFormat. """ % recID # Retrieve MARCXML # Build it on-the-fly only if 'call_old_bibformat' was called # with format=xm and on_the_fly=True xm_record = record_get_xml(recID, 'xm', on_the_fly=(on_the_fly and of == 'xm')) ## import platform ## # Some problems have been found using either popen() or os.system(). ## # Here is a temporary workaround until the issue is solved. ## if platform.python_compiler().find('Red Hat') > -1: ## # use os.system (result_code, result_path) = tempfile.mkstemp() command = "( %s/bibformat otype=%s ) > %s" % \ (CFG_BINDIR, of, result_path) (xm_code, xm_path) = tempfile.mkstemp() xm_file = open(xm_path, "w") xm_file.write(xm_record) xm_file.close() command = command + " <" + xm_path os.system(command) result_file = open(result_path,"r") bibformat_output = result_file.read() result_file.close() os.close(result_code) os.remove(result_path) os.close(xm_code) os.remove(xm_path) ## else: ## # use popen ## pipe_input, pipe_output, pipe_error = os.popen3(["%s/bibformat" % CFG_BINDIR, ## "otype=%s" % format], ## 'rw') ## pipe_input.write(xm_record) ## pipe_input.flush() ## pipe_input.close() ## bibformat_output = pipe_output.read() ## pipe_output.close() ## pipe_error.close() if bibformat_output.startswith(""): dom = minidom.parseString(bibformat_output) for e in dom.getElementsByTagName('subfield'): if e.getAttribute('code') == 'g': for t in e.childNodes: out += t.data.encode('utf-8') else: out += bibformat_output return out def format_record(recID, of, ln=CFG_SITE_LANG, verbose=0, search_pattern=None, xml_record=None, user_info=None): """ Formats a record given an output format. Main entry function of the bibformat engine. Returns a formatted version of the record in the specified language, search pattern, and with the specified output format. The function will define which format template must be applied. You can either specify a record ID to format, or give its XML representation. If 'xml_record' is not None, then use it instead of recID. 'user_info' allows granting access to some functionalities on a page depending on the user's privileges. 'user_info' is the same object as the one returned by 'webuser.collect_user_info(req)' @param recID: the ID of record to format @param of: an output format code (or short identifier for the output format) @param ln: the language to use to format the record @param verbose: the level of verbosity from 0 to 9 (0: silent, 5: errors, 7: errors and warnings, stop if error in format elements, 9: errors and warnings, stop if error (debug mode)) @param search_pattern: list of strings representing the user request in web interface @param xml_record: an XML string representing the record to format @param user_info: the information of the user who will view the formatted page @return: formatted record """ if search_pattern is None: search_pattern = [] out = "" errors_ = [] # Temporary workflow (during migration of formats): # Call new BibFormat # But if format not found for new BibFormat, then call old BibFormat # Create a BibFormatObject to pass around, containing the record and context bfo = BibFormatObject(recID, ln, search_pattern, xml_record, user_info, of) if of.lower() != 'xm' and \ (not bfo.get_record() or len(bfo.get_record()) <= 1): # Record only has recid: do not format, except # for the xm format return "" # Find out which format template to use based on the record and output format. template = decide_format_template(bfo, of) if verbose == 9 and template is not None: out += """\n
Using %s template for record %i.
""" % (template, recID)

    ############### FIXME: REMOVE WHEN MIGRATION IS DONE ###############
    path = "%s%s%s" % (CFG_BIBFORMAT_TEMPLATES_PATH, os.sep, template)
    if template is None or not os.access(path, os.R_OK):
        # template not found in new BibFormat. Call old one
        if verbose == 9:
            if template is None:
                out += """\n
No template found for output format %s and record %i.
(Check invenio.err log file for more details)
""" % (of, recID)
            else:
                out += """\n
Template %s could not be read.
""" % (template)
        if CFG_PATH_PHP:
            if verbose == 9:
                out += """\n
Using old BibFormat for record %s. """ % recID return out + call_old_bibformat(recID, of=of, on_the_fly=True, verbose=verbose) ############################# END ################################## error = get_msgs_for_code_list([("ERR_BIBFORMAT_NO_TEMPLATE_FOUND", of)], stream='error', ln=CFG_SITE_LANG) errors_.append(error) if verbose == 0: register_errors(error, 'error') elif verbose > 5: return out + error[0][1] return out # Format with template (out_, errors) = format_with_format_template(template, bfo, verbose) errors_.extend(errors) out += out_ return out def decide_format_template(bfo, of): """ Returns the format template name that should be used for formatting given output format and L{BibFormatObject}. Look at of rules, and take the first matching one. If no rule matches, returns None To match we ignore lettercase and spaces before and after value of rule and value of record @param bfo: a L{BibFormatObject} @param of: the code of the output format to use @return: name of a format template """ output_format = get_output_format(of) for rule in output_format['rules']: if rule['field'].startswith('00'): # Rule uses controlfield values = [bfo.control_field(rule['field']).strip()] #Remove spaces else: # Rule uses datafield values = bfo.fields(rule['field']) # loop over multiple occurences, but take the first match if len(values) > 0: for value in values: value = value.strip() #Remove spaces pattern = rule['value'].strip() #Remove spaces match_obj = re.match(pattern, value, re.IGNORECASE) if match_obj is not None and \ match_obj.end() == len(value): return rule['template'] template = output_format['default'] if template != '': return template else: return None def format_with_format_template(format_template_filename, bfo, verbose=0, format_template_code=None): """ Format a record given a format template. Also returns errors Returns a formatted version of the record represented by bfo, in the language specified in bfo, and with the specified format template. If format_template_code is provided, the template will not be loaded from format_template_filename (but format_template_filename will still be used to determine if bft or xsl transformation applies). This allows to preview format code without having to save file on disk. 
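    A (hypothetical) preview call could look like::

        bfo = BibFormatObject(10, ln='en')
        (html, errors) = format_with_format_template(
            'Default_HTML_brief.bft',  # only decides .bft vs .xsl handling
            bfo,
            format_template_code='<h2><BFE_TITLE /></h2>')

    Here the template code is taken from 'format_template_code', so the
    file on disk is never read.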
@param format_template_filename: the dilename of a format template @param bfo: the object containing parameters for the current formatting @param format_template_code: if not empty, use code as template instead of reading format_template_filename (used for previews) @param verbose: the level of verbosity from 0 to 9 (O: silent, 5: errors, 7: errors and warnings, 9: errors and warnings, stop if error (debug mode )) @return: tuple (formatted text, errors) """ _ = gettext_set_language(bfo.lang) def translate(match): """ Translate matching values """ word = match.group("word") translated_word = _(word) return translated_word errors_ = [] if format_template_code is not None: format_content = str(format_template_code) else: format_content = get_format_template(format_template_filename)['code'] if format_template_filename is None or \ format_template_filename.endswith("."+CFG_BIBFORMAT_FORMAT_TEMPLATE_EXTENSION): # .bft filtered_format = filter_languages(format_content, bfo.lang) localized_format = translation_pattern.sub(translate, filtered_format) (evaluated_format, errors) = eval_format_template_elements(localized_format, bfo, verbose) errors_ = errors else: #.xsl if bfo.xml_record: # bfo was initialized with a custom MARCXML xml_record = '\n' + \ record_xml_output(bfo.record) else: # Fetch MARCXML. On-the-fly xm if we are now formatting in xm xml_record = '\n' + \ record_get_xml(bfo.recID, 'xm', on_the_fly=False) # Transform MARCXML using stylesheet evaluated_format = format(xml_record, template_source=format_content) return (evaluated_format, errors_) def eval_format_template_elements(format_template, bfo, verbose=0): """ Evalutes the format elements of the given template and replace each element with its value. Also returns errors. Prepare the format template content so that we can directly replace the marc code by their value. This implies: 1. Look for special tags 2. replace special tags by their evaluation @param format_template: the format template code @param bfo: the object containing parameters for the current formatting @param verbose: the level of verbosity from 0 to 9 (O: silent, 5: errors, 7: errors and warnings, 9: errors and warnings, stop if error (debug mode )) @return: tuple (result, errors) """ errors_ = [] # First define insert_element_code(match), used in re.sub() function def insert_element_code(match): """ Analyses 'match', interpret the corresponding code, and return the result of the evaluation. Called by substitution in 'eval_format_template_elements(...)' @param match: a match object corresponding to the special tag that must be interpreted """ function_name = match.group("function_name") try: format_element = get_format_element(function_name, verbose) except Exception, e: if verbose >= 5: return '' + \ cgi.escape(str(e)).replace('\n', '
<br/>') + \
                   '<br/>
' if format_element is None: error = get_msgs_for_code_list([("ERR_BIBFORMAT_CANNOT_RESOLVE_ELEMENT_NAME", function_name)], stream='error', ln=CFG_SITE_LANG) errors_.append(error) if verbose >= 5: return '' + \ error[0][1]+'' else: params = {} # Look for function parameters given in format template code all_params = match.group('params') if all_params is not None: function_params_iterator = pattern_function_params.finditer(all_params) for param_match in function_params_iterator: name = param_match.group('param') value = param_match.group('value') params[name] = value # Evaluate element with params and return (Do not return errors) (result, errors) = eval_format_element(format_element, bfo, params, verbose) errors_.append(errors) return result # Substitute special tags in the format by our own text. # Special tags have the form format = pattern_tag.sub(insert_element_code, format_template) return (format, errors_) def eval_format_element(format_element, bfo, parameters=None, verbose=0): """ Returns the result of the evaluation of the given format element name, with given L{BibFormatObject} and parameters. Also returns the errors of the evaluation. @param format_element: a format element structure as returned by get_format_element @param bfo: a L{BibFormatObject} used for formatting @param parameters: a dict of parameters to be used for formatting. Key is parameter and value is value of parameter @param verbose: the level of verbosity from 0 to 9 (O: silent, 5: errors, 7: errors and warnings, 9: errors and warnings, stop if error (debug mode )) @return: tuple (result, errors) """ if parameters is None: parameters = {} errors = [] #Load special values given as parameters prefix = parameters.get('prefix', "") suffix = parameters.get('suffix', "") default_value = parameters.get('default', "") escape = parameters.get('escape', "") output_text = '' # 3 possible cases: # a) format element file is found: we execute it # b) format element file is not found, but exist in tag table (e.g. bfe_isbn) # c) format element is totally unknown. Do nothing or report error if format_element is not None and format_element['type'] == "python": # a) We found an element with the tag name, of type "python" # Prepare a dict 'params' to pass as parameter to 'format' # function of element params = {} # Look for parameters defined in format element # Fill them with specified default values and values # given as parameters. # Also remember if the element overrides the 'escape' # parameter format_element_overrides_escape = False for param in format_element['attrs']['params']: name = param['name'] default = param['default'] params[name] = parameters.get(name, default) if name == 'escape': format_element_overrides_escape = True # Add BibFormatObject params['bfo'] = bfo # Execute function with given parameters and return result. 
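        # Note: apply(function, (), params) is the Python 2 spelling of
        # function(**params): every attribute found on the template tag
        # (plus the 'bfo' object added above) is passed to the element's
        # 'format'/'format_element' function as a keyword argument.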
function = format_element['code'] try: output_text = apply(function, (), params) except Exception, e: name = format_element['attrs']['name'] error = ("ERR_BIBFORMAT_EVALUATING_ELEMENT", name, str(params)) errors.append(error) if verbose == 0: register_errors(errors, 'error') elif verbose >= 5: tb = sys.exc_info()[2] error_string = get_msgs_for_code_list(error, stream='error', ln=CFG_SITE_LANG) stack = traceback.format_exception(Exception, e, tb, limit=None) output_text = ''+ \ str(error_string[0][1]) + "".join(stack) +' ' # None can be returned when evaluating function if output_text is None: output_text = "" else: output_text = str(output_text) # Escaping: # (1) By default, everything is escaped in mode 1 # (2) If evaluated element has 'escape_values()' function, use # its returned value as escape mode, and override (1) # (3) If template has a defined parameter 'escape' (in allowed # values), use it, and override (1) and (2). If this # 'escape' parameter is overriden by the format element # (defined in the 'format' function of the element), leave # the escaping job to this element # (1) escape_mode = 1 # (2) escape_function = format_element['escape_function'] if escape_function is not None: try: escape_mode = apply(escape_function, (), {'bfo': bfo}) except Exception, e: error = ("ERR_BIBFORMAT_EVALUATING_ELEMENT_ESCAPE", name) errors.append(error) if verbose == 0: register_errors(errors, 'error') elif verbose >= 5: tb = sys.exc_info()[2] error_string = get_msgs_for_code_list(error, stream='error', ln=CFG_SITE_LANG) output_text += ''+ \ str(error_string[0][1]) +' ' # (3) if escape in ['0', '1', '2', '3', '4', '5', '6', '7']: escape_mode = int(escape) # If escape is equal to 1, then escape all # HTML reserved chars. if escape_mode > 0 and not format_element_overrides_escape: output_text = escape_field(output_text, mode=escape_mode) # Add prefix and suffix if they have been given as parameters and if # the evaluation of element is not empty if output_text.strip() != "": output_text = prefix + output_text + suffix # Add the default value if output_text is empty if output_text == "": output_text = default_value return (output_text, errors) elif format_element is not None and format_element['type'] == "field": # b) We have not found an element in files that has the tag # name. Then look for it in the table "tag" # # # # Load special values given as parameters separator = parameters.get('separator ', "") nbMax = parameters.get('nbMax', "") escape = parameters.get('escape', "1") # By default, escape here # Get the fields tags that have to be printed tags = format_element['attrs']['tags'] output_text = [] # Get values corresponding to tags for tag in tags: p_tag = parse_tag(tag) values = record_get_field_values(bfo.get_record(), p_tag[0], p_tag[1], p_tag[2], p_tag[3]) if len(values)>0 and isinstance(values[0], dict): #flatten dict to its values only values_list = map(lambda x: x.values(), values) #output_text.extend(values) for values in values_list: output_text.extend(values) else: output_text.extend(values) if nbMax != "": try: nbMax = int(nbMax) output_text = output_text[:nbMax] except: name = format_element['attrs']['name'] error = ("ERR_BIBFORMAT_NBMAX_NOT_INT", name) errors.append(error) if verbose < 5: register_errors(error, 'error') elif verbose >= 5: error_string = get_msgs_for_code_list(error, stream='error', ln=CFG_SITE_LANG) output_text = output_text.append(error_string[0][1]) # Add prefix and suffix if they have been given as parameters and if # the evaluation of element is not empty. 
# If evaluation is empty string, return default value if it exists. # Else return empty string if ("".join(output_text)).strip() != "": # If escape is equal to 1, then escape all # HTML reserved chars. if escape == '1': output_text = cgi.escape(separator.join(output_text)) else: output_text = separator.join(output_text) output_text = prefix + output_text + suffix else: #Return default value output_text = default_value return (output_text, errors) else: # c) Element is unknown error = get_msgs_for_code_list([("ERR_BIBFORMAT_CANNOT_RESOLVE_ELEMENT_NAME", format_element)], stream='error', ln=CFG_SITE_LANG) errors.append(error) if verbose < 5: register_errors(error, 'error') return ("", errors) elif verbose >= 5: if verbose >= 9: sys.exit(error[0][1]) return ('' + \ error[0][1]+'', errors) def filter_languages(format_template, ln='en'): """ Filters the language tags that do not correspond to the specified language. @param format_template: the format template code @param ln: the language that is NOT filtered out from the template @return: the format template with unnecessary languages filtered out """ # First define search_lang_tag(match) and clean_language_tag(match), used # in re.sub() function def search_lang_tag(match): """ Searches for the ... tag and remove inner localized tags such as , , that are not current_lang. If current_lang cannot be found inside ... , try to use 'CFG_SITE_LANG' @param match: a match object corresponding to the special tag that must be interpreted """ current_lang = ln def clean_language_tag(match): """ Return tag text content if tag language of match is output language. Called by substitution in 'filter_languages(...)' @param match: a match object corresponding to the special tag that must be interpreted """ if match.group(1) == current_lang: return match.group(2) else: return "" # End of clean_language_tag lang_tag_content = match.group("langs") # Try to find tag with current lang. If it does not exists, # then current_lang becomes CFG_SITE_LANG until the end of this # replace pattern_current_lang = re.compile(r"<("+current_lang+ \ r")\s*>(.*?)()", re.IGNORECASE | re.DOTALL) if re.search(pattern_current_lang, lang_tag_content) is None: current_lang = CFG_SITE_LANG cleaned_lang_tag = ln_pattern.sub(clean_language_tag, lang_tag_content) return cleaned_lang_tag # End of search_lang_tag filtered_format_template = pattern_lang.sub(search_lang_tag, format_template) return filtered_format_template def get_format_template(filename, with_attributes=False): """ Returns the structured content of the given formate template. if 'with_attributes' is true, returns the name and description. Else 'attrs' is not returned as key in dictionary (it might, if it has already been loaded previously):: {'code':"Some template code" 'attrs': {'name': "a name", 'description': "a description"} } @param filename: the filename of an format template @param with_attributes: if True, fetch the attributes (names and description) for format' @return: strucured content of format template """ # Get from cache whenever possible global format_templates_cache if not filename.endswith("."+CFG_BIBFORMAT_FORMAT_TEMPLATE_EXTENSION) and \ not filename.endswith(".xsl"): return None if format_templates_cache.has_key(filename): # If we must return with attributes and template exist in # cache with attributes then return cache. 
# Else reload with attributes if with_attributes and \ format_templates_cache[filename].has_key('attrs'): return format_templates_cache[filename] format_template = {'code':""} try: path = "%s%s%s" % (CFG_BIBFORMAT_TEMPLATES_PATH, os.sep, filename) format_file = open(path) format_content = format_file.read() format_file.close() # Load format template code # Remove name and description if filename.endswith("."+CFG_BIBFORMAT_FORMAT_TEMPLATE_EXTENSION): code_and_description = pattern_format_template_name.sub("", format_content, 1) code = pattern_format_template_desc.sub("", code_and_description, 1) else: code = format_content format_template['code'] = code except Exception, e: errors = get_msgs_for_code_list([("ERR_BIBFORMAT_CANNOT_READ_TEMPLATE_FILE", filename, str(e))], stream='error', ln=CFG_SITE_LANG) register_errors(errors, 'error') # Save attributes if necessary if with_attributes: format_template['attrs'] = get_format_template_attrs(filename) # Cache and return format_templates_cache[filename] = format_template return format_template def get_format_templates(with_attributes=False): """ Returns the list of all format templates, as dictionary with filenames as keys if 'with_attributes' is true, returns the name and description. Else 'attrs' is not returned as key in each dictionary (it might, if it has already been loaded previously):: [{'code':"Some template code" 'attrs': {'name': "a name", 'description': "a description"} }, ... } @param with_attributes: if True, fetch the attributes (names and description) for formats @return: the list of format templates (with code and info) """ format_templates = {} files = os.listdir(CFG_BIBFORMAT_TEMPLATES_PATH) for filename in files: if filename.endswith("."+CFG_BIBFORMAT_FORMAT_TEMPLATE_EXTENSION) or \ filename.endswith(".xsl"): format_templates[filename] = get_format_template(filename, with_attributes) return format_templates def get_format_template_attrs(filename): """ Returns the attributes of the format template with given filename The attributes are {'name', 'description'} Caution: the function does not check that path exists or that the format element is valid. @param filename: the name of a format template @return: a structure with detailed information about given format template """ attrs = {} attrs['name'] = "" attrs['description'] = "" try: template_file = open("%s%s%s" % (CFG_BIBFORMAT_TEMPLATES_PATH, os.sep, filename)) code = template_file.read() template_file.close() match = None if filename.endswith(".xsl"): # .xsl attrs['name'] = filename[:-4] else: # .bft match = pattern_format_template_name.search(code) if match is not None: attrs['name'] = match.group('name') else: attrs['name'] = filename match = pattern_format_template_desc.search(code) if match is not None: attrs['description'] = match.group('desc').rstrip('.') except Exception, e: errors = get_msgs_for_code_list([("ERR_BIBFORMAT_CANNOT_READ_TEMPLATE_FILE", filename, str(e))], stream='error', ln=CFG_SITE_LANG) register_errors(errors, 'error') attrs['name'] = filename return attrs def get_format_element(element_name, verbose=0, with_built_in_params=False): """ Returns the format element structured content. Return None if element cannot be loaded (file not found, not readable or invalid) The returned structure is:: {'attrs': {some attributes in dict. 
See get_format_element_attrs_from_*} 'code': the_function_code, 'type':"field" or "python" depending if element is defined in file or table, 'escape_function': the function to call to know if element output must be escaped} @param element_name: the name of the format element to load @param verbose: the level of verbosity from 0 to 9 (O: silent, 5: errors, 7: errors and warnings, 9: errors and warnings, stop if error (debug mode )) @param with_built_in_params: if True, load the parameters built in all elements @return: a dictionary with format element attributes """ # Get from cache whenever possible global format_elements_cache errors = [] # Resolve filename and prepare 'name' as key for the cache filename = resolve_format_element_filename(element_name) if filename is not None: name = filename.upper() else: name = element_name.upper() if format_elements_cache.has_key(name): element = format_elements_cache[name] if not with_built_in_params or \ (with_built_in_params and \ element['attrs'].has_key('builtin_params')): return element if filename is None: # Element is maybe in tag table if bibformat_dblayer.tag_exists_for_name(element_name): format_element = {'attrs': get_format_element_attrs_from_table( \ element_name, with_built_in_params), 'code':None, 'escape_function':None, 'type':"field"} # Cache and returns format_elements_cache[name] = format_element return format_element else: errors = get_msgs_for_code_list([("ERR_BIBFORMAT_FORMAT_ELEMENT_NOT_FOUND", element_name)], stream='error', ln=CFG_SITE_LANG) if verbose == 0: register_errors(errors, 'error') elif verbose >= 5: sys.stderr.write(errors[0][1]) return None else: format_element = {} module_name = filename if module_name.endswith(".py"): module_name = module_name[:-3] # Load element try: module = __import__(CFG_BIBFORMAT_ELEMENTS_IMPORT_PATH + \ "." + module_name) # Load last module in import path # For eg. load bfe_name in # invenio.bibformat_elements.bfe_name # Used to keep flexibility regarding where elements # directory is (for eg. 
test cases) components = CFG_BIBFORMAT_ELEMENTS_IMPORT_PATH.split(".") for comp in components[1:]: module = getattr(module, comp) except Exception, e: # We catch all exceptions here, as we just want to print # traceback in all cases tb = sys.exc_info()[2] stack = traceback.format_exception(Exception, e, tb, limit=None) errors = get_msgs_for_code_list([("ERR_BIBFORMAT_IN_FORMAT_ELEMENT", element_name,"\n" + "\n".join(stack[-2:-1]))], stream='error', ln=CFG_SITE_LANG) if verbose == 0: register_errors(errors, 'error') elif verbose >= 5: sys.stderr.write(errors[0][1]) if errors: if verbose >= 7: raise Exception, errors[0][1] return None # Load function 'format_element()' inside element try: function_format = module.__dict__[module_name].format_element format_element['code'] = function_format except AttributeError, e: # Try to load 'format()' function try: function_format = module.__dict__[module_name].format format_element['code'] = function_format except AttributeError, e: errors = get_msgs_for_code_list([("ERR_BIBFORMAT_FORMAT_ELEMENT_FORMAT_FUNCTION", element_name)], stream='error', ln=CFG_SITE_LANG) if verbose == 0: register_errors(errors, 'error') elif verbose >= 5: sys.stderr.write(errors[0][1]) if errors: if verbose >= 7: raise Exception, errors[0][1] return None # Load function 'escape_values()' inside element function_escape = getattr(module.__dict__[module_name], 'escape_values', None) format_element['escape_function'] = function_escape # Prepare, cache and return format_element['attrs'] = get_format_element_attrs_from_function( \ function_format, element_name, with_built_in_params) format_element['type'] = "python" format_elements_cache[name] = format_element return format_element def get_format_elements(with_built_in_params=False): """ Returns the list of format elements attributes as dictionary structure Elements declared in files have priority over element declared in 'tag' table The returned object has this format:: {element_name1: {'attrs': {'description':..., 'seealso':... 'params':[{'name':..., 'default':..., 'description':...}, ...] 'builtin_params':[{'name':..., 'default':..., 'description':...}, ...] }, 'code': code_of_the_element }, element_name2: {...}, ...} Returns only elements that could be loaded (not error in code) @return: a dict of format elements with name as key, and a dict as attributes @param with_built_in_params: if True, load the parameters built in all elements """ format_elements = {} mappings = bibformat_dblayer.get_all_name_tag_mappings() for name in mappings: format_elements[name.upper().replace(" ", "_").strip()] = get_format_element(name, with_built_in_params=with_built_in_params) files = os.listdir(CFG_BIBFORMAT_ELEMENTS_PATH) for filename in files: filename_test = filename.upper().replace(" ", "_") if filename_test.endswith(".PY") and filename.upper() != "__INIT__.PY": if filename_test.startswith("BFE_"): filename_test = filename_test[4:] element_name = filename_test[:-3] element = get_format_element(element_name, with_built_in_params=with_built_in_params) if element is not None: format_elements[element_name] = element return format_elements def get_format_element_attrs_from_function(function, element_name, with_built_in_params=False): """ Returns the attributes of the function given as parameter. It looks for standard parameters of the function, default values and comments in the docstring. 
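    For instance, a (hypothetical) element such as::

        def format_element(bfo, separator=', ', highlight='no'):
            '''
            Prints the authors of the record.

            @param separator: a separator between authors
            @param highlight: if 'yes', highlight the search pattern
            @see: bfe_title.py
            '''
            return separator.join(bfo.fields('100__a'))

    would yield the description "Prints the authors of the record", two
    documented parameters ('separator' and 'highlight', 'bfo' being
    hidden) and one 'seealso' entry.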
The attributes are:: {'name' : "name of element" #basically the name of 'name' parameter 'description': "a string description of the element", 'seealso' : ["element_1.py", "element_2.py", ...] #a list of related elements 'params': [{'name':"param_name", #a list of parameters for this element (except 'bfo') 'default':"default value", 'description': "a description"}, ...], 'builtin_params': {name: {'name':"param_name",#the parameters builtin for all elem of this kind 'default':"default value", 'description': "a description"}, ...}, } @param function: the formatting function of a format element @param element_name: the name of the element @param with_built_in_params: if True, load the parameters built in all elements @return: a structure with detailed information of a function """ attrs = {} attrs['description'] = "" attrs['name'] = element_name.replace(" ", "_").upper() attrs['seealso'] = [] docstring = function.__doc__ if isinstance(docstring, str): # Look for function description in docstring #match = pattern_format_element_desc.search(docstring) description = docstring.split("@param")[0] description = description.split("@see:")[0] attrs['description'] = description.strip().rstrip('.') # Look for @see: in docstring match = pattern_format_element_seealso.search(docstring) if match is not None: elements = match.group('see').rstrip('.').split(",") for element in elements: attrs['seealso'].append(element.strip()) params = {} # Look for parameters in function definition (args, varargs, varkw, defaults) = inspect.getargspec(function) # Prepare args and defaults_list such that we can have a mapping # from args to defaults args.reverse() if defaults is not None: defaults_list = list(defaults) defaults_list.reverse() else: defaults_list = [] for arg, default in map(None, args, defaults_list): if arg == "bfo": #Don't keep this as parameter. It is hidden to users, and #exists in all elements of this kind continue param = {} param['name'] = arg if default is None: #In case no check is made inside element, we prefer to #print "" (nothing) than None in output param['default'] = "" else: param['default'] = default param['description'] = "(no description provided)" params[arg] = param if isinstance(docstring, str): # Look for AT param descriptions in docstring. 
# Add description to existing parameters in params dict params_iterator = pattern_format_element_params.finditer(docstring) for match in params_iterator: name = match.group('name') if params.has_key(name): params[name]['description'] = match.group('desc').rstrip('.') attrs['params'] = params.values() # Load built-in parameters if necessary if with_built_in_params: builtin_params = [] # Add 'prefix' parameter param_prefix = {} param_prefix['name'] = "prefix" param_prefix['default'] = "" param_prefix['description'] = """A prefix printed only if the record has a value for this element""" builtin_params.append(param_prefix) # Add 'suffix' parameter param_suffix = {} param_suffix['name'] = "suffix" param_suffix['default'] = "" param_suffix['description'] = """A suffix printed only if the record has a value for this element""" builtin_params.append(param_suffix) # Add 'default' parameter param_default = {} param_default['name'] = "default" param_default['default'] = "" param_default['description'] = """A default value printed if the record has no value for this element""" builtin_params.append(param_default) # Add 'escape' parameter param_escape = {} param_escape['name'] = "escape" param_escape['default'] = "" param_escape['description'] = """0 keeps value as it is. Refer to main documentation for escaping modes 1 to 7""" builtin_params.append(param_escape) attrs['builtin_params'] = builtin_params return attrs def get_format_element_attrs_from_table(element_name, with_built_in_params=False): """ Returns the attributes of the format element with given name in 'tag' table. Returns None if element_name does not exist in tag table. The attributes are:: {'name' : "name of element" #basically the name of 'element_name' parameter 'description': "a string description of the element", 'seealso' : [] #a list of related elements. Always empty in this case 'params': [], #a list of parameters for this element. 
Always empty in this case 'builtin_params': [{'name':"param_name", #the parameters builtin for all elem of this kind 'default':"default value", 'description': "a description"}, ...], 'tags':["950.1", 203.a] #the list of tags printed by this element } @param element_name: an element name in database @param element_name: the name of the element @param with_built_in_params: if True, load the parameters built in all elements @return: a structure with detailed information of an element found in DB """ attrs = {} tags = bibformat_dblayer.get_tags_from_name(element_name) field_label = "field" if len(tags)>1: field_label = "fields" attrs['description'] = "Prints %s %s of the record" % (field_label, ", ".join(tags)) attrs['name'] = element_name.replace(" ", "_").upper() attrs['seealso'] = [] attrs['params'] = [] attrs['tags'] = tags # Load built-in parameters if necessary if with_built_in_params: builtin_params = [] # Add 'prefix' parameter param_prefix = {} param_prefix['name'] = "prefix" param_prefix['default'] = "" param_prefix['description'] = """A prefix printed only if the record has a value for this element""" builtin_params.append(param_prefix) # Add 'suffix' parameter param_suffix = {} param_suffix['name'] = "suffix" param_suffix['default'] = "" param_suffix['description'] = """A suffix printed only if the record has a value for this element""" builtin_params.append(param_suffix) # Add 'separator' parameter param_separator = {} param_separator['name'] = "separator" param_separator['default'] = " " param_separator['description'] = """A separator between elements of the field""" builtin_params.append(param_separator) # Add 'nbMax' parameter param_nbMax = {} param_nbMax['name'] = "nbMax" param_nbMax['default'] = "" param_nbMax['description'] = """The maximum number of values to print for this element. No limit if not specified""" builtin_params.append(param_nbMax) # Add 'default' parameter param_default = {} param_default['name'] = "default" param_default['default'] = "" param_default['description'] = """A default value printed if the record has no value for this element""" builtin_params.append(param_default) # Add 'escape' parameter param_escape = {} param_escape['name'] = "escape" param_escape['default'] = "" param_escape['description'] = """If set to 1, replaces special characters '&', '<' and '>' of this element by SGML entities""" builtin_params.append(param_escape) attrs['builtin_params'] = builtin_params return attrs def get_output_format(code, with_attributes=False, verbose=0): """ Returns the structured content of the given output format If 'with_attributes' is true, also returns the names and description of the output formats, else 'attrs' is not returned in dict (it might, if it has already been loaded previously). if output format corresponding to 'code' is not found return an empty structure. 
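    The underlying file syntax (sketched here for a hypothetical
    TEST.bfo) is line-based: a 'tag' line selects the field to test,
    each 'condition --- template' line adds a rule, and a 'default:'
    line names the fallback template::

        tag 980__a:
        PREPRINT --- PREPRINT.bft
        THESIS --- THESIS.bft
        default: Default_HTML_detailed.bft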
See get_output_format_attrs() to learn more about the attributes:: {'rules': [ {'field': "980__a", 'value': "PREPRINT", 'template': "filename_a.bft", }, {...} ], 'attrs': {'names': {'generic':"a name", 'sn':{'en': "a name", 'fr':"un nom"}, 'ln':{'en':"a long name"}} 'description': "a description" 'code': "fnm1", 'content_type': "application/ms-excel", 'visibility': 1 } 'default':"filename_b.bft" } @param code: the code of an output_format @param with_attributes: if True, fetch the attributes (names and description) for format @param verbose: the level of verbosity from 0 to 9 (O: silent, 5: errors, 7: errors and warnings, 9: errors and warnings, stop if error (debug mode )) @return: strucured content of output format """ output_format = {'rules':[], 'default':""} filename = resolve_output_format_filename(code, verbose) if filename is None: errors = get_msgs_for_code_list([("ERR_BIBFORMAT_OUTPUT_FORMAT_CODE_UNKNOWN", code)], stream='error', ln=CFG_SITE_LANG) register_errors(errors, 'error') if with_attributes: #Create empty attrs if asked for attributes output_format['attrs'] = get_output_format_attrs(code, verbose) return output_format # Get from cache whenever possible global format_outputs_cache if format_outputs_cache.has_key(filename): # If was must return with attributes but cache has not # attributes, then load attributes if with_attributes and not \ format_outputs_cache[filename].has_key('attrs'): format_outputs_cache[filename]['attrs'] = get_output_format_attrs(code, verbose) return format_outputs_cache[filename] try: if with_attributes: output_format['attrs'] = get_output_format_attrs(code, verbose) path = "%s%s%s" % (CFG_BIBFORMAT_OUTPUTS_PATH, os.sep, filename ) format_file = open(path) current_tag = '' for line in format_file: line = line.strip() if line == "": # Ignore blank lines continue if line.endswith(":"): # Retrieve tag # Remove : spaces and eol at the end of line clean_line = line.rstrip(": \n\r") # The tag starts at second position current_tag = "".join(clean_line.split()[1:]).strip() elif line.find('---') != -1: words = line.split('---') template = words[-1].strip() condition = ''.join(words[:-1]) value = "" output_format['rules'].append({'field': current_tag, 'value': condition, 'template': template, }) elif line.find(':') != -1: # Default case default = line.split(':')[1].strip() output_format['default'] = default except Exception, e: errors = get_msgs_for_code_list([("ERR_BIBFORMAT_CANNOT_READ_OUTPUT_FILE", filename, str(e))], stream='error', ln=CFG_SITE_LANG) register_errors(errors, 'error') # Cache and return format_outputs_cache[filename] = output_format return output_format def get_output_format_attrs(code, verbose=0): """ Returns the attributes of an output format. The attributes contain 'code', which is the short identifier of the output format (to be given as parameter in format_record function to specify the output format), 'description', a description of the output format, 'visibility' the visibility of the format in the output format list on public pages and 'names', the localized names of the output format. If 'content_type' is specified then the search_engine will send a file with this content type and with result of formatting as content to the user. The 'names' dict always contais 'generic', 'ln' (for long name) and 'sn' (for short names) keys. 'generic' is the default name for output format. 'ln' and 'sn' contain long and short localized names of the output format. 
Only the languages for which a localization exist are used:: {'names': {'generic':"a name", 'sn':{'en': "a name", 'fr':"un nom"}, 'ln':{'en':"a long name"}} 'description': "a description" 'code': "fnm1", 'content_type': "application/ms-excel", 'visibility': 1 } @param code: the short identifier of the format @param verbose: the level of verbosity from 0 to 9 (O: silent, 5: errors, 7: errors and warnings, 9: errors and warnings, stop if error (debug mode )) @return: strucured content of output format attributes """ if code.endswith("."+CFG_BIBFORMAT_FORMAT_OUTPUT_EXTENSION): code = code[:-(len(CFG_BIBFORMAT_FORMAT_OUTPUT_EXTENSION) + 1)] attrs = {'names':{'generic':"", 'ln':{}, 'sn':{}}, 'description':'', 'code':code.upper(), 'content_type':"", 'visibility':1} filename = resolve_output_format_filename(code, verbose) if filename is None: return attrs attrs['names'] = bibformat_dblayer.get_output_format_names(code) attrs['description'] = bibformat_dblayer.get_output_format_description(code) attrs['content_type'] = bibformat_dblayer.get_output_format_content_type(code) attrs['visibility'] = bibformat_dblayer.get_output_format_visibility(code) return attrs def get_output_formats(with_attributes=False): """ Returns the list of all output format, as a dictionary with their filename as key If 'with_attributes' is true, also returns the names and description of the output formats, else 'attrs' is not returned in dicts (it might, if it has already been loaded previously). See get_output_format_attrs() to learn more on the attributes:: {'filename_1.bfo': {'rules': [ {'field': "980__a", 'value': "PREPRINT", 'template': "filename_a.bft", }, {...} ], 'attrs': {'names': {'generic':"a name", 'sn':{'en': "a name", 'fr':"un nom"}, 'ln':{'en':"a long name"}} 'description': "a description" 'code': "fnm1" } 'default':"filename_b.bft" }, 'filename_2.bfo': {...}, ... } @param with_attributes: if returned output formats contain detailed info, or not @type with_attributes: boolean @return: the list of output formats """ output_formats = {} files = os.listdir(CFG_BIBFORMAT_OUTPUTS_PATH) for filename in files: if filename.endswith("."+CFG_BIBFORMAT_FORMAT_OUTPUT_EXTENSION): code = "".join(filename.split(".")[:-1]) output_formats[filename] = get_output_format(code, with_attributes) return output_formats def resolve_format_element_filename(element_name): """ Returns the filename of element corresponding to x{element_name} This is necessary since format templates code call elements by ignoring case, for eg. is the same as . It is also recommended that format elements filenames are prefixed with bfe_ . We need to look for these too. The name of the element has to start with "BFE_". @param element_name: a name for a format element @return: the corresponding filename, with right case """ if not element_name.endswith(".py"): name = element_name.replace(" ", "_").upper() +".PY" else: name = element_name.replace(" ", "_").upper() files = os.listdir(CFG_BIBFORMAT_ELEMENTS_PATH) for filename in files: test_filename = filename.replace(" ", "_").upper() if test_filename == name or \ test_filename == "BFE_" + name or \ "BFE_" + test_filename == name: return filename # No element with that name found # Do not log error, as it might be a normal execution case: # element can be in database return None def resolve_output_format_filename(code, verbose=0): """ Returns the filename of output corresponding to code This is necessary since output formats names are not case sensitive but most file systems are. 
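    For example (sketch): both 'hd' and 'HD.bfo' are resolved to the
    actual filename 'HD.bfo' when such a file exists in
    CFG_BIBFORMAT_OUTPUTS_PATH, since candidate filenames are compared
    after upper-casing both sides.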
@param code: the code for an output format @param verbose: the level of verbosity from 0 to 9 (O: silent, 5: errors, 7: errors and warnings, 9: errors and warnings, stop if error (debug mode )) @return: the corresponding filename, with right case, or None if not found """ #Remove non alphanumeric chars (except . and _) code = re.sub(r"[^.0-9a-zA-Z_]", "", code) if not code.endswith("."+CFG_BIBFORMAT_FORMAT_OUTPUT_EXTENSION): code = re.sub(r"\W", "", code) code += "."+CFG_BIBFORMAT_FORMAT_OUTPUT_EXTENSION files = os.listdir(CFG_BIBFORMAT_OUTPUTS_PATH) for filename in files: if filename.upper() == code.upper(): return filename # No output format with that name found errors = get_msgs_for_code_list([("ERR_BIBFORMAT_CANNOT_RESOLVE_OUTPUT_NAME", code)], stream='error', ln=CFG_SITE_LANG) if verbose == 0: register_errors(errors, 'error') elif verbose >= 5: sys.stderr.write(errors[0][1]) if verbose >= 9: sys.exit(errors[0][1]) return None def get_fresh_format_template_filename(name): """ Returns a new filename and name for template with given name. Used when writing a new template to a file, so that the name has no space, is unique in template directory Returns (unique_filename, modified_name) @param name: name for a format template @return: the corresponding filename, and modified name if necessary """ #name = re.sub(r"\W", "", name) #Remove non alphanumeric chars name = name.replace(" ", "_") filename = name # Remove non alphanumeric chars (except .) filename = re.sub(r"[^.0-9a-zA-Z]", "", filename) path = CFG_BIBFORMAT_TEMPLATES_PATH + os.sep + filename \ + "." + CFG_BIBFORMAT_FORMAT_TEMPLATE_EXTENSION index = 1 while os.path.exists(path): index += 1 filename = name + str(index) path = CFG_BIBFORMAT_TEMPLATES_PATH + os.sep + filename \ + "." + CFG_BIBFORMAT_FORMAT_TEMPLATE_EXTENSION if index > 1: returned_name = (name + str(index)).replace("_", " ") else: returned_name = name.replace("_", " ") return (filename + "." + CFG_BIBFORMAT_FORMAT_TEMPLATE_EXTENSION, returned_name) #filename.replace("_", " ")) def get_fresh_output_format_filename(code): """ Returns a new filename for output format with given code. Used when writing a new output format to a file, so that the code has no space, is unique in output format directory. The filename also need to be at most 6 chars long, as the convention is that filename == output format code (+ .extension) We return an uppercase code Returns (unique_filename, modified_code) @param code: the code of an output format @return: the corresponding filename, and modified code if necessary """ #code = re.sub(r"\W", "", code) #Remove non alphanumeric chars code = code.upper().replace(" ", "_") # Remove non alphanumeric chars (except . and _) code = re.sub(r"[^.0-9a-zA-Z_]", "", code) if len(code) > 6: code = code[:6] filename = code path = CFG_BIBFORMAT_OUTPUTS_PATH + os.sep + filename \ + "." + CFG_BIBFORMAT_FORMAT_OUTPUT_EXTENSION index = 2 while os.path.exists(path): filename = code + str(index) if len(filename) > 6: filename = code[:-(len(str(index)))]+str(index) index += 1 path = CFG_BIBFORMAT_OUTPUTS_PATH + os.sep + filename \ + "." + CFG_BIBFORMAT_FORMAT_OUTPUT_EXTENSION # We should not try more than 99999... Well I don't see how we # could get there.. Sanity check. if index >= 99999: errors = get_msgs_for_code_list([("ERR_BIBFORMAT_NB_OUTPUTS_LIMIT_REACHED", code)], stream='error', ln=CFG_SITE_LANG) register_errors(errors, 'error') sys.exit("Output format cannot be named as %s"%code) return (filename + "." 
+ CFG_BIBFORMAT_FORMAT_OUTPUT_EXTENSION, filename) def clear_caches(): """ Clear the caches (Output Format, Format Templates and Format Elements) @return: None """ global format_templates_cache, format_elements_cache, format_outputs_cache format_templates_cache = {} format_elements_cache = {} format_outputs_cache = {} class BibFormatObject: """ An object that encapsulates a record and associated methods, and that is given as parameter to all format elements 'format' function. The object is made specifically for a given formatting, i.e. it includes for example the language for the formatting. The object provides basic accessors to the record. For full access, one can get the record with get_record() and then use BibRecord methods on the returned object. """ # The record record = None # The language in which the formatting has to be done lang = CFG_SITE_LANG # A list of string describing the context in which the record has # to be formatted. # It represents the words of the user request in web interface search search_pattern = [] # The id of the record recID = 0 # The information about the user, as returned by # 'webuser.collect_user_info(req)' user_info = None # The format in which the record is being formatted output_format = '' req = None # DEPRECATED: use bfo.user_info instead. Used by WebJournal. def __init__(self, recID, ln=CFG_SITE_LANG, search_pattern=None, xml_record=None, user_info=None, output_format=''): """ Creates a new bibformat object, with given record. You can either specify an record ID to format, or give its xml representation. if 'xml_record' is not None, use 'xml_record' instead of recID for the record. 'user_info' allows to grant access to some functionalities on a page depending on the user's priviledges. It is a dictionary in the following form:: user_info = { 'remote_ip' : '', 'remote_host' : '', 'referer' : '', 'uri' : '', 'agent' : '', 'uid' : -1, 'nickname' : '', 'email' : '', 'group' : [], 'guest' : '1' } @param recID: the id of a record @param ln: the language in which the record has to be formatted @param search_pattern: list of string representing the request used by the user in web interface @param xml_record: a xml string of the record to format @param user_info: the information of the user who will view the formatted page @param output_format: the output_format used for formatting this record """ self.xml_record = None # *Must* remain empty if recid is given if xml_record is not None: # If record is given as parameter self.xml_record = xml_record self.record = create_record(xml_record)[0] recID = record_get_field_value(self.record, "001") self.lang = wash_language(ln) if search_pattern is None: search_pattern = [] self.search_pattern = search_pattern self.recID = recID self.output_format = output_format self.user_info = user_info if self.user_info is None: self.user_info = collect_user_info(None) def get_record(self): """ Returns the record structure of this L{BibFormatObject} instance @return: the record structure as defined by BibRecord library """ from invenio.search_engine import get_record # Create record if necessary if self.record is None: # on-the-fly creation if current output is xm self.record = get_record(self.recID) return self.record def control_field(self, tag, escape=0): """ Returns the value of control field given by tag in record @param tag: the marc code of a field @param escape: 1 if returned value should be escaped. Else 0. 
@return: value of field tag in record """ if self.get_record() is None: #Case where BibRecord could not parse object return '' p_tag = parse_tag(tag) field_value = record_get_field_value(self.get_record(), p_tag[0], p_tag[1], p_tag[2], p_tag[3]) if escape == 0: return field_value else: return escape_field(field_value, escape) def field(self, tag, escape=0): """ Returns the value of the field corresponding to tag in the current record. If the value does not exist, return empty string. Else returns the same as bfo.fields(..)[0] (see docstring below). 'escape' parameter allows to escape special characters of the field. The value of escape can be: 0. no escaping 1. escape all HTML characters 2. remove unsafe HTML tags (Eg. keep
) 3. Mix of mode 1 and 2. If value of field starts with , then use mode 2. Else use mode 1. 4. Remove all HTML tags 5. Same as 2, with more tags allowed (like ) 6. Same as 3, with more tags allowed (like ) 7. Mix of mode 0 and mode 1. If field_value starts with , then use mode 0. Else use mode 1. @param tag: the marc code of a field @param escape: 1 if returned value should be escaped. Else 0. (see above for other modes) @return: value of field tag in record """ list_of_fields = self.fields(tag) if len(list_of_fields) > 0: # Escaping below if escape == 0: return list_of_fields[0] else: return escape_field(list_of_fields[0], escape) else: return "" def fields(self, tag, escape=0, repeatable_subfields_p=False): """ Returns the list of values corresonding to "tag". If tag has an undefined subcode (such as 999C5), the function returns a list of dictionaries, whoose keys are the subcodes and the values are the values of tag.subcode. If the tag has a subcode, simply returns list of values corresponding to tag. Eg. for given MARC:: 999C5 $a value_1a $b value_1b 999C5 $b value_2b 999C5 $b value_3b $b value_3b_bis >>> bfo.fields('999C5b') >>> ['value_1b', 'value_2b', 'value_3b', 'value_3b_bis'] >>> bfo.fields('999C5') >>> [{'a':'value_1a', 'b':'value_1b'}, {'b':'value_2b'}, {'b':'value_3b'}] By default the function returns only one value for each subfield (that is it considers that repeatable subfields are not allowed). It is why in the above example 'value3b_bis' is not shown for bfo.fields('999C5'). (Note that it is not defined which of value_3b or value_3b_bis is returned). This is to simplify the use of the function, as most of the time subfields are not repeatable (in that way we get a string instead of a list). You can allow repeatable subfields by setting 'repeatable_subfields_p' parameter to True. In this mode, the above example would return: >>> bfo.fields('999C5b', repeatable_subfields_p=True) >>> ['value_1b', 'value_2b', 'value_3b'] >>> bfo.fields('999C5', repeatable_subfields_p=True) >>> [{'a':['value_1a'], 'b':['value_1b']}, {'b':['value_2b']}, {'b':['value_3b', 'value3b_bis']}] NOTICE THAT THE RETURNED STRUCTURE IS DIFFERENT. Also note that whatever the value of 'repeatable_subfields_p' is, bfo.fields('999C5b') always show all fields, even repeatable ones. This is because the parameter has no impact on the returned structure (it is always a list). 'escape' parameter allows to escape special characters of the fields. The value of escape can be: 0. No escaping 1. Escape all HTML characters 2. Remove unsafe HTML tags (Eg. keep
) 3. Mix of mode 1 and 2. If value of field starts with , then use mode 2. Else use mode 1. 4. Remove all HTML tags 5. Same as 2, with more tags allowed (like ) 6. Same as 3, with more tags allowed (like ) 7. Mix of mode 0 and mode 1. If field_value starts with , then use mode 0. Else use mode 1. @param tag: the marc code of a field @param escape: 1 if returned values should be escaped. Else 0. @repeatable_subfields_p if True, returns the list of subfields in the dictionary @return: values of field tag in record """ if self.get_record() is None: # Case where BibRecord could not parse object return [] p_tag = parse_tag(tag) if p_tag[3] != "": # Subcode has been defined. Simply returns list of values values = record_get_field_values(self.get_record(), p_tag[0], p_tag[1], p_tag[2], p_tag[3]) if escape == 0: return values else: return [escape_field(value, escape) for value in values] else: # Subcode is undefined. Returns list of dicts. # However it might be the case of a control field. instances = record_get_field_instances(self.get_record(), p_tag[0], p_tag[1], p_tag[2]) if repeatable_subfields_p: list_of_instances = [] for instance in instances: instance_dict = {} for subfield in instance[0]: if not instance_dict.has_key(subfield[0]): instance_dict[subfield[0]] = [] if escape == 0: instance_dict[subfield[0]].append(subfield[1]) else: instance_dict[subfield[0]].append(escape_field(subfield[1], escape)) list_of_instances.append(instance_dict) return list_of_instances else: if escape == 0: return [dict(instance[0]) for instance in instances] else: return [dict([ (subfield[0], escape_field(subfield[1], escape)) \ for subfield in instance[0] ]) \ for instance in instances] def kb(self, kb, string, default=""): """ Returns the value of the "string" in the knowledge base "kb". If kb does not exist or string does not exist in kb, returns 'default' string or empty string if not specified. @param kb: a knowledge base name @param string: the string we want to translate @param default: a default value returned if 'string' not found in 'kb' @return: a string value corresponding to translated input with given kb """ if not string: return default val = get_kbr_values(kb, searchkey=string, searchtype='e') try: return val[0][0] except: return default def escape_field(value, mode=0): """ Utility function used to escape the value of a field in given mode. - mode 0: no escaping - mode 1: escaping all HTML/XML characters (escaped chars are shown as escaped) - mode 2: escaping unsafe HTML tags to avoid XSS, but keep basic one (such as
) Escaped tags are removed. - mode 3: mix of mode 1 and mode 2. If field_value starts with , then use mode 2. Else use mode 1. - mode 4: escaping all HTML/XML tags (escaped tags are removed) - mode 5: same as 2, but allows more tags, like - mode 6: same as 3, but allows more tags, like - mode 7: mix of mode 0 and mode 1. If field_value starts with , then use mode 0. Else use mode 1. @param value: value to escape @param mode: escaping mode to use @return: an escaped version of X{value} according to chosen X{mode} """ if mode == 1: return cgi.escape(value) elif mode in [2, 5]: - allowed_attribute_whitelist = cfg_html_buffer_allowed_attribute_whitelist - allowed_tag_whitelist = cfg_html_buffer_allowed_tag_whitelist + \ + allowed_attribute_whitelist = CFG_HTML_BUFFER_ALLOWED_ATTRIBUTE_WHITELIST + allowed_tag_whitelist = CFG_HTML_BUFFER_ALLOWED_TAG_WHITELIST + \ ('class',) if mode == 5: allowed_attribute_whitelist += ('src', 'alt', 'width', 'height', 'style', 'summary', 'border', 'cellspacing', 'cellpadding') allowed_tag_whitelist += ('img', 'table', 'td', 'tr', 'th', 'span', 'caption') try: return washer.wash(value, allowed_attribute_whitelist=\ allowed_attribute_whitelist, allowed_tag_whitelist= \ allowed_tag_whitelist ) except HTMLParseError: # Parsing failed return cgi.escape(value) elif mode in [3, 6]: if value.lstrip(' \n').startswith(html_field): - allowed_attribute_whitelist = cfg_html_buffer_allowed_attribute_whitelist - allowed_tag_whitelist = cfg_html_buffer_allowed_tag_whitelist + \ + allowed_attribute_whitelist = CFG_HTML_BUFFER_ALLOWED_ATTRIBUTE_WHITELIST + allowed_tag_whitelist = CFG_HTML_BUFFER_ALLOWED_TAG_WHITELIST + \ ('class',) if mode == 6: allowed_attribute_whitelist += ('src', 'alt', 'width', 'height', 'style', 'summary', 'border', 'cellspacing', 'cellpadding') allowed_tag_whitelist += ('img', 'table', 'td', 'tr', 'th', 'span', 'caption') try: return washer.wash(value, allowed_attribute_whitelist=\ allowed_attribute_whitelist, allowed_tag_whitelist=\ allowed_tag_whitelist ) except HTMLParseError: # Parsing failed return cgi.escape(value) else: return cgi.escape(value) elif mode == 4: try: return washer.wash(value, allowed_attribute_whitelist=[], allowed_tag_whitelist=[] ) except HTMLParseError: # Parsing failed return cgi.escape(value) elif mode == 7: if value.lstrip(' \n').startswith(html_field): return value else: return cgi.escape(value) else: return value def bf_profile(): """ Runs a benchmark @return: None """ for i in range(1, 51): format_record(i, "HD", ln=CFG_SITE_LANG, verbose=9, search_pattern=[]) return if __name__ == "__main__": import profile import pstats #bf_profile() profile.run('bf_profile()', "bibformat_profile") p = pstats.Stats("bibformat_profile") p.strip_dirs().sort_stats("cumulative").print_stats() diff --git a/modules/bibformat/lib/elements/Makefile.am b/modules/bibformat/lib/elements/Makefile.am index 7f9bf81c3..2e15d1b24 100644 --- a/modules/bibformat/lib/elements/Makefile.am +++ b/modules/bibformat/lib/elements/Makefile.am @@ -1,46 +1,47 @@ ## This file is part of Invenio. ## Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. 
## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. pylibdir=$(libdir)/python/invenio/bibformat_elements pylib_DATA = bfe_field.py bfe_title.py bfe_authors.py bfe_abstract.py bfe_affiliation.py \ bfe_imprint.py bfe_fulltext.py bfe_place.py bfe_publisher.py bfe_topbanner.py \ bfe_date_rec.py bfe_keywords.py bfe_notes.py bfe_reprints.py bfe_publi_info.py \ bfe_cited_by.py bfe_references.py bfe_title_brief.py \ bfe_report_numbers.py bfe_additional_report_numbers.py bfe_url.py \ bfe_addresses.py bfe_contact.py bfe_photo_resources_brief.py \ bfe_collection.py bfe_editors.py bfe_bibtex.py bfe_edit_record.py \ bfe_date.py bfe_xml_record.py bfe_external_publications.py __init__.py \ bfe_bfx_engine.py bfe_creation_date.py bfe_server_info.py bfe_issn.py \ bfe_client_info.py bfe_language.py bfe_record_id.py bfe_comments.py \ bfe_pagination.py bfe_fulltext_mini.py bfe_year.py bfe_isbn.py \ bfe_appears_in_collections.py bfe_photos.py bfe_record_stats.py \ bfe_edit_files.py bfe_plots.py bfe_plots_thumb.py bfe_sword_push.py \ bfe_video_sources.py bfe_video_bigthumb.py \ bfe_aid_authors.py bfe_doi.py bfe_addthis.py \ bfe_duration.py bfe_record_url.py bfe_video_selector.py \ bfe_video_platform_downloads.py bfe_video_platform_suggestions.py \ - bfe_video_platform_sources.py bfe_sciencewise.py bfe_bookmark.py + bfe_video_platform_sources.py bfe_sciencewise.py bfe_bookmark.py \ + bfe_oai_marcxml.py tmpdir = $(prefix)/var/tmp/tests_bibformat_elements tmp_DATA = test_1.py bfe_test_2.py bfe_test_4.py test3.py test_5.py \ test_no_element.test __init__.py EXTRA_DIST = $(pylib_DATA) $(tmp_DATA) CLEANFILES = *~ *.tmp *.pyc diff --git a/modules/bibformat/lib/elements/bfe_oai_marcxml.py b/modules/bibformat/lib/elements/bfe_oai_marcxml.py new file mode 100644 index 000000000..fc8555690 --- /dev/null +++ b/modules/bibformat/lib/elements/bfe_oai_marcxml.py @@ -0,0 +1,49 @@ +# -*- coding: utf-8 -*- +## +## This file is part of Invenio. +## Copyright (C) 2011 CERN. +## +## Invenio is free software; you can redistribute it and/or +## modify it under the terms of the GNU General Public License as +## published by the Free Software Foundation; either version 2 of the +## License, or (at your option) any later version. +## +## Invenio is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Invenio; if not, write to the Free Software Foundation, Inc., +## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. +"""BibFormat element - OAI ready MARCXML + +This element return the full MARCXML representation of a record with the marc +prefix and namespace and adding the leader. +""" + +from invenio.bibformat_dblayer import get_preformatted_record + +def format_element(bfo): + """ + Return the MARCXML representation of the record with the marc prefix and + namespace and adding the leader. 
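+    A sketch of the intended output shape (namespace as defined by the
+    MARCXML standard, leader value as inserted by the code below)::
+
+        <marc:record xmlns:marc="http://www.loc.gov/MARC21/slim">
+          <marc:leader>00000coc 2200000uu 4500</marc:leader>
+          ...
+        </marc:record>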
+      XML Schema which can be used to validate replies to all OAI-PMH v2.0 requests. Herbert Van de Sompel, 2002-05-13. Validated with XML Spy v.4.3 on 2002-05-13. Validated with XSV 1.203.2.45/1.106.2.22 on 2002-05-13. Added definition of protocolVersionType instead of using anonymous type. No change of function. Simeon Warner, 2004-03-29. Tightened definition of UTCdatetimeType to enforce the restriction to UTC Z notation. Simeon Warner, 2004-09-14. Corrected pattern matches for setSpecType and metadataPrefixType to agree with protocol specification. Simeon Warner, 2004-10-12. Spelling correction. Simeon Warner, 2008-12-07. $Date: 2004/10/12 15:20:29 $
+      Define requestType, indicating the protocol request that led to the response. Element content is BASE-URL, attributes are arguments of protocol request, attribute-values are values of arguments of protocol request
+      A record has a header, a metadata part, and an optional about container
+      A header has a unique identifier, a datestamp, and setSpec(s) in case the item from which the record is disseminated belongs to set(s). the header can carry a deleted status indicating that the record is deleted.
+      Metadata must be expressed in XML that complies with another XML Schema (namespace=#other). Metadata must be explicitly qualified in the response.
+      Data "about" the record must be expressed in XML that is compliant with an XML Schema defined by a community.
+      A resumptionToken may have 3 optional attributes and can be used in ListSets, ListIdentifiers, ListRecords responses.
+      The descriptionType is used for the description element in Identify and for setDescription element in ListSets. Content must be compliant with an XML Schema defined by a community.
+      Datestamps are to either day (type date) or to seconds granularity (type oai:UTCdateTimeZType)
diff --git a/modules/bibharvest/etc/oai2.xsl.v1.0 b/modules/bibharvest/etc/oai2.xsl.v1.0 new file mode 100644 index 000000000..ee3173297 --- /dev/null +++ b/modules/bibharvest/etc/oai2.xsl.v1.0 @@ -0,0 +1,659 @@
+td.value {
	vertical-align: top;
	padding-left: 1em;
	padding: 3px;
}
td.key {
	background-color: #e0e0ff;
	padding: 3px;
	text-align: right;
	border: 1px solid #c0c0c0;
	white-space: nowrap;
	font-weight: bold;
	vertical-align: top;
}
.dcdata td.key {
	background-color: #ffffe0;
}
body {
	margin: 1em 2em 1em 2em;
}
h1, h2, h3 {
	font-family: sans-serif;
	clear: left;
}
h1 {
	padding-bottom: 4px;
	margin-bottom: 0px;
}
h2 {
	margin-bottom: 0.5em;
}
h3 {
	margin-bottom: 0.3em;
	font-size: medium;
}
.link {
	border: 1px outset #88f;
	background-color: #c0c0ff;
	padding: 1px 4px 1px 4px;
	font-size: 80%;
	text-decoration: none;
	font-weight: bold;
	font-family: sans-serif;
	color: black;
}
.link:hover {
	color: red;
}
.link:active {
	color: red;
	border: 1px inset #88f;
	background-color: #a0a0df;
}
.oaiRecord, .oaiRecordTitle {
	background-color: #f0f0ff;
	border-style: solid;
	border-color: #d0d0d0;
}
h2.oaiRecordTitle {
	background-color: #e0e0ff;
	font-size: medium;
	font-weight: bold;
	padding: 10px;
	border-width: 2px 2px 0px 2px;
	margin: 0px;
}
.oaiRecord {
	margin-bottom: 3em;
	border-width: 2px;
	padding: 10px;
}

.results {
	margin-bottom: 1.5em;
}
ul.quicklinks {
	margin-top: 2px;
	padding: 4px;
	text-align: left;
	border-bottom: 2px solid #ccc;
	border-top: 2px solid #ccc;
	clear: left;
}
ul.quicklinks li {
	font-size: 80%;
	display: inline;
	list-style: none;
	font-family: sans-serif;
}
p.intro {
	font-size: 80%;
}

+ OAI 2.0 Request Results

OAI 2.0 Request Results

+ +
+ + +

About the XSLT

+

An XSLT file has converted the OAI-PMH 2.0 response into XHTML, which displays nicely in browsers that support XSLT, such as Mozilla, Firebird and Internet Explorer. The XSLT file was created by Christopher Gutteridge at the University of Southampton as part of the GNU EPrints system, and is freely redistributable under the GPL.

If you want to use the XSL file on your own OAI interface you may, but due to the way XSLT works you must install the XSL file on the same server as the OAI script; you can't just link to this copy.

For more information, or to download the XSL file, please see the OAI to XHTML XSLT homepage.
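The same-server restriction matters because each OAI-PMH response has to point at the stylesheet itself. Below is a sketch of the preamble a server emits before the OAI-PMH root element; the href path is an assumption based on the file name added by this patch (the actual processing instruction in oai_header() was garbled in this copy):

CFG_SITE_URL = "http://localhost"  # illustrative; Invenio reads this from invenio.config

def oai_response_preamble():
    # XML declaration plus an xml-stylesheet processing instruction that
    # points at oai2.xsl.v1.0 served from the same host as the OAI script.
    out = '<?xml version="1.0" encoding="UTF-8"?>\n'
    out += '<?xml-stylesheet type="text/xsl" href="%s/oai2.xsl.v1.0" ?>\n' % CFG_SITE_URL
    return out

print oai_response_preamble()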

+ + + + + + + + + + + + + + + + +
Datestamp of response
Request URL
+ + + +

OAI Error(s)

+

The request could not be completed due to the following error or errors.

+
+ +
+
+ +

Request was of type .

+
+ + + + + + +
+
+
+
+ + + + + + + + +
Error Code
+

+
+ + + + + + + + + + + + + + + + + + +
Repository Name
Base URL
Protocol Version
Earliest Datestamp
Deleted Record Policy
Granularity
+ + +
+ + + Admin Email + + + + + + +

Unsupported Description Type

+

The XSL currently does not support this type of description.

+
+ +
+
+ + + + + +

OAI-Identifier

+ + + + + + + + + +
Scheme
Repository Identifier
Delimiter
Sample OAI Identifier
+
+ + + + + +

EPrints Description

+

Content

+ + +

Submission Policy

+ +
+

Metadata Policy

+ +

Data Policy

+ + +

Content

+ +
+ +
+ + + +

+
+ +
+
+
+ + +

Comment

+
+
+ + + + + +

Friends

+
    + +
+
+ + +
  • + +Identify
  • +
    + + + + + +

    Branding

    + + +
    + + +

    Icon

    + + + {br:title} + + + {br:title} + + +
    + + +

    Metadata Rendering Rule

    + + + + + + + +
    URL
    Namespace
    Mime Type
    +
    + + + + + + +

    Gateway Information

    + + + + + + + + + + + + + + +
    Source
    Description
    URL
    Notes
    +
    + + + Admin + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    Set

    + + + + +
    setName
    +
    + + + + + + +

    This is a list of metadata formats available for the record "". Use these links to view the metadata:

    +
    + +

    This is a list of metadata formats available from this archive.

    +
    +
    + +
    + + +

    Metadata Format

    + + + + + + + +
    metadataPrefix
    metadataNamespace
    schema
    +
    + + + + + + + + +

    OAI Record:

    +
    + + + +
    +
    + + +

    OAI Record Header

    + + + + + + +
    OAI Identifier + + oai_dc + formats +
    Datestamp
    + +

    This record has been deleted.

    +
    +
    + + + +

    "about" part of record container not supported by the XSL

    +
    + + +   + + + + + + + + + + setSpec + + Identifiers + Records + + + + + + + + +

    There are more results.

    + + + +
    resumptionToken: + +Resume
    +
    + + + + +

    Unknown Metadata Format

    +
    + +
    +
    + + + + +
    +

    Dublin Core Metadata (oai_dc)

    + + +
    +
    +
    + + +Title + + +Author or Creator + + +Subject and Keywords + + +Description + + +Publisher + + +Other Contributor + + +Date + + +Resource Type + + +Format + + +Resource Identifier + + +Source + + +Language + + +Relation + + + + + URL + URL not shown as it is very long. + + + + + + + + + + + + + +Coverage + + +Rights Management + + + + +
    + <></> +
    +
    + + + + + ="" + + + +.xmlSource { + font-size: 70%; + border: solid #c0c0a0 1px; + background-color: #ffffe0; + padding: 2em 2em 2em 0em; +} +.xmlBlock { + padding-left: 2em; +} +.xmlTagName { + color: #800000; + font-weight: bold; +} +.xmlAttrName { + font-weight: bold; +} +.xmlAttrValue { + color: #0000c0; +} + + + + diff --git a/modules/bibharvest/lib/Makefile.am b/modules/bibharvest/lib/Makefile.am index fb24aab01..cab6ffe49 100644 --- a/modules/bibharvest/lib/Makefile.am +++ b/modules/bibharvest/lib/Makefile.am @@ -1,37 +1,38 @@ ## This file is part of Invenio. ## Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. pylibdir = $(libdir)/python/invenio pylib_DATA = oai_repository_server.py \ oai_repository_tests.py \ oai_repository_webinterface.py \ oai_repository_regression_tests.py \ + oai_repository_config.py \ oai_harvest_getter.py \ - oai_harvest_dblayer.py \ + oai_harvest_dblayer.py \ bibharvest_templates.py \ oai_harvest_admin.py \ oai_harvest_admin_regression_tests.py \ oai_repository_admin.py \ oai_repository_admin_regression_tests.py \ oai_harvest_daemon.py \ oai_repository_updater.py \ oai_harvest_config.py EXTRA_DIST = $(pylib_DATA) CLEANFILES = *~ *.tmp *.pyc diff --git a/modules/bibharvest/lib/oai_repository_admin.py b/modules/bibharvest/lib/oai_repository_admin.py index 571678c68..1358d22fd 100644 --- a/modules/bibharvest/lib/oai_repository_admin.py +++ b/modules/bibharvest/lib/oai_repository_admin.py @@ -1,816 +1,826 @@ ## This file is part of Invenio. ## Copyright (C) 2009, 2010, 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
"""Invenio OAI Repository Administrator Interface.""" __revision__ = "$Id$" import cgi import os from invenio.config import \ CFG_SITE_LANG, \ CFG_TMPDIR, \ CFG_SITE_URL import invenio.access_control_engine as access_manager from invenio.urlutils import create_html_link from invenio.dbquery import run_sql from invenio.oai_repository_updater import parse_set_definition from invenio.messages import gettext_set_language +from invenio.errorlib import register_exception +from invenio.oai_repository_config import CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC import invenio.template bibharvest_templates = invenio.template.load('bibharvest') tmppath = CFG_TMPDIR + '/oairepositoryadmin.' + str(os.getpid()) guideurl = "help/admin/oai-admin-guide" oai_rep_admin_url = CFG_SITE_URL + \ "/admin/bibharvest/oairepositoryadmin.py" def getnavtrail(previous = '', ln = CFG_SITE_LANG): """Get navtrail""" return bibharvest_templates.tmpl_getnavtrail(previous = previous, ln = ln) def perform_request_index(ln=CFG_SITE_LANG): """OAI Repository admin index""" out = '''

    Define below the sets to expose through the OAI harvesting protocol.
    You will have to run the oairepositoryupdater utility to apply the settings you have defined here.

    ''' % {'siteurl': CFG_SITE_URL, 'ln': ln} titlebar = bibharvest_templates.tmpl_draw_titlebar(ln = ln, title = "OAI repository", guideurl = guideurl, extraname = "add new OAI set", extraurl = "admin/bibharvest/oairepositoryadmin.py/addset") header = ['id', 'setSpec', 'setName', 'collection', 'p1', 'f1', 'm1', 'op1', 'p2', 'f2', 'm2', 'op2', 'p3', 'f3', 'm3', '', ''] oai_set = get_oai_set() sets = [] for (id, setSpec, setName, setCollection, \ setDescription, p1, f1, m1, p2, f2, m2, \ p3, f3, m3, op1, op2) in oai_set: del_request = 'delete' edit_request = 'edit' sets.append([id, cgi.escape(setSpec), cgi.escape(setName), cgi.escape(setCollection), cgi.escape(p1), f1, m1, op1, cgi.escape(p2), f2, m2, op2, cgi.escape(p3), f3, m3, del_request, edit_request]) add_request = 'Add new OAI set definition' sets.append(['', add_request, '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']) out += transform_tuple(header=header, tuple=sets) out += "

    " return out def perform_request_addset(oai_set_name='', oai_set_spec='', oai_set_collection='', oai_set_description='', oai_set_definition='', oai_set_reclist='', oai_set_p1='', oai_set_f1='',oai_set_m1='', oai_set_p2='', oai_set_f2='', oai_set_m2='', oai_set_p3='', oai_set_f3='', oai_set_m3='', oai_set_op1='a', oai_set_op2='a', ln=CFG_SITE_LANG, func=0): """add a new OAI set""" _ = gettext_set_language(ln) out = "" if func in ["0", 0]: text = input_form(oai_set_name, oai_set_spec, oai_set_collection, oai_set_description, oai_set_definition, oai_set_reclist, oai_set_p1, oai_set_f1,oai_set_m1, oai_set_p2, oai_set_f2,oai_set_m2, oai_set_p3, oai_set_f3, oai_set_m3, oai_set_op1, oai_set_op2, ln=ln) out = createform(action="addset", text=text, ln=ln, button="Add new OAI set definition line", func=1) lnargs = [["ln", ln]] if func in ["1", 1]: out += "
    " res = add_oai_set(oai_set_name, oai_set_spec, oai_set_collection, oai_set_description, oai_set_definition, oai_set_reclist, oai_set_p1, oai_set_f1, oai_set_m1, oai_set_p2, oai_set_f2, oai_set_m2, oai_set_p3, oai_set_f3, oai_set_m3, oai_set_op1, oai_set_op2) if res[0] == 1: out += bibharvest_templates.tmpl_print_info(ln, "OAI set definition %s added." % \ cgi.escape(oai_set_name)) out += "
    " out += "

    " out += create_html_link(urlbase=oai_rep_admin_url + \ "/index", urlargd={'ln': ln}, link_label=_("Return to main selection")) return nice_box("", out) def perform_request_editset(oai_set_id=None, oai_set_name='', oai_set_spec='', oai_set_collection='', oai_set_description='', oai_set_definition='', oai_set_reclist='', oai_set_p1='', oai_set_f1='', oai_set_m1='', oai_set_p2='', oai_set_f2='', oai_set_m2='', oai_set_p3='', oai_set_f3='', oai_set_m3='', oai_set_op1='a', oai_set_op2='a', ln=CFG_SITE_LANG, func=0): """creates html form to edit an OAI set.""" _ = gettext_set_language(ln) if oai_set_id is None: return "No OAI set ID selected." out = "" if func in [0, "0"]: oai_set = get_oai_set(oai_set_id) if not oai_set: return "ERROR: oai_set_id %s seems invalid" % oai_set_id oai_set_spec = oai_set[0][1] oai_set_name = oai_set[0][2] oai_set_collection = oai_set[0][3] oai_set_description = oai_set[0][4] oai_set_definition = '' oai_set_reclist = '' oai_set_p1 = oai_set[0][5] oai_set_f1 = oai_set[0][6] oai_set_m1 = oai_set[0][7] oai_set_p2 = oai_set[0][8] oai_set_f2 = oai_set[0][9] oai_set_m2 = oai_set[0][10] oai_set_p3 = oai_set[0][11] oai_set_f3 = oai_set[0][12] oai_set_m3 = oai_set[0][13] oai_set_op1 = oai_set[0][14] oai_set_op2 = oai_set[0][15] text = input_form(oai_set_name, oai_set_spec, oai_set_collection, oai_set_description, oai_set_definition, oai_set_reclist, oai_set_p1, oai_set_f1, oai_set_m1, oai_set_p2, oai_set_f2, oai_set_m2, oai_set_p3, oai_set_f3, oai_set_m3, oai_set_op1, oai_set_op2, ln=ln) out += extended_input_form(action="editset", text=text, button="Modify", oai_set_id=oai_set_id, ln=ln, func=1) if func in [1, "1"]: res = modify_oai_set(oai_set_id, oai_set_name, oai_set_spec, oai_set_collection, oai_set_description, oai_set_p1, oai_set_f1, oai_set_m1, oai_set_p2, oai_set_f2, oai_set_m2, oai_set_p3, oai_set_f3, oai_set_m3, oai_set_op1, oai_set_op2) out += "
    " if res[0] == 1: out += bibharvest_templates.tmpl_print_info(ln, "OAI set definition #%s edited." % oai_set_id) out += "
    " else: out += bibharvest_templates.tmpl_print_warning(ln, "A problem was encountered:
    " + cgi.escape(res[1])) out += "
    " out += "
    " out += create_html_link(urlbase=oai_rep_admin_url + \ "/index", urlargd={'ln': ln}, link_label=_("Return to main selection")) return nice_box("", out) def perform_request_delset(oai_set_id=None, ln=CFG_SITE_LANG, callback='yes', func=0): """creates html form to delete an OAI set""" _ = gettext_set_language(ln) out = "" if oai_set_id: oai_set = get_oai_set(oai_set_id) if not oai_set: return "ERROR: oai_set_id %s seems invalid" % oai_set_id nameset = (oai_set[0][1]) pagetitle = """Delete OAI set: %s""" % cgi.escape(nameset) if func in ["0", 0]: oai_set = get_oai_set(oai_set_id) oai_set_spec = oai_set[0][1] oai_set_name = oai_set[0][2] oai_set_collection = oai_set[0][3] oai_set_description = oai_set[0][4] oai_set_definition = '' oai_set_reclist = '' oai_set_p1 = oai_set[0][5] oai_set_f1 = oai_set[0][6] oai_set_m1 = oai_set[0][7] oai_set_p2 = oai_set[0][8] oai_set_f2 = oai_set[0][9] oai_set_m2 = oai_set[0][10] oai_set_p3 = oai_set[0][11] oai_set_f3 = oai_set[0][12] oai_set_m3 = oai_set[0][13] oai_set_op1 = oai_set[0][14] oai_set_op2 = oai_set[0][15] if oai_set: question = """Do you want to delete the OAI definition #%s?""" % oai_set_id text = bibharvest_templates.tmpl_print_info(ln, question) text += "


    " text += pagebody_text( cgi.escape("%s-%s-%s-%s-%s-%s-%s-%s-%s-%s-%s-%s-%s-%s" % \ (oai_set_spec, oai_set_name, oai_set_collection, oai_set_p1, oai_set_f1, oai_set_m1, oai_set_op1, oai_set_p2, oai_set_f2, oai_set_m2, oai_set_op2, oai_set_p3, oai_set_f3, oai_set_m3))) out += createform(action="delset", text=text, button="Delete", oai_set_id=oai_set_id, func=1) else: return bibharvest_templates.tmpl_print_info(ln, "OAI set does not exist.") elif func in ["1", 1]: res = delete_oai_set(oai_set_id) if res[0] == 1: out += bibharvest_templates.tmpl_print_info(ln, "OAI set definition #%s deleted." % oai_set_id) out += "
    " else: pass out += "

    " out += create_html_link(urlbase=oai_rep_admin_url + \ "/index", urlargd={'ln': ln}, link_label=_("Return to main selection")) return nice_box("", out) def get_oai_set(id=''): """Returns a row parameters for a given id""" sets = [] sql = "SELECT id, setSpec, setName, setCollection, setDescription, p1,f1,m1, p2,f2,m2, p3,f3,m3, setDefinition FROM oaiREPOSITORY" try: if id: sql += " WHERE id=%s" % id sql += " ORDER BY setSpec asc" res = run_sql(sql) for row in res: set = ['']*16 set[0] = row[0] set[1] = row[1] set[2] = row[2] params = parse_set_definition(row[14]) set[3] = params.get('c', '') set[5] = params.get('p1', '') set[6] = params.get('f1', '') set[7] = params.get('m1', '') set[8] = params.get('p2', '') set[9] = params.get('f2', '') set[10] = params.get('m2', '') set[11] = params.get('p3', '') set[12] = params.get('f3', '') set[13] = params.get('m3', '') set[14] = params.get('op1', 'a') set[15] = params.get('op2', 'a') sets.append(set) return sets except StandardError, e: + register_exception(alert_admin=True) return str(e) def modify_oai_set(oai_set_id, oai_set_name, oai_set_spec, oai_set_collection, oai_set_description, oai_set_p1, oai_set_f1,oai_set_m1, oai_set_p2, oai_set_f2, oai_set_m2, oai_set_p3, oai_set_f3, oai_set_m3, oai_set_op1, oai_set_op2): """Modifies a row's parameters""" try: + if not oai_set_spec: + oai_set_spec = CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC set_definition = 'c=' + oai_set_collection + ';' + \ 'p1=' + oai_set_p1 + ';' + \ 'f1=' + oai_set_f1 + ';' + \ 'm1=' + oai_set_m1 + ';' + \ 'op1='+ oai_set_op1 + ';' + \ 'p2=' + oai_set_p2 + ';' + \ 'f2=' + oai_set_f2 + ';' + \ 'm2=' + oai_set_m2 + ';' + \ 'op2='+ oai_set_op2 + ';' + \ 'p3=' + oai_set_p3 + ';' + \ 'f3=' + oai_set_f3 + ';' + \ 'm3=' + oai_set_m3 + ';' res = run_sql("""UPDATE oaiREPOSITORY SET setName=%s, setSpec=%s, setCollection=%s, setDescription=%s, setDefinition=%s, p1=%s, f1=%s, m1=%s, p2=%s, f2=%s, m2=%s, p3=%s, f3=%s, - m3=%s + m3=%s, WHERE id=%s""", (oai_set_name, oai_set_spec, oai_set_collection, oai_set_description, set_definition, oai_set_p1, oai_set_f1, oai_set_m1, oai_set_p2, oai_set_f2, oai_set_m2, oai_set_p3, oai_set_f3, oai_set_m3, oai_set_id)) return (1, "") except StandardError, e: + register_exception(alert_admin=True) return (0, str(e)) def add_oai_set(oai_set_name, oai_set_spec, oai_set_collection, oai_set_description, oai_set_definition, oai_set_reclist, oai_set_p1, oai_set_f1,oai_set_m1, oai_set_p2, oai_set_f2,oai_set_m2, oai_set_p3, oai_set_f3, oai_set_m3, oai_set_op1, oai_set_op2): """Add a definition into the OAI Repository""" try: + if not oai_set_spec: + oai_set_spec = CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC set_definition = 'c=' + oai_set_collection + ';' + \ 'p1=' + oai_set_p1 + ';' + \ 'f1=' + oai_set_f1 + ';' + \ 'm1=' + oai_set_m1 + ';' + \ 'op1='+ oai_set_op1 + ';' + \ 'p2=' + oai_set_p2 + ';' + \ 'f2=' + oai_set_f2 + ';' + \ 'm2=' + oai_set_m2 + ';' + \ 'op2='+ oai_set_op2 + ';' + \ 'p3=' + oai_set_p3 + ';' + \ 'f3=' + oai_set_f3 + ';' + \ 'm3=' + oai_set_m3 + ';' res = run_sql("""INSERT INTO oaiREPOSITORY (id, setName, setSpec, setCollection, setDescription, setDefinition, setRecList, p1, f1, m1, p2, f2, m2, p3, f3, m3) VALUES (0, %s, %s, %s, %s, %s, NULL, %s, %s, %s, %s, %s, %s, %s, %s, %s)""", (oai_set_name, oai_set_spec, oai_set_collection, oai_set_description, set_definition, oai_set_p1, oai_set_f1, oai_set_m1, oai_set_p2, oai_set_f2, oai_set_m2, oai_set_p3, oai_set_f3, oai_set_m3)) return (1, "") except StandardError, e: + register_exception(alert_admin=True) return 
(0, e) def delete_oai_set(oai_set_id): """""" try: res = run_sql("DELETE FROM oaiREPOSITORY WHERE id=%s", (oai_set_id,)) return (1, "") except StandardError, e: + register_exception(alert_admin=True) return (0, e) def drop_down_menu(boxname, content): """ Returns the code of a drop down menu. Parameters: boxname - *str* name of the input form content - *list(tuple3)* the content of the list. List of items as tuple3 with: - *str* value of the item - *bool* if item is selected of not - *str* label of the item (displayed value) """ text = " """ % \ (cgi.escape(name, 1), cgi.escape(value, 1)) return text def pagebody_text(title): """""" text = """%s""" % title return text def bar_text(title): """""" text = """%s""" % title return text def input_form(oai_set_name, oai_set_spec, oai_set_collection, oai_set_description, oai_set_definition, oai_set_reclist, oai_set_p1, oai_set_f1,oai_set_m1, oai_set_p2, oai_set_f2,oai_set_m2, oai_set_p3, oai_set_f3, oai_set_m3, oai_set_op1, oai_set_op2, ln=CFG_SITE_LANG): """returns the standard settings form""" modes = { 'r' : 'Regular Expression', 'a' : 'All of the words', 'y' : 'Any of the words', 'e' : 'Exact phrase', 'p' : 'Partial phrase' } mode_dropdown = [['r', '', modes['r']], ['e', '', modes['e']], ['p', '', modes['p']], ['a', '', modes['a']], ['y', '', modes['y']], ['', '', '']] operators = { 'a' : 'AND', 'o' : 'OR', 'n' : 'AND NOT', } mode_operators_1 = [['a', '', operators['a']], ['o', '', operators['o']], ['n', '', operators['n']], ['a', '', '']] mode_operators_2 = [['a', '', operators['a']], ['o', '', operators['o']], ['n', '', operators['n']], ['a', '', '']] text = "
    " text += "
    " text += input_text(ln = ln, title = "OAI Set spec:", name = "oai_set_spec", value = oai_set_spec) - text += 'Optional: leave blank if not needed [?]' + text += 'Optional: if you leave it blank it will be automatically set to "%s", with the implicit convention that any record belonging to it can be harvested by not specifying any set. [?]' % CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC text += "
    " text += input_text(ln = ln, title = "OAI Set name:", name = "oai_set_name", value = oai_set_name) text += 'Optional: leave blank if not needed [?]' text += "
     
    " text += '
    Choose below the search query that defines the records that belong to this set:
    ' text += "
     
    " # text += input_text(ln = ln, title = "OAI Set description", name = "oai_set_description", value = oai_set_description) #text += "" #menu = create_drop_down_menu_content("SELECT distinct(name) from collection") #menu.append(['','','']) #if (oai_set_collection): # menu.append([oai_set_collection,'selected',oai_set_collection]) #else: # menu.append(['','selected','Collection']) text += input_text(ln = ln, title = "Collection(s):", name="oai_set_collection", value=oai_set_collection) #text += drop_down_menu("oai_set_collection", menu) text += 'Eg: Published Articles, Preprints, Theses
    (collections identifiers, not collections names/translations).
    ' text += input_text(ln = ln, title = "Phrase:", name = "oai_set_p1", value = oai_set_p1) text += "" fields = create_drop_down_menu_content("SELECT distinct(code) from field") fields.append(['', '', '']) if (oai_set_f1): fields.append([oai_set_f1, 'selected', oai_set_f1]) else: fields.append(['', 'selected', 'Field']) if (oai_set_m1): mode_dropdown_m1 = [[oai_set_m1, 'selected', modes[oai_set_m1]]] else: mode_dropdown_m1 = [['', 'selected', 'Mode']] text += drop_down_menu("oai_set_f1", fields) text += "" text += drop_down_menu("oai_set_m1", mode_dropdown + mode_dropdown_m1) text += "" if (oai_set_op1): mode_operators_1.append([oai_set_op1, 'selected', operators[oai_set_op1]]) else: mode_operators_1.append(['', 'selected', 'Operators']) text += drop_down_menu("oai_set_op1", mode_operators_1) text += "
    " text += input_text(ln = ln, title = "Phrase:", name = "oai_set_p2", value = oai_set_p2) text += "" fields = create_drop_down_menu_content("SELECT distinct(code) from field") fields.append(['', '', '']) if (oai_set_f2): fields.append([oai_set_f2, 'selected', oai_set_f2]) else: fields.append(['', 'selected', 'Field']) if (oai_set_m2): mode_dropdown_m2 = [[oai_set_m2, 'selected', modes[oai_set_m2]]] else: mode_dropdown_m2 = [['', 'selected', 'Mode']] text += drop_down_menu("oai_set_f2", fields) text += "" text += drop_down_menu("oai_set_m2", mode_dropdown + mode_dropdown_m2) text += "" if (oai_set_op2): mode_operators_2.append([oai_set_op2, 'selected', operators[oai_set_op2]]) else: mode_operators_2.append(['', 'selected', 'Operators']) text += drop_down_menu("oai_set_op2", mode_operators_2) text += "
    " text += input_text(ln = ln, title = "Phrase:", name = "oai_set_p3", value = oai_set_p3) text += "" fields = create_drop_down_menu_content("SELECT distinct(code) from field") fields.append(['', '', '']) if (oai_set_f3): fields.append([oai_set_f3, 'selected', oai_set_f3]) else: fields.append(['', 'selected', 'Field']) if (oai_set_m3): mode_dropdown_m3 = [[oai_set_m3, 'selected', modes[oai_set_m3]]] else: mode_dropdown_m3 = [['', 'selected', 'Mode']] text += drop_down_menu("oai_set_f3", fields) text += "" text += drop_down_menu("oai_set_m3", mode_dropdown + mode_dropdown_m3) text += "
    " return text def check_user(req, role, adminarea=2, authorized=0): """""" (auth_code, auth_message) = access_manager.acc_authorize_action(req, role) if not authorized and auth_code != 0: return ("false", auth_message) return ("", auth_message) def transform_tuple(header, tuple, start='', end='', extracolumn=''): """""" align = [] try: firstrow = tuple[0] if type(firstrow) in [int, long]: align = ['admintdright'] elif type(firstrow) in [str, dict]: align = ['admintdleft'] else: for item in firstrow: if type(item) is int: align.append('admintdright') else: align.append('admintdleft') except IndexError: firstrow = [] tblstr = '' for h in header: tblstr += ' %s\n' % (h, ) if tblstr: tblstr = ' \n%s\n \n' % (tblstr, ) tblstr = start + '\n' + tblstr try: extra = '' if type(firstrow) not in [int, long, str, dict]: for i in range(len(firstrow)): extra += '\n' % (align[i], firstrow[i]) else: extra += ' \n' % (align[0], firstrow) #extra += '\n\n' % (len(tuple), extracolumn) extra += '\n' except IndexError: extra = '' tblstr += extra j = 1 for row in tuple[1:]: style = '' if j % 2: style = ' style="background-color: rgb(235, 247, 255);"' j += 1 tblstr += ' \n' % style if type(row) not in [int, long, str, dict]: for i in range(len(row)): tblstr += '\n' % (align[i], row[i]) else: tblstr += ' \n' % (align[0], row) tblstr += ' \n' tblstr += '
    %s%s\n%s\n
    %s%s
    \n ' tblstr += end return tblstr def nice_box(header='', content='', cls="admin_wvar"): """ Embed the content into a box with given header Parameters: header - *str* header of the box datalist - *str* the content of the box cls - *str* the class of the box """ out = '''
    %s
    %s
    ''' % (cls, header, content) return out def extended_input_form(action="", text="", button="func", cnfrm='', **hidden): """""" out = '
    \n' % (action, ) out += '\n
    ' out += text if cnfrm: out += ' ' for key in hidden.keys(): if type(hidden[key]) is list: for value in hidden[key]: out += ' \n' % (key, value) else: out += ' \n' % (key, hidden[key]) out += '' out += ' \n' % (button, ) out += '
    ' out += '
    \n' return out diff --git a/modules/bibupload/lib/bibupload_config.py b/modules/bibharvest/lib/oai_repository_config.py similarity index 59% copy from modules/bibupload/lib/bibupload_config.py copy to modules/bibharvest/lib/oai_repository_config.py index 690e5a2e0..9b29d21ef 100644 --- a/modules/bibupload/lib/bibupload_config.py +++ b/modules/bibharvest/lib/oai_repository_config.py @@ -1,31 +1,28 @@ -# -*- coding: utf-8 -*- -## ## This file is part of Invenio. -## Copyright (C) 2006, 2007, 2008, 2010, 2011 CERN. +## Copyright (C) 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. -""" -BibUpload Engine configuration. -""" - -__revision__ = "$Id$" - -CFG_BIBUPLOAD_CONTROLFIELD_TAGS = ['001', '002', '003', '004', - '005', '006', '007', '008'] - -CFG_BIBUPLOAD_SPECIAL_TAGS = ['FMT', 'FFT'] +"""OAI Repository Configuration.""" +## Maximum number of records to put in a single bibupload +CFG_OAI_REPOSITORY_MARCXML_SIZE = 100 +## A magic value used to specify the global set (e.g. when the admin +## specify a set configuration without putting any setSpec) +## NOTE: if you change this value, please update accordingly the root +## Makefile.am and tabcreate.sql defaults for setSpec column in +## oaiREPOSITORY MySQL table. +CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC = "GLOBAL_SET" diff --git a/modules/bibharvest/lib/oai_repository_regression_tests.py b/modules/bibharvest/lib/oai_repository_regression_tests.py index ef32d5fd2..d71221693 100644 --- a/modules/bibharvest/lib/oai_repository_regression_tests.py +++ b/modules/bibharvest/lib/oai_repository_regression_tests.py @@ -1,192 +1,186 @@ # -*- coding: utf-8 -*- ## ## This file is part of Invenio. ## Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
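The setDefinition column written by add_oai_set()/modify_oai_set() above is a flat 'key=value;' string ('c=...;p1=...;f1=...;m1=...;op1=...;p2=...;...'), decoded again by parse_set_definition() from oai_repository_updater. A stand-in parser, shown only to document the format (the real function may differ in details):

def parse_set_definition_sketch(set_definition):
    # Split 'c=Theses;p1=ellis;f1=author;...' into a dict of criteria.
    params = {}
    for chunk in set_definition.split(';'):
        if '=' in chunk:
            key, value = chunk.split('=', 1)
            params[key] = value
    return params

definition = 'c=Theses;p1=ellis;f1=author;m1=e;op1=a;p2=;f2=;m2=;op2=a;p3=;f3=;m3=;'
assert parse_set_definition_sketch(definition)['p1'] == 'ellis'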
"""OAI Repository Regression Test Suite.""" __revision__ = "$Id$" import unittest import time import re +from cStringIO import StringIO + from invenio.config import CFG_SITE_URL, \ CFG_OAI_SLEEP, \ CFG_OAI_LOAD, \ CFG_OAI_ID_FIELD -from invenio import oai_repository_server, search_engine, oai_repository_updater +from invenio.intbitset import intbitset +from invenio import oai_repository_server, search_engine from invenio.testutils import make_test_suite, run_test_suite, \ test_web_page_content, merge_error_messages class OAIRepositoryWebPagesAvailabilityTest(unittest.TestCase): """Check OAI Repository web pages whether they are up or not.""" - def test_your_baskets_pages_availability(self): + def test_oai_server_pages_availability(self): """oairepository - availability of OAI server pages""" baseurl = CFG_SITE_URL + '/oai2d' _exports = [#fast commands first: '?verb=Identify', '?verb=ListMetadataFormats', # sleepy commands now: '?verb=ListSets', '?verb=ListRecords', '?verb=GetRecord'] error_messages = [] for url in [baseurl + page for page in _exports]: if url.endswith('Identify') or \ url.endswith('ListMetadataFormats'): pass else: # some sleep required for verbs other than Identify # and ListMetadataFormats, since oai2d refuses too # frequent access: time.sleep(CFG_OAI_SLEEP) error_messages.extend(test_web_page_content(url, expected_text= '')) if error_messages: self.fail(merge_error_messages(error_messages)) return class TestSelectiveHarvesting(unittest.TestCase): """Test set, from and until parameters used to do selective harvesting.""" def test_set(self): """oairepository - testing selective harvesting with 'set' parameter""" - self.assertNotEqual([], oai_repository_server.oaigetsysnolist(set="cern:experiment")) + self.assertEqual(intbitset([10, 17]), oai_repository_server.oai_get_recid_list(set_spec="cern:experiment")) self.assert_("Multifractal analysis of minimum bias events" in \ ''.join([oai_repository_server.print_record(recID) for recID in \ - oai_repository_server.oaigetsysnolist(set="cern:experiment")])) + oai_repository_server.oai_get_recid_list(set_spec="cern:experiment")])) self.assert_("Multifractal analysis of minimum bias events" not in \ ''.join([oai_repository_server.print_record(recID) for recID in \ - oai_repository_server.oaigetsysnolist(set="cern:theory")])) - self.assertEqual([], oai_repository_server.oaigetsysnolist(set="nonExistingSet")) + oai_repository_server.oai_get_recid_list(set_spec="cern:theory")])) + self.failIf(oai_repository_server.oai_get_recid_list(set_spec="nonExistingSet")) def test_from_and_until(self): """oairepository - testing selective harvesting with 'from' and 'until' parameters""" + req = StringIO() # List available records, get datestamps and play with them - identifiers = oai_repository_server.oailistidentifiers("") - datestamps = re.findall('(?P.*)\s*(?P.*)', identifiers) + oai_repository_server.oai_list_records_or_identifiers(req, {'verb': 'ListIdentifiers', 'metadataPrefix': 'marcxml'}) + identifiers = req.getvalue() + datestamps = re.findall('(?P.*?)\s*(?P.*?)', identifiers, re.M) sample_datestamp = datestamps[0][1] # Take one datestamp sample_oai_id = datestamps[0][0] # Take corresponding oai id sample_id = search_engine.perform_request_search(p=sample_oai_id, f=CFG_OAI_ID_FIELD)[0] # Find corresponding system number id # There must be some datestamps self.assertNotEqual([], datestamps) # We must be able to retrieve an id with the date we have just found - self.assert_(sample_id in 
oai_repository_server.oaigetsysnolist(fromdate=sample_datestamp)) - self.assert_(sample_id in oai_repository_server.oaigetsysnolist(untildate=sample_datestamp)) - self.assert_(sample_id in oai_repository_server.oaigetsysnolist(untildate=sample_datestamp, \ + self.assert_(sample_id in oai_repository_server.oai_get_recid_list(fromdate=sample_datestamp)) + self.assert_(sample_id in oai_repository_server.oai_get_recid_list(untildate=sample_datestamp)) + self.assert_(sample_id in oai_repository_server.oai_get_recid_list(untildate=sample_datestamp, \ fromdate=sample_datestamp)) # Same, with short format date. Eg 2007-12-13 - self.assert_(sample_id in oai_repository_server.oaigetsysnolist(fromdate=sample_datestamp.split('T')[0])) - self.assert_(sample_id in oai_repository_server.oaigetsysnolist(untildate=sample_datestamp.split('T')[0])) - self.assert_(sample_id in oai_repository_server.oaigetsysnolist(fromdate=sample_datestamp.split('T')[0], \ + self.assert_(sample_id in oai_repository_server.oai_get_recid_list(fromdate=sample_datestamp.split('T')[0])) + self.assert_(sample_id in oai_repository_server.oai_get_recid_list(untildate=sample_datestamp.split('T')[0])) + self.assert_(sample_id in oai_repository_server.oai_get_recid_list(fromdate=sample_datestamp.split('T')[0], \ untildate=sample_datestamp.split('T')[0])) # At later date (year after) we should not find our id again sample_datestamp_year = int(sample_datestamp[0:4]) sample_datestamp_rest = sample_datestamp[4:] later_datestamp = str(sample_datestamp_year + 1) + sample_datestamp_rest - self.assert_(sample_id not in oai_repository_server.oaigetsysnolist(fromdate=later_datestamp)) + self.assert_(sample_id not in oai_repository_server.oai_get_recid_list(fromdate=later_datestamp)) # At earlier date (year before) we should not find our id again earlier_datestamp = str(sample_datestamp_year - 1) + sample_datestamp_rest - self.assert_(sample_id not in oai_repository_server.oaigetsysnolist(untildate=earlier_datestamp)) + self.assert_(sample_id not in oai_repository_server.oai_get_recid_list(untildate=earlier_datestamp)) # From earliest date to latest date must include all oai records dates = [(time.mktime(time.strptime(date[1], "%Y-%m-%dT%H:%M:%SZ")), date[1]) for date in datestamps] dates = dict(dates) sorted_times = dates.keys() sorted_times.sort() earliest_datestamp = dates[sorted_times[0]] latest_datestamp = dates[sorted_times[-1]] - self.assertEqual(len(oai_repository_server.oaigetsysnolist()), \ - len(oai_repository_server.oaigetsysnolist(fromdate=earliest_datestamp, \ - untildate=latest_datestamp))) + self.assertEqual(oai_repository_server.oai_get_recid_list(), \ + oai_repository_server.oai_get_recid_list(fromdate=earliest_datestamp, \ + untildate=latest_datestamp)) + def test_resumption_token(self): """oairepository - testing harvesting with bad resumption token""" # Non existing resumptionToken - self.assert_('badResumptionToken' in oai_repository_server.oailistrecords('resumptionToken=foobar&verb=ListRecords')) + req = StringIO() + oai_repository_server.oai_list_records_or_identifiers(req, {'resumptionToken': 'foobar', 'verb': 'ListRecords'}) + + self.assert_('badResumptionToken' in req.getvalue()) class TestPerformance(unittest.TestCase): """Test performance of the repository """ def setUp(self): """Setting up some variables""" # Determine how many records are served - self.number_of_records = oai_repository_server.oaigetsysnolist("", "", "") + self.number_of_records = len(oai_repository_server.oai_get_recid_list("", "", "")) if 
CFG_OAI_LOAD < self.number_of_records: self.number_of_records = CFG_OAI_LOAD def test_response_speed_oai(self): """oairepository - speed of response for oai_dc output""" allowed_seconds_per_record_oai = 0.02 # Test oai ListRecords performance t0 = time.time() - oai_repository_server.oailistrecords('metadataPrefix=oai_dc&verb=ListRecords') + oai_repository_server.oai_list_records_or_identifiers(StringIO(), {'metadataPrefix': 'oai_dc', 'verb': 'ListRecords'}) t = time.time() - t0 if t > self.number_of_records * allowed_seconds_per_record_oai: self.fail("""Response for ListRecords with metadataPrefix=oai_dc took too much time: %s seconds. Limit: %s seconds""" % (t, self.number_of_records * allowed_seconds_per_record_oai)) def test_response_speed_marcxml(self): """oairepository - speed of response for marcxml output""" allowed_seconds_per_record_marcxml = 0.05 # Test marcxml ListRecords performance t0 = time.time() - oai_repository_server.oailistrecords('metadataPrefix=marcxml&verb=ListRecords') + oai_repository_server.oai_list_records_or_identifiers(StringIO(), argd={'metadataPrefix': 'marcxml', 'verb': 'ListRecords'}) t = time.time() - t0 if t > self.number_of_records * allowed_seconds_per_record_marcxml: self.fail("""Response for ListRecords with metadataPrefix=marcxml took too much time:\n %s seconds. Limit: %s seconds""" % (t, self.number_of_records * allowed_seconds_per_record_marcxml)) -class TestOAIRepositoryUpdater(unittest.TestCase): - """Test functions in OAI_repository_updater""" - - def test_marcxml_filtering(self): - """oairepository - test MARCXML filtering""" - self.assertEqual(oai_repository_updater.marcxml_filter_out_tags(98, ['088__a']), - ' \n SCAN-9709037\n \n') - - self.assertEqual(oai_repository_updater.marcxml_filter_out_tags(98, ['088__c']), - ' \n SCAN-9709037\n \n \n UCRL-8417\n \n') - - self.assertEqual(oai_repository_updater.marcxml_filter_out_tags(98, ['0248_p']), - ' \n oai:cds.cern.ch:SCAN-9709037\n \n') - TEST_SUITE = make_test_suite(OAIRepositoryWebPagesAvailabilityTest, TestSelectiveHarvesting, - TestPerformance, - TestOAIRepositoryUpdater) + TestPerformance) if __name__ == "__main__": run_test_suite(TEST_SUITE, warn_user=True) diff --git a/modules/bibharvest/lib/oai_repository_server.py b/modules/bibharvest/lib/oai_repository_server.py index 23a0b2b91..a048afc85 100644 --- a/modules/bibharvest/lib/oai_repository_server.py +++ b/modules/bibharvest/lib/oai_repository_server.py @@ -1,1010 +1,869 @@ ## This file is part of Invenio. ## Copyright (C) 2009, 2010, 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
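The reworked regression tests above no longer build OAI query strings by hand; they hand the server a file-like object plus a pre-parsed argument dictionary. The same calling convention can be used interactively inside an Invenio installation (the import only works there):

from cStringIO import StringIO
from invenio import oai_repository_server  # requires an Invenio installation

req = StringIO()
oai_repository_server.oai_list_records_or_identifiers(
    req, {'verb': 'ListIdentifiers', 'metadataPrefix': 'marcxml'})
print req.getvalue()  # the complete OAI-PMH XML response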
"""Receive OAI-PMH 2.0 requests and responds""" __revision__ = "$Id$" import cPickle import os import re -import cgi -import urllib import time +import datetime +import tempfile import sys -if sys.hexversion < 0x2060000: - from md5 import md5 +if sys.hexversion < 0x2050000: + from glob import glob as iglob else: - from hashlib import md5 + from glob import iglob from invenio.config import \ CFG_OAI_DELETED_POLICY, \ CFG_OAI_EXPIRE, \ CFG_OAI_IDENTIFY_DESCRIPTION, \ CFG_OAI_ID_FIELD, \ CFG_OAI_LOAD, \ CFG_OAI_SET_FIELD, \ + CFG_OAI_PREVIOUS_SET_FIELD, \ + CFG_OAI_METADATA_FORMATS, \ CFG_CACHEDIR, \ CFG_SITE_NAME, \ CFG_SITE_SUPPORT_EMAIL, \ CFG_SITE_URL, \ - CFG_WEBSTYLE_HTTP_USE_COMPRESSION + CFG_WEBSTYLE_HTTP_USE_COMPRESSION, \ + CFG_CERN_SITE, \ + CFG_OAI_SAMPLE_IDENTIFIER, \ + CFG_OAI_ID_PREFIX, \ + CFG_OAI_FRIENDS, \ + CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG, \ + CFG_OAI_PROVENANCE_BASEURL_SUBFIELD, \ + CFG_OAI_PROVENANCE_DATESTAMP_SUBFIELD, \ + CFG_OAI_PROVENANCE_METADATANAMESPACE_SUBFIELD, \ + CFG_OAI_PROVENANCE_ORIGINDESCRIPTION_SUBFIELD, \ + CFG_OAI_PROVENANCE_HARVESTDATE_SUBFIELD, \ + CFG_OAI_PROVENANCE_ALTERED_SUBFIELD from invenio.intbitset import intbitset +from invenio.htmlutils import X, EscapedXMLString from invenio.dbquery import run_sql, wash_table_column_name -from invenio.search_engine import record_exists, perform_request_search, \ - get_all_restricted_recids -from invenio.bibformat_dblayer import get_preformatted_record +from invenio.search_engine import record_exists, get_all_restricted_recids, get_all_field_values, search_unit_in_bibxxx, get_record from invenio.bibformat import format_record -from invenio.textutils import encode_for_xml +from invenio.bibrecord import record_get_field_instances +from invenio.errorlib import register_exception +from invenio.oai_repository_config import CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC -verbs = { +CFG_VERBS = { 'GetRecord' : ['identifier', 'metadataPrefix'], 'Identify' : [], 'ListIdentifiers' : ['from', 'until', 'metadataPrefix', 'set', 'resumptionToken'], 'ListMetadataFormats': ['identifier'], 'ListRecords' : ['from', 'until', 'metadataPrefix', 'set', 'resumptionToken'], 'ListSets' : ['resumptionToken'] } -params = { - "verb" : ["Identify","ListIdentifiers","ListSets","ListMetadataFormats","ListRecords","GetRecord"], - "metadataPrefix" : ["oai_dc","marcxml"], - "from" :[""], - "until":[""], - "set" :[""], - "identifier": [""] -} - -def escape_space(strxml): - "Encode special chars in string for URL-compliancy." - - strxml = strxml.replace(' ', '%20') - return strxml - -def encode_for_url(strxml): - "Encode special chars in string for URL-compliancy." 
- - strxml = strxml.replace('%', '%25') - strxml = strxml.replace(' ', '%20') - strxml = strxml.replace('?', '%3F') - strxml = strxml.replace('#', '%23') - strxml = strxml.replace('=', '%3D') - strxml = strxml.replace('&', '%26') - strxml = strxml.replace('/', '%2F') - strxml = strxml.replace(':', '%3A') - strxml = strxml.replace(';', '%3B') - strxml = strxml.replace('+', '%2B') - return strxml +CFG_ERRORS = { + "badArgument": "The request includes illegal arguments, is missing required arguments, includes a repeated argument, or values for arguments have an illegal syntax:", + "badResumptionToken": "The value of the resumptionToken argument is invalid or expired:", + "badVerb": "Value of the verb argument is not a legal OAI-PMH verb, the verb argument is missing, or the verb argument is repeated:", + "cannotDisseminateFormat": "The metadata format identified by the value given for the metadataPrefix argument is not supported by the item or by the repository:", + "idDoesNotExist": "The value of the identifier argument is unknown or illegal in this repository:", + "noRecordsMatch": "The combination of the values of the from, until, set and metadataPrefix arguments results in an empty list:", + "noMetadataFormats": "There are no metadata formats available for the specified item:", + "noSetHierarchy": "The repository does not support sets:" +} -def oai_header(args, verb): - "Print OAI header" +def oai_error(argd, errors): + """ + Return a well-formatted OAI-PMH error + """ + out = """ +""" + out += X.responseDate()(get_utc_now()) + for error_code, error_msg in errors: + assert(error_code in CFG_ERRORS) + if error_code in ("badArgument", "badVerb"): + out += X.request()(oai_get_request_url()) + break + else: + ## There are no badArgument or badVerb errors so we can + ## return the whole request information + out += X.request(**argd)(oai_get_request_url()) + for error_code, error_msg in errors: + if error_msg is None: + error_msg = CFG_ERRORS[error_code] + else: + error_msg = "%s %s" % (CFG_ERRORS[error_code], error_msg) + out += X.error(code=error_code)(error_msg) + out += "" + return out - out = "" +def oai_header(argd, verb): + """ + Return OAI header + """ - out = out + "" + "\n" - out = out + "\n" + out = "" + "\n" + out += "\n" % CFG_SITE_URL + out += "\n" - out = out + " " + oaigetresponsedate() + "\n" + #out += "%s" % get_utc_now() + out += X.responseDate()(get_utc_now()) if verb: - out = out + " %s\n" % (verb, oaigetrequesturl(args)) - out = out + " <%s>\n" % verb + out += X.request(**argd)(oai_get_request_url()) + out += "<%s>\n" % verb else: - out = out + " %s\n" % (oaigetrequesturl(args)) + out += X.request()(oai_get_request_url()) return out def oai_footer(verb): - "Print OAI footer" - + """ + @return: the OAI footer. + """ out = "" - if verb: - out = "%s \n" % (out, verb) - out = out + "\n" - - return out - -def oai_error_header(args, verb): - "Print OAI header" - - out = "" - -### out = "Content-Type: text/xml\n\n" - out = out + "" + "\n" - out = out + "\n" - - out = out + " " + oaigetresponsedate() + "\n" - out = out + " %s\n" % (verb, oaigetrequesturl(args)) - + out += "\n" % (verb) + out += "\n" return out -def oai_error_footer(verb): - "Print OAI footer" - - out = verb - out = "\n" - return out - -def get_field(sysno, field): - "Gets list of field 'field' for the record with 'sysno' system number." +def get_field(recid, field): + """ + Gets list of field 'field' for the record with 'recid' system number. 
+ """ - out = [] digit = field[0:2] bibbx = "bib%sx" % digit bibx = "bibrec_bib%sx" % digit query = "SELECT bx.value FROM %s AS bx, %s AS bibx WHERE bibx.id_bibrec=%%s AND bx.id=bibx.id_bibxxx AND bx.tag=%%s" % (wash_table_column_name(bibbx), wash_table_column_name(bibx)) - res = run_sql(query, (sysno, field)) - - for row in res: - - out.append(row[0]) - - return out + return [row[0] for row in run_sql(query, (recid, field))] def utc_to_localtime(date): """ Convert UTC to localtime Reference: - (1) http://www.openarchives.org/OAI/openarchivesprotocol.html#Dates - (2) http://www.w3.org/TR/NOTE-datetime This function works only with dates complying with the "Complete date plus hours, minutes and seconds" profile of ISO 8601 defined by (2), and linked from (1). Eg: 1994-11-05T13:15:30Z """ ldate = date.split("T")[0] ltime = date.split("T")[1] lhour = ltime.split(":")[0] lminute = ltime.split(":")[1] lsec = ltime.split(":")[2] lsec = lsec[:-1] # Remove trailing "Z" lyear = ldate.split("-")[0] lmonth = ldate.split("-")[1] lday = ldate.split("-")[2] # 1: Build a time as UTC. Since time.mktime() expect a local time : ## 1a: build it without knownledge of dst ## 1b: substract timezone to get a local time, with possibly wrong dst utc_time = time.mktime((int(lyear), int(lmonth), int(lday), int(lhour), int(lminute), int(lsec), 0, 0, -1)) local_time = utc_time - time.timezone # 2: Fix dst for local_time # Find out the offset for daily saving time of the local # timezone at the time of the given 'date' if time.localtime(local_time)[-1] == 1: local_time = local_time + 3600 return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(local_time)) def localtime_to_utc(date): - "Convert localtime to UTC" + """Convert localtime to UTC""" ldate = date.split(" ")[0] ltime = date.split(" ")[1] lhour = ltime.split(":")[0] lminute = ltime.split(":")[1] lsec = ltime.split(":")[2] lyear = ldate.split("-")[0] lmonth = ldate.split("-")[1] lday = ldate.split("-")[2] # Find out the offset for daily saving time of the local # timezone at the time of the given 'date' # # 1: build time that correspond to local date, without knowledge of dst # 2: determine if dst is locally enabled at this time tmp_date = time.mktime((int(lyear), int(lmonth), int(lday), int(lhour), int(lminute), int(lsec), 0, 0, -1)) if time.localtime(tmp_date)[-1] == 1: dst = time.localtime(tmp_date)[-1] else: dst = 0 # 3: Build a new time with knowledge of the dst local_time = time.mktime((int(lyear), int(lmonth), int(lday), int(lhour), int(lminute), int(lsec), 0, 0, dst)) # 4: Get the time as UTC utc_time = time.gmtime(local_time) return time.strftime("%Y-%m-%dT%H:%M:%SZ", utc_time) -def get_modification_date(sysno): - "Returns the date of last modification for the record 'sysno'." 
+def get_modification_date(recid): + """Returns the date of last modification for the record 'recid'.""" out = "" - res = run_sql("SELECT DATE_FORMAT(modification_date,'%%Y-%%m-%%d %%H:%%i:%%s') FROM bibrec WHERE id=%s", (sysno,), 1) + res = run_sql("SELECT DATE_FORMAT(modification_date,'%%Y-%%m-%%d %%H:%%i:%%s') FROM bibrec WHERE id=%s", (recid,), 1) if res and res[0][0]: out = localtime_to_utc(res[0][0]) return out def get_earliest_datestamp(): - "Get earliest datestamp in the database" + """Get earliest datestamp in the database""" out = "" - res = run_sql("SELECT MIN(DATE_FORMAT(creation_date,'%%Y-%%m-%%d %%H:%%i:%%s')) FROM bibrec", (), 1) - if res[0][0]: + res = run_sql("SELECT DATE_FORMAT(MIN(creation_date),'%Y-%m-%d %H:%i:%s') FROM bibrec", n=1) + if res: out = localtime_to_utc(res[0][0]) return out def get_latest_datestamp(): - "Get latest datestamp in the database" + """Get latest datestamp in the database""" out = "" - res = run_sql("SELECT MAX(DATE_FORMAT(modification_date,'%%Y-%%m-%%d %%H:%%i:%%s')) FROM bibrec", (), 1) - if res[0][0]: + res = run_sql("SELECT DATE_FORMAT(MAX(modification_date),'%Y-%m-%d %H:%i:%s') FROM bibrec", n=1) + if res: out = localtime_to_utc(res[0][0]) return out def check_date(date): """Check if given date has a correct format, complying to "Complete date" or "Complete date plus hours, minutes and seconds" formats defined in ISO8601.""" if(re.match("\d\d\d\d-\d\d-\d\d(T\d\d:\d\d:\d\dZ)?\Z", date) is not None): return date else: return "" def normalize_date(date, dtime="T00:00:00Z"): """ Normalize the given date to the "Complete date plus hours, minutes and seconds" format defined in ISO8601 (If "hours, minutes and seconds" part is missing, append 'dtime' to date). 'date' must be checked before with check_date(..). Returns empty string if cannot be normalized """ if len(date) == 10: date = date + dtime elif len(date) != 20: date = "" return date -def print_record(sysno, format='marcxml', record_exists_result=None): - """Prints record 'sysno' formatted according to 'format'. +def get_record_provenance(recid): + """ + Return the provenance XML representation of a record, suitable to be put + in the about tag. 
+ """ + record = get_record(recid) + provenances = record_get_field_instances(record, CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4]) + out = "" + for provenance in provenances: + base_url = identifier = datestamp = metadata_namespace = origin_description = harvest_date = altered = "" + for (code, value) in provenance[0]: + if code == CFG_OAI_PROVENANCE_BASEURL_SUBFIELD: + base_url = value + elif code == CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5]: + identifier = value + elif code == CFG_OAI_PROVENANCE_DATESTAMP_SUBFIELD: + datestamp = value + elif code == CFG_OAI_PROVENANCE_METADATANAMESPACE_SUBFIELD: + metadata_namespace = value + elif code == CFG_OAI_PROVENANCE_ORIGINDESCRIPTION_SUBFIELD: + origin_description = value + elif code == CFG_OAI_PROVENANCE_HARVESTDATE_SUBFIELD: + harvest_date = value + elif code == CFG_OAI_PROVENANCE_ALTERED_SUBFIELD: + altered = value + if base_url: + out += """""" + out += X.originDescription(harvestDate=harvest_date, altered=altered)( + X.baseURL()(base_url), + X.identifier()(identifier), + X.datestamp()(datestamp), + X.metadataNamespace()(metadata_namespace), + origin_description and X.originDescription(origin_description) or '' ## This is already XML + ) + out += """""" + return out + +def get_record_rights(dummy): + """ + Return the record rights parts, suitable to be put in the about tag. + """ + return "" + ## FIXME: This need to be thought in a good way. What shall we really + ## put in the rights parts? + #record = get_record(recid) + #rights = record_get_field_instances(record, CFG_OAI_RIGHTS_FIELD[:3], CFG_OAI_RIGHTS_FIELD[3], CFG_OAI_RIGHTS_FIELD[4]) + #license = record_get_field_instances(record, CFG_OAI_LICENSE_FIELD[:3], CFG_OAI_LICENSE_FIELD[3], CFG_OAI_LICENSE_FIELD[4]) + + #holder = date = rights_uri = contact = statement = terms = publisher = license_uri = '' + #if rights: + #for code, value in rights[0][0]: + #if code == CFG_OAI_RIGHTS_HOLDER_SUBFIELD: + #holder = value + #elif code == CFG_OAI_RIGHTS_DATE_SUBFIELD: + #date = value + #elif code == CFG_OAI_RIGHTS_URI_SUBFIELD: + #rights_uri = value + #elif code == CFG_OAI_RIGHTS_CONTACT_SUBFIELD: + #contact = value + #elif CFG_OAI_RIGHTS_STATEMENT_SUBFIELD: + #statement = value + #if license: + #for code, value in license[0][0]: + #if code == CFG_OAI_LICENSE_TERMS_SUBFIELD: + #terms = value + #elif code == CFG_OAI_LICENSE_PUBLISHER_SUBFIELD: + #publisher = value + #elif code == CFG_OAI_LICENSE_URI_SUBFIELD: + #license_uri = value + +def print_record(recid, prefix='marcxml', verb='ListRecords', set_spec=None): + """Prints record 'recid' formatted according to 'prefix'. - if record does not exist, return nothing. - if record has been deleted and CFG_OAI_DELETED_POLICY is 'transient' or 'deleted', then return only header, with status 'deleted'. - if record has been deleted and CFG_OAI_DELETED_POLICY is 'no', then return nothing. Optional parameter 'record_exists_result' has the value of the result - of the record_exists(sysno) function (in order not to call that function + of the record_exists(recid) function (in order not to call that function again if already done.) 
""" - out = "" + record_exists_result = record_exists(recid) == 1 + if record_exists_result: + sets = get_field(recid, CFG_OAI_SET_FIELD) + if set_spec is not None and not set_spec in sets and not [set_ for set_ in sets if set_.startswith("%s:" % set_spec)]: + ## the record is not in the requested set, and is not + ## in any subset + record_exists_result = False - # sanity check: - if record_exists_result is not None: - _record_exists = record_exists_result + if record_exists_result: + status = None else: - _record_exists = record_exists(sysno) + status = 'deleted' - if not _record_exists: + if not record_exists_result and CFG_OAI_DELETED_POLICY not in ('persistent', 'transient'): return - if (format == "dc") or (format == "oai_dc"): - format = "xd" - - # print record opening tags: - - out = out + " \n" - - if _record_exists == -1: # Deleted? - if CFG_OAI_DELETED_POLICY == "persistent" or \ - CFG_OAI_DELETED_POLICY == "transient": - out = out + "
    \n" - else: - return - else: - out = out + "
    \n" - - for ident in get_field(sysno, CFG_OAI_ID_FIELD): - out = "%s %s\n" % (out, escape_space(ident)) - out = "%s %s\n" % (out, get_modification_date(sysno)) - for set in get_field(sysno, CFG_OAI_SET_FIELD): - if set: + idents = get_field(recid, CFG_OAI_ID_FIELD) + try: + assert idents, "No OAI ID for record %s, please do your checks!" % recid + except AssertionError, err: + register_exception(alert_admin=True) + return + try: + assert len(idents) == 1, "More than OAI ID found for recid %s. Considering only the first one, but please do your checks: %s" % (recid, idents) + except AssertionError, err: + register_exception(alert_admin=True) + ident = idents[0] + + header_body = EscapedXMLString('') + header_body += X.identifier()(ident) + header_body += X.datestamp()(get_modification_date(recid)) + for set_spec in get_field(recid, CFG_OAI_SET_FIELD): + if set_spec and set_spec != CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC: # Print only if field not empty - out = "%s %s\n" % (out, set) - out = out + "
    \n" + header_body += X.setSpec()(set_spec) + + header = X.header(status=status)(header_body) - if _record_exists == -1: # Deleted? - pass + if verb == 'ListIdentifiers': + return header else: - out = out + " \n" - - if format == "marcxml": - formatted_record = get_preformatted_record(sysno, 'xm') - if formatted_record is not None: - ## MARCXML is already preformatted. Adapt it if needed - formatted_record = formatted_record.replace("", "\n 00000coc 2200000uu 4500") - formatted_record = formatted_record.replace("", "\n 00000coc 2200000uu 4500") - formatted_record = formatted_record.replace("" - out = out + " 00000coc 2200000uu 4500" - out = "%s %d\n" % (out, int(sysno)) - - for digit1 in range(0, 10): - for digit2 in range(0, 10): - bibbx = "bib%d%dx" % (digit1, digit2) - bibx = "bibrec_bib%d%dx" % (digit1, digit2) - query = "SELECT b.tag,b.value,bb.field_number FROM %s AS b, %s AS bb "\ - "WHERE bb.id_bibrec=%%s AND b.id=bb.id_bibxxx AND b.tag LIKE %%s "\ - "ORDER BY bb.field_number, b.tag ASC" % (wash_table_column_name(bibbx), wash_table_column_name(bibx)) - res = run_sql(query, (sysno, str(digit1) + str(digit2) + "%")) - field_number_old = -999 - field_old = "" - for row in res: - field, value, field_number = row[0], row[1], row[2] - ind1, ind2 = field[3], field[4] - if ind1 == "_": - ind1 = " " - if ind2 == "_": - ind2 = " " - # print field tag - if field_number != field_number_old or field[:-1] != field_old[:-1]: - if format == "marcxml": - - if field_number_old != -999: - if field_old[0:2] == "00": - out = out + " \n" - else: - out = out + " \n" - - if field[0:2] == "00": - out = "%s \n" % (out, encode_for_xml(field[0:3])) - else: - out = "%s \n" % (out, encode_for_xml(field[0:3]), encode_for_xml(ind1).lower(), encode_for_xml(ind2).lower()) - - - field_number_old = field_number - field_old = field - # print subfield value - if format == "marcxml": - value = encode_for_xml(value) - - if(field[0:2] == "00"): - out = "%s %s\n" % (out, value) - else: - out = "%s %s\n" % (out, encode_for_xml(field[-1:]), value) - - - # fetch next subfield - # all fields/subfields printed in this run, so close the tag: - if (format == "marcxml") and field_number_old != -999: - if field_old[0:2] == "00": - out = out + " \n" - else: - out = out + " \n" - - out = out + " \n" - - elif format == "xd": - out += format_record(sysno, 'xoaidc') - - # print record closing tags: - - out = out + " \n" - - out = out + " \n" - - return out - -def oailistmetadataformats(args): - "Generates response to oailistmetadataformats verb." 
- - arg = parse_args(args) - - out = "" - - flag = 1 # list or not depending on identifier - - if arg['identifier'] != "": - - flag = 0 - - sysno = oaigetsysno(arg['identifier']) - _record_exists = record_exists(sysno) - if _record_exists == 1 or \ - (_record_exists == -1 and CFG_OAI_DELETED_POLICY != "no"): - - flag = 1 - + provenance = '' + rights_body = get_record_rights(recid) + if rights_body: + rights = X.about(body=rights_body) + else: + rights = '' else: + metadata = '' + provenance = '' + rights = '' + return X.record()(header, metadata, provenance, rights) - out = out + oai_error("idDoesNotExist","invalid record Identifier") - out = oai_error_header(args, "ListMetadataFormats") + out + oai_error_footer("ListMetadataFormats") - return out - - if flag: - out = out + " \n" - out = out + " oai_dc\n" - out = out + " http://www.openarchives.org/OAI/1.1/dc.xsd\n" - out = out + " http://purl.org/dc/elements/1.1/\n" - out = out + " \n" - out = out + " \n" - out = out + " marcxml\n" - out = out + " http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd\n" - out = out + " http://www.loc.gov/MARC21/slim\n" - out = out + " \n" - - out = oai_header(args, "ListMetadataFormats") + out + oai_footer("ListMetadataFormats") - return out - +def oai_list_metadata_formats(argd): + """Generates response to oai_list_metadata_formats verb.""" -def oailistrecords(args): - "Generates response to oailistrecords verb." - - arg = parse_args(args) + if argd.get('identifier'): + recid = oai_get_recid(argd['identifier']) + _record_exists = record_exists(recid) + if _record_exists != 1 and (_record_exists != -1 or CFG_OAI_DELETED_POLICY == "no"): + return oai_error(argd, [("idDoesNotExist", "invalid record Identifier: %s" % argd['identifier'])]) out = "" - resumptionToken_printed = False - - sysnos = [] - sysno = [] - # check if the resumptionToken did not expire - if arg['resumptionToken']: - filename = os.path.join(CFG_CACHEDIR, 'RTdata', arg['resumptionToken']) - if os.path.exists(filename) == 0: - out = oai_error("badResumptionToken", "ResumptionToken expired") - out = oai_error_header(args, "ListRecords") + out + oai_error_footer("ListRecords") - return out - - if arg['resumptionToken'] != "": - sysnos = oaicacheout(arg['resumptionToken']) - arg['metadataPrefix'] = sysnos.pop() + for prefix, (dummy, schema, namespace) in CFG_OAI_METADATA_FORMATS.items(): + out += X.metadataFormat()( + X.metadataPrefix(prefix), + X.schema(schema), + X.metadataNamespace(namespace) + ) + + return oai_header(argd, "ListMetadataFormats") + out + oai_footer("ListMetadataFormats") + +def oai_list_records_or_identifiers(req, argd): + """Generates response to oai_list_records verb.""" + + verb = argd['verb'] + resumption_token_was_specified = False + + # check if the resumption_token did not expire + if argd.get('resumptionToken'): + resumption_token_was_specified = True + try: + cache = oai_cache_load(argd['resumptionToken']) + last_recid = cache['last_recid'] + argd = cache['argd'] + complete_list = cache['complete_list'] + complete_list = filter_out_based_on_date_range(complete_list, argd.get('from', ''), argd.get('until', '')) + except Exception: + register_exception(alert_admin=True) + req.write(oai_error(argd, [("badResumptionToken", "ResumptionToken expired or invalid: %s" % argd['resumptionToken'])])) + return else: - sysnos = oaigetsysnolist(arg['set'], arg['from'], arg['until']) - - if len(sysnos) == 0: # noRecordsMatch error - - out = out + oai_error("noRecordsMatch", "no records correspond to the request") - out = 
oai_error_header(args, "ListRecords") + out + oai_error_footer("ListRecords") - return out - - i = 0 - for sysno_ in sysnos: - if sysno_: - if i >= CFG_OAI_LOAD: # cache or write? - if not resumptionToken_printed: # resumptionToken? - arg['resumptionToken'] = oaigenresumptionToken() - extdate = oaigetresponsedate(CFG_OAI_EXPIRE) - if extdate: - out = "%s %s\n" % (out, extdate, arg['resumptionToken']) - else: - out = "%s %s\n" % (out, arg['resumptionToken']) - resumptionToken_printed = True - sysno.append(sysno_) - else: - _record_exists = record_exists(sysno_) - if not (_record_exists == -1 and CFG_OAI_DELETED_POLICY == "no"): - #Produce output only if record exists and had to be printed - i = i + 1 # Increment limit only if record is returned - res = print_record(sysno_, arg['metadataPrefix'], _record_exists) - if res: - out += res - - if resumptionToken_printed: - oaicacheclean() - sysno.append(arg['metadataPrefix']) - oaicachein(arg['resumptionToken'], sysno) - - out = oai_header(args, "ListRecords") + out + oai_footer("ListRecords") - return out + last_recid = 0 + complete_list = oai_get_recid_list(argd.get('set', ""), argd.get('from', ""), argd.get('until', "")) -def oailistsets(args): - "Lists available sets for OAI metadata harvesting." + if not complete_list: # noRecordsMatch error + req.write(oai_error(argd, [("noRecordsMatch", "no records correspond to the request")])) + return + + cursor = 0 + for cursor, recid in enumerate(complete_list): + ## Let's fast-forward the cursor to point after the last recid that was + ## disseminated successfully + if recid > last_recid: + break + + req.write(oai_header(argd, verb)) + for recid in list(complete_list)[cursor:cursor+CFG_OAI_LOAD]: + req.write(print_record(recid, argd['metadataPrefix'], verb=verb, set_spec=argd.get('set'))) + + if list(complete_list)[cursor+CFG_OAI_LOAD:]: + resumption_token = oai_generate_resumption_token(argd.get('set', '')) + cache = { + 'argd': argd, + 'last_recid': recid, + 'complete_list': complete_list.fastdump(), + } + oai_cache_dump(resumption_token, cache) + expdate = oai_get_response_date(CFG_OAI_EXPIRE) + req.write(X.resumptionToken(expirationDate=expdate, cursor=cursor, completeListSize=len(complete_list))(resumption_token)) + elif resumption_token_was_specified: + ## Since a resumptionToken was used we shall put a last empty resumptionToken + req.write(X.resumptionToken(cursor=cursor, completeListSize=len(complete_list))("")) + req.write(oai_footer(verb)) + oai_cache_gc() + +def oai_list_sets(argd): + """ + Lists available sets for OAI metadata harvesting. + """ out = "" # note: no flow control in ListSets - - sets = get_sets() - + sets = get_all_sets().values() + if not sets: + return oai_error(argd, [("noSetHierarchy", "No sets have been configured for this repository")]) for set_ in sets: - - out = out + " \n" - out = "%s %s\n" % (out, set_[0]) - out = "%s %s\n" % (out, set_[1]) + out += " \n" + out += X.setSpec()(set_[0]) + X.setName()(set_[1]) if set_[2]: - out = "%s %s\n" % (out, set_[2]) + out += X.setDescription()(set_[2]) out = out + " \n" - out = oai_header(args, "ListSets") + out + oai_footer("ListSets") - - return out + return oai_header(argd, "ListSets") + out + oai_footer("ListSets") -def oaigetrecord(args): +def oai_get_record(argd): """Returns record 'identifier' according to 'metadataPrefix' format for OAI metadata harvesting. - if record does not exist, return oai_error 'idDoesNotExist'. 
- if record has been deleted and CFG_OAI_DELETED_POLICY is 'transient' or 'deleted', then return only header, with status 'deleted'. - if record has been deleted and CFG_OAI_DELETED_POLICY is 'no', then return oai_error 'idDoesNotExist'. """ - arg = parse_args(args) - out = "" - sysno = oaigetsysno(arg['identifier']) - _record_exists = record_exists(sysno) + recid = oai_get_recid(argd['identifier']) + _record_exists = record_exists(recid) if _record_exists == 1 or \ (_record_exists == -1 and CFG_OAI_DELETED_POLICY != 'no'): - out = print_record(sysno, arg['metadataPrefix'], _record_exists) - out = oai_header(args, "GetRecord") + out + oai_footer("GetRecord") - else: - out = oai_error("idDoesNotExist", "invalid record Identifier") - out = oai_error_header(args, "GetRecord") + out + oai_error_footer("GetRecord") - return out - -def oailistidentifiers(args): - "Prints OAI response to the ListIdentifiers verb." - - arg = parse_args(args) - - out = "" - resumptionToken_printed = False - - sysno = [] - sysnos = [] - - if arg['resumptionToken']: - filename = os.path.join(CFG_CACHEDIR, 'RTdata', arg['resumptionToken']) - if os.path.exists(filename) == 0: - out = out + oai_error("badResumptionToken", "ResumptionToken expired") - out = oai_error_header(args, "ListIdentifiers") + out + oai_error_footer("ListIdentifiers") - return out - - if arg['resumptionToken']: - sysnos = oaicacheout(arg['resumptionToken']) + out = print_record(recid, argd['metadataPrefix'], _record_exists) + out = oai_header(argd, "GetRecord") + out + oai_footer("GetRecord") else: - sysnos = oaigetsysnolist(arg['set'], arg['from'], arg['until']) - - if len(sysnos) == 0: # noRecordsMatch error - out = out + oai_error("noRecordsMatch", "no records correspond to the request") - out = oai_error_header(args, "ListIdentifiers") + out + oai_error_footer("ListIdentifiers") - return out - - i = 0 - for sysno_ in sysnos: - if sysno_: - if i >= CFG_OAI_LOAD: # cache or write? - if not resumptionToken_printed: # resumptionToken? - arg['resumptionToken'] = oaigenresumptionToken() - extdate = oaigetresponsedate(CFG_OAI_EXPIRE) - if extdate: - out = "%s %s\n" % (out, extdate, arg['resumptionToken']) - else: - out = "%s %s\n" % (out, arg['resumptionToken']) - resumptionToken_printed = True - sysno.append(sysno_) - else: - _record_exists = record_exists(sysno_) - if (not _record_exists == -1 and CFG_OAI_DELETED_POLICY == "no"): - i = i + 1 # Increment limit only if record is returned - for ident in get_field(sysno_, CFG_OAI_ID_FIELD): - if ident != '': - if _record_exists == -1: #Deleted? - if CFG_OAI_DELETED_POLICY == "persistent" \ - or CFG_OAI_DELETED_POLICY == "transient": - out = out + "
    \n" - else: - # In that case, print nothing (do not go further) - break - else: - out = out + "
    \n" - out = "%s %s\n" % (out, escape_space(ident)) - out = "%s %s\n" % (out, get_modification_date(oaigetsysno(ident))) - for set in get_field(sysno_, CFG_OAI_SET_FIELD): - if set: - # Print only if field not empty - out = "%s %s\n" % (out, set) - out = out + "
    \n" - - if resumptionToken_printed: - oaicacheclean() # clean cache from expired resumptionTokens - oaicachein(arg['resumptionToken'], sysno) - - out = oai_header(args, "ListIdentifiers") + out + oai_footer("ListIdentifiers") - + return oai_error(argd, [("idDoesNotExist", "invalid record Identifier: %s" % argd['identifier'])]) return out -def oaiidentify(args, script_url): - """Generates a response to oaiidentify verb. - Parameters: - args - *dict* query parameters +def oai_identify(argd): + """Generates a response to oai_identify verb. script_url - *str* URL of the script used to access the service. This is made necessary since the gateway can be accessed either via /oai2d or /oai2d/ (or for backward compatibility: oai2d.py or oai2d.py/), and that the base URL must be returned in the Identify response """ - out = """ %(CFG_SITE_NAME)s - %(CFG_SITE_URL)s%(script_url)s - 2.0 - %(CFG_SITE_SUPPORT_EMAIL)s - %(earliest_datestamp)s - %(CFG_OAI_DELETED_POLICY)s - %(granularity)s - %(compression)s - %(CFG_OAI_IDENTIFY_DESCRIPTION)s\n""" % \ - {"CFG_SITE_NAME": cgi.escape(CFG_SITE_NAME), - "CFG_SITE_URL": cgi.escape(CFG_SITE_URL), - "earliest_datestamp": cgi.escape(get_earliest_datestamp()), - "granularity": "YYYY-MM-DDThh:mm:ssZ", - "CFG_OAI_DELETED_POLICY": cgi.escape(CFG_OAI_DELETED_POLICY), - "CFG_SITE_SUPPORT_EMAIL": cgi.escape(CFG_SITE_SUPPORT_EMAIL), - "CFG_OAI_IDENTIFY_DESCRIPTION": CFG_OAI_IDENTIFY_DESCRIPTION, - "compression": CFG_WEBSTYLE_HTTP_USE_COMPRESSION and "deflate" or "", - "script_url": script_url} - - out = oai_header(args, "Identify") + out + oai_footer("Identify") + out = X.repositoryName()(CFG_SITE_NAME) + out += X.baseURL()(CFG_SITE_URL + '/oai2d') + out += X.protocolVersion()("2.0") + out += X.adminEmail()(CFG_SITE_SUPPORT_EMAIL) + out += X.earliestDatestamp()(get_earliest_datestamp()) + out += X.deletedRecord()(CFG_OAI_DELETED_POLICY) + out += X.granularity()("YYYY-MM-DDThh:mm:ssZ") + if CFG_WEBSTYLE_HTTP_USE_COMPRESSION: + out += X.compression()('deflate') + out += X.description("""""" + + X.scheme()("oai") + + X.repositoryIdentifier()(CFG_OAI_ID_PREFIX) + + X.delimiter()(":") + + X.sampleIdentifier()(CFG_OAI_SAMPLE_IDENTIFIER) + + """""") + out += CFG_OAI_IDENTIFY_DESCRIPTION + if CFG_OAI_FRIENDS: + friends = """""" + for baseurl in CFG_OAI_FRIENDS: + friends += X.baseURL()(baseurl) + friends += """""" + out += X.description(friends) + + out = oai_header(argd, "Identify") + out + oai_footer("Identify") return out -def oaigetrequesturl(args): - "Generates requesturl tag for OAI." - - # re_amp = re.compile('&') +def get_utc_now(): + """ + Return current UTC time in the OAI-PMH format. + """ + return datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ') - requesturl = CFG_SITE_URL + "/" + "oai2d/"# + "?" + re_amp.sub("&", args) +def oai_build_request_element(argd=None): + """ + Build the request tag. + """ + if argd is None: + argd = {} + return X.responseDate()(get_utc_now()) + X.request(**argd)("%s/oai2d" % CFG_SITE_URL) +def oai_get_request_url(): + """Generates requesturl tag for OAI.""" + requesturl = CFG_SITE_URL + "/oai2d" return requesturl -def oaigetresponsedate(delay=0): - "Generates responseDate tag for OAI." 
- +def oai_get_response_date(delay=0): + """Generates responseDate tag for OAI.""" return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(time.time() + delay)) - -def oai_error(code, msg): - "OAI error occured" - - return "%s\n" % (code, msg) - - -def oaigetsysno(identifier): - "Returns the first database BIB ID for the OAI identifier 'identifier', if it exists." - sysno = None +def oai_get_recid(identifier): + """Returns the first database BIB ID for the OAI identifier 'identifier', if it exists.""" + recid = None if identifier: query = "SELECT DISTINCT(bb.id_bibrec) FROM bib%sx AS bx, bibrec_bib%sx AS bb WHERE bx.tag=%%s AND bb.id_bibxxx=bx.id AND bx.value=%%s" % (CFG_OAI_ID_FIELD[0:2], CFG_OAI_ID_FIELD[0:2]) res = run_sql(query, (CFG_OAI_ID_FIELD, identifier)) for row in res: - sysno = row[0] - return sysno - - -def oaigetsysnolist(set="", fromdate="", untildate=""): - "Returns list of system numbers for the OAI set 'set', modified from 'fromdate' until 'untildate'." - from invenio.oai_repository_updater import get_set_definitions + recid = row[0] + return recid +def filter_out_based_on_date_range(recids, fromdate="", untildate=""): + """ Filter out recids based on date range.""" if fromdate != "": fromdate = normalize_date(fromdate, "T00:00:00Z") else: fromdate = get_earliest_datestamp() + fromdate = utc_to_localtime(fromdate) if untildate != "": untildate = normalize_date(untildate, "T23:59:59Z") else: untildate = get_latest_datestamp() - collections = [] - for set_definition in get_set_definitions(set): - collections.extend(coll.strip() for coll in set_definition['c'].split(',')) - recids = perform_request_search(f1=CFG_OAI_ID_FIELD, p1="oai:*", m1="e", op1='a', - f2=((set and CFG_OAI_SET_FIELD) or ""), p2=set, m2="e", - d1=utc_to_localtime(fromdate), - d2=utc_to_localtime(untildate), - c=collections, - dt='m', - ap=0) - ## Let's discard non public records - return list(intbitset(recids) - get_all_restricted_recids()) - -def oaigenresumptionToken(): - "Generates unique ID for resumption token management." - - return md5(str(time.time())).hexdigest() - - -def oaicachein(resumptionToken, sysnos): - "Stores or adds sysnos in cache. Input is a string of sysnos separated by commas." + untildate = utc_to_localtime(untildate) - filename = os.path.join(CFG_CACHEDIR, 'RTdata', resumptionToken) + recids = intbitset(recids) ## Let's clone :-) - fil = open(filename, "w") - cPickle.dump(sysnos, fil) - fil.close() - return 1 + if fromdate and untildate: + recids &= intbitset(run_sql("SELECT id FROM bibrec WHERE modification_date BETWEEN %s AND %s", (fromdate, untildate))) + elif fromdate: + recids &= intbitset(run_sql("SELECT id FROM bibrec WHERE modification_date >= %s", (fromdate, ))) + elif untildate: + recids &= intbitset(run_sql("SELECT id FROM bibrec WHERE modification_date <= %s", (untildate, ))) + return recids - get_all_restricted_recids() - -def oaicacheout(resumptionToken): - "Restores string of comma-separated system numbers from cache." - - sysnos = [] - - filename = os.path.join(CFG_CACHEDIR, 'RTdata', resumptionToken) - - if oaicachestatus(resumptionToken): - fil = open(filename, "r") - sysnos = cPickle.load(fil) - fil.close() +def oai_get_recid_list(set_spec="", fromdate="", untildate=""): + """ + Returns list of recids for the OAI set 'set', modified from 'fromdate' until 'untildate'. 
+ """ + ret = intbitset() + if not set_spec: + ret |= search_unit_in_bibxxx(p='*', f=CFG_OAI_SET_FIELD, type='e') + if CFG_OAI_DELETED_POLICY != 'no': + ret |= search_unit_in_bibxxx(p='*', f=CFG_OAI_PREVIOUS_SET_FIELD, type='e') else: - return 0 - return sysnos - - -def oaicacheclean(): - "Removes cached resumptionTokens older than specified" - - directory = os.path.join(CFG_CACHEDIR, 'RTdata') + ret |= search_unit_in_bibxxx(p=set_spec, f=CFG_OAI_SET_FIELD, type='e') + ret |= search_unit_in_bibxxx(p='%s:*' % set_spec, f=CFG_OAI_SET_FIELD, type='e') + if CFG_OAI_DELETED_POLICY != 'no': + ret |= search_unit_in_bibxxx(p=set_spec, f=CFG_OAI_PREVIOUS_SET_FIELD, type='e') + ret |= search_unit_in_bibxxx(p='%s:*' % set_spec, f=CFG_OAI_PREVIOUS_SET_FIELD, type='e') + if CFG_OAI_DELETED_POLICY == 'no': + ret -= search_unit_in_bibxxx(p='DELETED', f='980__%', type='e') + if CFG_CERN_SITE: + ret -= search_unit_in_bibxxx(p='DUMMY', f='980__%', type='e') + return filter_out_based_on_date_range(ret, fromdate, untildate) + +def oai_generate_resumption_token(set_spec): + """Generates unique ID for resumption token management.""" + fd, name = tempfile.mkstemp(dir=os.path.join(CFG_CACHEDIR, 'RTdata'), prefix='%s___' % set_spec) + os.close(fd) + return os.path.basename(name) + +def oai_delete_resumption_tokens_for_set(set_spec): + """ + In case a set is modified by the admin interface, this will delete + any resumption token that is now invalid. + """ + aset = set_spec + while aset: + for name in iglob(os.path.join(CFG_CACHEDIR, 'RTdata', '%s___*' % set_spec)): + os.remove(name) + aset = aset.rsplit(":", 1)[0] + for name in iglob(os.path.join(CFG_CACHEDIR, 'RTdata', '___*')): + os.remove(name) + +def oai_cache_dump(resumption_token, cache): + """ + Given a resumption_token and the cache, stores the cache. + """ + cPickle.dump(cache, open(os.path.join(CFG_CACHEDIR, 'RTdata', resumption_token), 'w'), -1) - files = os.listdir(directory) +def oai_cache_load(resumption_token): + """ + Restores the cache from the resumption_token. + """ + fullpath = os.path.join(CFG_CACHEDIR, 'RTdata', resumption_token) + if os.path.dirname(os.path.abspath(fullpath)) != os.path.abspath(os.path.join(CFG_CACHEDIR, 'RTdata')): + raise ValueError("Invalid path") + return cPickle.load(open(fullpath)) - for file_ in files: - filename = os.path.join(directory, file_) +def oai_cache_gc(): + """ + OAI Cache Garbage Collector. + """ + for file_ in os.listdir(os.path.join(CFG_CACHEDIR, 'RTdata')): + filename = os.path.join(os.path.join(CFG_CACHEDIR, 'RTdata', file_)) # cache entry expires when not modified during a specified period of time if ((time.time() - os.path.getmtime(filename)) > CFG_OAI_EXPIRE): try: os.remove(filename) except OSError, e: # Most probably the cache was already deleted pass - return 1 - - -def oaicachestatus(resumptionToken): - "Checks cache status. Returns 0 for empty, 1 for full." - filename = os.path.join(CFG_CACHEDIR, 'RTdata', resumptionToken) - - if os.path.exists(filename): - if os.path.getsize(filename) > 0: - return 1 - else: - return 0 - else: - return 0 - - -def get_sets(): - "Returns list of sets." - # TODO: Try to remove dependency on oaiREPOSITORY table, by - # determining available sets from data. - - out = {} - row = ['', ''] - - query = "SELECT setSpec,setName,setDescription FROM oaiREPOSITORY" - res = run_sql(query) +def get_all_sets(): + """ + Return all the sets. 
+ """ + res = run_sql("SELECT setSpec, setName, setDescription FROM oaiREPOSITORY") + ret = {} for row in res: - row_bis = [row[0], row[1], row[2]] - out[row[0]] = row_bis - - return out.values() + ret[row[0]] = row + ## Let's expand with all the set that exist in the DB + for a_set in get_all_field_values(CFG_OAI_SET_FIELD): + if a_set not in ret: + ret[a_set] = (a_set, a_set, '') -def parse_args(args=""): - "Parse input args" + ## Let's expand with all the supersets + for a_set in ret.keys(): + while ':' in a_set: + try: + a_set = a_set.rsplit(":", 1)[0] + except AttributeError: + a_set = ':'.join(a_set.split(":")[:-1]) + if a_set not in ret: + ret[a_set] = (a_set, a_set, '') - out_args = { - "verb" : "", - "metadataPrefix" : "", - "from" : "", - "until" : "", - "set" : "", - "identifier" : "", - "resumptionToken" : "" - } + if CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC in ret: + ## Let's remove the special global set + del ret[CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC] - if args == "" or args is None: - pass - else: - - list_of_arguments = args.split('&') - - for item in list_of_arguments: - keyvalue = item.split('=') - if len(keyvalue) == 2: - if (out_args.has_key(keyvalue[0])): - if(out_args[keyvalue[0]] != ""): - out_args[keyvalue[0]] = "Error" - else: - out_args[keyvalue[0]] = urllib.unquote(keyvalue[1]) - else: - out_args[keyvalue[0]] = urllib.unquote(keyvalue[1]) - else: - out_args['verb'] = "" + if '' in ret: + ## '' is not a valid setSpec but might be in the MARC + del ret[''] - return out_args + return ret -def check_argd(arguments): +def check_argd(argd): """ Check OAI arguments Also transform them from lists to strings. """ - - out = "" - -## no several times the same argument -# -# - for param, value in arguments.iteritems(): - if len(value) > 1 and not 'The request includes illegal arguments' in out: - out = out + oai_error("badArgument", "The request includes illegal arguments") - bad_arguments_error = True + errors = [] + + ## no several times the same argument + bad_arguments_error = False + for param, value in argd.iteritems(): + if len(value) > 1 and not bad_arguments_error: + errors.append(("badArgument", "More than one value specified for the %s argument: %s" % (param, value))) + bad_arguments_error = True ## This is needed only once if len(value) > 0: - arguments[param] = value[0] + argd[param] = value[0] else: - arguments[param] = '' + argd[param] = '' -## principal argument required -# -# - if verbs.has_key(arguments['verb']): - pass - else: - out = out + oai_error("badVerb", "Illegal OAI verb") - -## defined args -# -# - for param in arguments.keys(): - if not param in verbs.get(arguments['verb'], []) and param != 'verb' \ - and not 'The request includes illegal arguments' in out: - out = out + oai_error("badArgument", "The request includes illegal arguments") + ## principal argument required + if argd['verb'] not in CFG_VERBS: + errors.append(("badVerb", "Illegal OAI verb: %s" % argd['verb'])) + + ## defined argd + for param in argd.keys(): + if not param in CFG_VERBS.get(argd['verb'], []) and param != 'verb' \ + and not bad_arguments_error: + errors.append(("badArgument", "The request includes illegal arguments for the given verb: %s" % param)) + bad_arguments_error = True break # Indicate only once -## resumptionToken exclusive -# -# - if arguments.get('resumptionToken', '') != "" and \ - len(arguments.keys()) != 2 and \ - not 'The request includes illegal arguments' in out: - out = out + oai_error("badArgument", - "The request includes illegal arguments") - -## 
resumptionToken not empty when defined -# -# - if arguments.get('resumptionToken', None) == '': - out = out + oai_error("badResumptionToken", - "ResumptionToken invalid") - -## datestamp formats -# -# - if arguments.has_key('from') and \ - 'from' in verbs.get(arguments['verb'], []): - from_length = len(arguments['from']) - if check_date(arguments['from']) == "": - out = out + oai_error("badArgument", - "Bad datestamp format in from") + ## resumptionToken exclusive + if argd.get('resumptionToken', '') != "" and \ + len(argd.keys()) != 2 and not bad_arguments_error: + errors.append(("badArgument", "The resumptionToken was specified together with other arguments")) + bad_arguments_error = True + + if argd.get('resumptionToken', None) == '': + errors.append(("badResumptionToken", "ResumptionToken invalid: %s" % argd.get('resumptionToken', None))) + + ## datestamp formats + if argd.has_key('from') and \ + 'from' in CFG_VERBS.get(argd['verb'], []): + from_length = len(argd['from']) + if check_date(argd['from']) == "": + errors.append(("badArgument", "Bad datestamp format in from: %s" % argd['from'])) else: from_length = 0 - if arguments.has_key('until') and \ - 'until' in verbs.get(arguments['verb'], []): - until_length = len(arguments['until']) - if check_date(arguments['until']) == "": - out = out + oai_error("badArgument", - "Bad datestamp format in until") + if argd.has_key('until') and \ + 'until' in CFG_VERBS.get(argd['verb'], []): + until_length = len(argd['until']) + if check_date(argd['until']) == "": + errors.append(("badArgument", "Bad datestamp format in until: %s" % argd['until'])) else: until_length = 0 if from_length != 0: if until_length != 0: if from_length != until_length: - out = out + oai_error("badArgument", - "Bad datestamp format") - - if arguments.has_key('from') and arguments.has_key('until') \ - and arguments['from'] > arguments['until'] and \ - 'from' in verbs.get(arguments['verb'], []) and \ - 'until' in verbs.get(arguments['verb'], []): - out = out + oai_error("badArgument", "Wrong date") - -## Identify exclusive -# -# - if arguments['verb'] == "Identify" and \ - len(arguments.keys()) != 1: - if not 'The request includes illegal arguments' in out: # Do not repeat this error - out = out + oai_error("badArgument", - "The request includes illegal arguments") - -## parameters for GetRecord -# -# - if arguments['verb'] == "GetRecord" and \ - not arguments.has_key('identifier'): - out = out + oai_error("badArgument", - "Record identifier missing") - - if arguments['verb'] == "GetRecord" and \ - not arguments.has_key('metadataPrefix'): - out = out + oai_error("badArgument", - "Missing metadataPrefix") - -## parameters for ListRecords and ListIdentifiers -# -# - if (arguments['verb'] == "ListRecords" or arguments['verb'] == "ListIdentifiers") and \ - (not arguments.has_key('resumptionToken') and not arguments.has_key('metadataPrefix')): - out = out + oai_error("badArgument", "Missing metadataPrefix") - -## Metadata prefix defined and valid -# -# - if arguments.has_key('metadataPrefix') and \ - not arguments['metadataPrefix'] in params['metadataPrefix']: - out = out + oai_error("cannotDisseminateFormat", "Chosen format is not supported") + errors.append(("badArgument", "From and until have two different formats: %s Vs. 
%s" % (from_length, until_length))) + + if argd.has_key('from') and argd.has_key('until') \ + and argd['from'] > argd['until'] and \ + 'from' in CFG_VERBS.get(argd['verb'], []) and \ + 'until' in CFG_VERBS.get(argd['verb'], []): + errors.append(("badArgument", "from argument comes after until argument: %s > %s" % (argd['from'], argd['until']))) + + ## Identify exclusive + if argd['verb'] == "Identify" and \ + len(argd.keys()) != 1: + if not bad_arguments_error: # Do not repeat this error + errors.append(("badArgument", "The request includes illegal arguments")) + bad_arguments_error = True - return out + ## parameters for GetRecord + if argd['verb'] == "GetRecord" and \ + not argd.has_key('identifier'): + errors.append(("badArgument", "Record identifier missing")) + + if argd['verb'] == "GetRecord" and \ + not argd.has_key('metadataPrefix'): + errors.append(("badArgument", "Missing metadataPrefix")) + + ## parameters for ListRecords and ListIdentifiers + if (argd['verb'] == "ListRecords" or argd['verb'] == "ListIdentifiers") and \ + (not argd.has_key('resumptionToken') and not argd.has_key('metadataPrefix')): + errors.append(("badArgument", "Missing metadataPrefix")) + + ## Metadata prefix defined and valid + if argd.has_key('metadataPrefix') and \ + not argd['metadataPrefix'] in CFG_OAI_METADATA_FORMATS: + errors.append(("cannotDisseminateFormat", "Chosen format is not supported. Valid formats are: %s" % ', '.join(CFG_OAI_METADATA_FORMATS.keys()))) + + return errors def oai_profile(): """ Runs a benchmark """ - oailistrecords('set=&from=&metadataPrefix=oai_dc&verb=ListRecords&resumptionToken=&identifier=&until=') - #oailistrecords('set=&from=&metadataPrefix=marcxml&verb=ListRecords&resumptionToken=&identifier=&until=') - #oailistidentifiers('set=&from=&metadataPrefix=oai_dc&verb=ListIdentifiers&resumptionToken=&identifier=&until=') - + from cStringIO import StringIO + oai_list_records_or_identifiers(StringIO(), argd={"metadataPrefix": "oai_dc", "verb": "ListRecords"}) + oai_list_records_or_identifiers(StringIO(), argd={"metadataPrefix": "marcxml", "verb" :"ListRecords"}) + oai_list_records_or_identifiers(StringIO(), argd={"metadataPrefix": "oai_dc", "verb": "ListIdentifiers"}) return if __name__ == "__main__": import profile import pstats profile.run('oai_profile()', "oai_profile") p = pstats.Stats("oai_profile") p.strip_dirs().sort_stats("cumulative").print_stats() diff --git a/modules/bibharvest/lib/oai_repository_tests.py b/modules/bibharvest/lib/oai_repository_tests.py index 5c34c4523..04eb0ccf7 100644 --- a/modules/bibharvest/lib/oai_repository_tests.py +++ b/modules/bibharvest/lib/oai_repository_tests.py @@ -1,94 +1,87 @@ # -*- coding: utf-8 -*- ## Invenio OAI repository unit tests. ## ## This file is part of Invenio. ## Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
"""Unit tests for the oai repository.""" __revision__ = "$Id$" import unittest import re +from cStringIO import StringIO + from invenio import oai_repository_server from invenio.testutils import make_test_suite, run_test_suite class TestVerbs(unittest.TestCase): """Test for OAI verb functionality.""" def test_verbs(self): """oairepository - testing verbs""" - self.assertNotEqual(None, re.search("Identify", oai_repository_server.oaiidentify("", None))) - self.assertNotEqual(None, re.search("ListIdentifiers", oai_repository_server.oailistidentifiers(""))) - self.assertNotEqual(None, re.search("ListRecords", oai_repository_server.oailistrecords(""))) - self.assertNotEqual(None, re.search("ListMetadataFormats", oai_repository_server.oailistmetadataformats(""))) - self.assertNotEqual(None, re.search("ListSets", oai_repository_server.oailistsets(""))) - self.assertNotEqual(None, re.search("GetRecord", oai_repository_server.oaigetrecord(""))) + self.assertNotEqual(None, re.search("Identify", oai_repository_server.oai_identify({'verb': 'Identify'}))) + ret = StringIO() + oai_repository_server.oai_list_records_or_identifiers(ret, {'verb': 'ListIdentifiers', 'metadataPrefix': 'marcxml'}) + self.assertNotEqual(None, re.search("ListIdentifiers", ret.getvalue())) + ret = StringIO() + oai_repository_server.oai_list_records_or_identifiers(ret, {'verb': 'ListRecords', 'metadataPrefix': 'marcxml'}) + self.assertNotEqual(None, re.search("ListRecords", ret.getvalue())) + self.assertNotEqual(None, re.search("ListMetadataFormats", oai_repository_server.oai_list_metadata_formats({'verb': 'ListMetadataFormats'}))) + self.assertNotEqual(None, re.search("ListSets", oai_repository_server.oai_list_sets({'verb': 'ListSets'}))) + self.assertNotEqual(None, re.search("GetRecord", oai_repository_server.oai_get_record({'identifier': 'oai:atlantis.cern.ch:1', 'verb': 'GetRecord'}))) class TestErrorCodes(unittest.TestCase): """Test for handling OAI error codes.""" def test_issue_error_identify(self): """oairepository - testing error codes""" - self.assertNotEqual(None, re.search("badVerb", oai_repository_server.check_argd({'verb':"IllegalVerb"}))) - self.assertNotEqual(None, re.search("badArgument", oai_repository_server.check_argd({'verb':"Identify", - 'test':"test"}))) - self.assertNotEqual(None, re.search("badArgument", oai_repository_server.check_argd({'verb':"ListIdentifiers", + self.assertNotEqual([], [code for (code, dummy_text) in oai_repository_server.check_argd({'verb':"IllegalVerb"}) if code == 'badVerb']) + self.assertNotEqual([], [code for (code, dummy_text) in oai_repository_server.check_argd({'verb':"Identify", + 'test':"test"}) if code == 'badArgument']) + self.assertNotEqual([], [code for (code, dummy_text) in oai_repository_server.check_argd({'verb':"ListIdentifiers", 'metadataPrefix':"oai_dc", 'from':"some_random_date", - 'until':"some_random_date"}))) - self.assertNotEqual(None, re.search("badArgument", oai_repository_server.check_argd({'verb':"ListIdentifiers", + 'until':"some_random_date"}) if code == 'badArgument']) + self.assertNotEqual([], [code for (code, dummy_text) in oai_repository_server.check_argd({'verb':"ListIdentifiers", 'metadataPrefix':"oai_dc", 'from':"2001-01-01", - 'until':"2002-01-01T00:00:00Z"}))) - self.assertNotEqual(None, re.search("badArgument", oai_repository_server.check_argd({'verb':"ListIdentifiers"}))) - self.assertNotEqual(None, re.search("cannotDisseminateFormat", oai_repository_server.check_argd({'verb':"ListIdentifiers", - 'metadataPrefix':"illegal_mdp"}))) + 
'until':"2002-01-01T00:00:00Z"}) if code == 'badArgument']) + self.assertNotEqual([], [code for (code, dummy_text) in oai_repository_server.check_argd({'verb':"ListIdentifiers"}) if code == 'badArgument']) + self.assertNotEqual([], [code for (code, dummy_text) in oai_repository_server.check_argd({'verb':"ListIdentifiers", + 'metadataPrefix':"illegal_mdp"}) if code == 'cannotDisseminateFormat']) - self.assertNotEqual(None, re.search("badArgument", oai_repository_server.check_argd({'verb':"ListIdentifiers", + self.assertNotEqual([], [code for (code, dummy_text) in oai_repository_server.check_argd({'verb':"ListIdentifiers", 'metadataPrefix':"oai_dc", - 'metadataPrefix':"oai_dc"}))) - self.assertNotEqual(None, re.search("badArgument", oai_repository_server.check_argd({'verb':"ListRecords", + 'metadataPrefix':"oai_dc"}) if code == 'badArgument']) + self.assertNotEqual([], [code for (code, dummy_text) in oai_repository_server.check_argd({'verb':"ListRecords", 'metadataPrefix':"oai_dc", 'set':"really_wrong_set", 'from':"some_random_date", - 'until':"some_random_date"}))) - self.assertNotEqual(None, re.search("badArgument", oai_repository_server.check_argd({'verb':"ListRecords"}))) - - self.assertNotEqual(None, re.search("badResumptionToken", oai_repository_server.check_argd({'verb': 'ListRecords', 'resumptionToken': ''}))) - -class TestEncodings(unittest.TestCase): - """Test for OAI response encodings.""" - - def test_encoding(self): - """oairepository - testing encodings""" - - self.assertEqual("<&>", oai_repository_server.encode_for_xml("<&>")) - self.assertEqual("%20", oai_repository_server.escape_space(" ")) - self.assertEqual("%25%20%3F%23%3D%26%2F%3A%3B%2B", oai_repository_server.encode_for_url("% ?#=&/:;+")) - + 'until':"some_random_date"}) if code == 'badArgument']) + self.assertNotEqual([], [code for (code, dummy_text) in oai_repository_server.check_argd({'verb':"ListRecords"}) if code == 'badArgument']) + self.assertNotEqual([], [code for (code, dummy_text) in oai_repository_server.check_argd({'verb': 'ListRecords', 'resumptionToken': ''}) if code == 'badResumptionToken']) TEST_SUITE = make_test_suite(TestVerbs, - TestErrorCodes, - TestEncodings,) + TestErrorCodes) if __name__ == "__main__": run_test_suite(TEST_SUITE) diff --git a/modules/bibharvest/lib/oai_repository_updater.py b/modules/bibharvest/lib/oai_repository_updater.py index c3110d7ce..52796dabb 100644 --- a/modules/bibharvest/lib/oai_repository_updater.py +++ b/modules/bibharvest/lib/oai_repository_updater.py @@ -1,543 +1,512 @@ ## This file is part of Invenio. ## Copyright (C) 2009, 2010, 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
"""OAI Repository administration tool - Updates the metadata of the records to include OAI identifiers and OAI SetSpec according to the settings defined in OAI Repository admin interface """ -__revision__ = "$Id$" - import os import sys import time if sys.hexversion < 0x2040000: # pylint: disable=W0622 from sets import Set as set # pylint: enable=W0622 from tempfile import mkstemp +from pprint import pformat from invenio.config import \ CFG_OAI_ID_FIELD, \ CFG_OAI_ID_PREFIX, \ CFG_OAI_SET_FIELD, \ - CFG_BINDIR, \ + CFG_OAI_PREVIOUS_SET_FIELD, \ CFG_SITE_NAME, \ CFG_TMPDIR -from invenio.search_engine import perform_request_search, get_record -from invenio.search_engine_utils import get_fieldvalues -from invenio.intbitset import intbitset as HitSet +from invenio.oai_repository_config import CFG_OAI_REPOSITORY_MARCXML_SIZE, \ + CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC +from invenio.search_engine import perform_request_search, get_record, search_unit_in_bibxxx +from invenio.intbitset import intbitset from invenio.dbquery import run_sql from invenio.bibtask import \ task_get_option, \ task_set_option, \ write_message, \ task_update_progress, \ task_init, \ - task_sleep_now_if_required + task_sleep_now_if_required, \ + task_low_level_submission from invenio.bibrecord import \ - record_delete_subfield, \ - field_xml_output - -DATAFIELD_SET_HEAD = \ - "" % \ - (CFG_OAI_SET_FIELD[0:3], - CFG_OAI_SET_FIELD[3:4].replace('_', ' '), - CFG_OAI_SET_FIELD[4:5].replace('_', ' ')) -DATAFIELD_ID_HEAD = \ - "" % \ - (CFG_OAI_ID_FIELD[0:3], - CFG_OAI_ID_FIELD[3:4].replace('_', ' '), - CFG_OAI_ID_FIELD[4:5].replace('_', ' ')) + record_get_field_value, \ + record_get_field_values, \ + record_add_field, \ + record_xml_output def get_set_definitions(set_spec): """ Retrieve set definitions from oaiREPOSITORY table. The set definitions are the search patterns that define the records which are in the set """ set_definitions = [] query = "select setName, setDefinition from oaiREPOSITORY where setSpec=%s" res = run_sql(query, (set_spec, )) for (set_name, set_definition) in res: params = parse_set_definition(set_definition) params['setSpec'] = set_spec params['setName'] = set_name + set_definitions.append(params) return set_definitions def parse_set_definition(set_definition): """ Returns the parameters for the given set definition. The returned structure is a dictionary with keys being c, p1, f1, m1, p2, f2, m2, p3, f3, m3 and corresponding values @param set_definition: a string as returned by the database for column 'setDefinition' @return: a dictionary """ params = {'c':'', 'p1':'', 'f1':'', 'm1':'', 'p2':'', 'f2':'', 'm2':'', 'p3':'', 'f3':'', 'm3':'', 'op1':'a', 'op2':'a'} definitions = set_definition.split(';') for definition in definitions: arguments = definition.split('=') if len(arguments) == 2: params[arguments[0]] = arguments[1] return params def all_set_specs(): """ Returns the list of (distinct) setSpecs defined in the settings. This also include the "empty" setSpec if any setting uses it. Note: there can be several times the same setSpec in the settings, given that a setSpec might be defined by several search queries. 
Here we return distinct values """ query = "SELECT DISTINCT setSpec FROM oaiREPOSITORY" res = run_sql(query) return [row[0] for row in res] def get_recids_for_set_spec(set_spec): """ - Returns the list (as HitSet) of recids belonging to 'set' + Returns the list (as intbitset) of recids belonging to 'set' Parameters: set_spec - *str* the set_spec for which we would like to get the recids """ - recids = HitSet() + recids = intbitset() for set_def in get_set_definitions(set_spec): new_recids = perform_request_search(c=[coll.strip() \ for coll in set_def['c'].split(',')], p1=set_def['p1'], f1=set_def['f1'], m1=set_def['m1'], op1=set_def['op1'], p2=set_def['p2'], f2=set_def['f2'], m2=set_def['m2'], op2=set_def['op2'], p3=set_def['p3'], f3=set_def['f3'], m3=set_def['m3'], ap=0) - recids = recids.union(HitSet(new_recids)) + recids |= intbitset(new_recids) return recids def get_set_name_for_set_spec(set_spec): """ Returns the OAI setName of a setSpec. Note that the OAI Repository admin lets the user add several set definition with the same setSpec, and possibly with different setNames... -> Returns the first (non empty) one found. Parameters: set_spec - *str* the set_spec for which we would like to get the setName """ query = "select setName from oaiREPOSITORY where setSpec=%s and setName!=''" res = run_sql(query, (set_spec, )) if len(res) > 0: return res[0][0] else: return "" -def print_repository_status(write_message=write_message, +def print_repository_status(local_write_message=write_message, verbose=0): """ Prints the repository status to the standard output. Parameters: write_message - *function* the function used to write the output verbose - *int* the verbosity of the output - 0: print repository size - 1: print quick status of each set (numbers can be wrong if the repository is in some inconsistent state, i.e. a record is in an OAI setSpec but has not OAI ID) - 2: print detailed status of repository, with number of records that needs to be synchronized according to the sets definitions. Precise, but ~slow... """ repository_size_s = "%d" % repository_size() - repository_recids_after_update = HitSet() + repository_recids_after_update = intbitset() - write_message(CFG_SITE_NAME) - write_message(" OAI Repository Status") + local_write_message(CFG_SITE_NAME) + local_write_message(" OAI Repository Status") set_spec_max_length = 19 # How many max char do we display for set_name_max_length = 20 # setName and setSpec? if verbose == 0: # Just print repository size - write_message(" Total(**)" + " " * 29 + + local_write_message(" Total(**)" + " " * 29 + " " * (9 - len(repository_size_s)) + repository_size_s) return elif verbose == 1: # We display few information: show longer set name and spec set_spec_max_length = 30 set_name_max_length = 30 - write_message("=" * 80) + local_write_message("=" * 80) header = " setSpec" + " " * (set_spec_max_length - 7) + \ " setName" + " " * (set_name_max_length - 5) + " Volume" if verbose > 1: header += " " * 5 + "After update(*):" - write_message(header) + local_write_message(header) if verbose > 1: - write_message(" " * 57 + "Additions Deletions") + local_write_message(" " * 57 + "Additions Deletions") - write_message("-" * 80) + local_write_message("-" * 80) for set_spec in all_set_specs(): if verbose <= 1: # Get the records that are in this set. This is an # incomplete check, as it can happen that some records are # in this set (according to the metadata) but have no OAI # ID (so they are not exported). 
This can happen if the # repository has some records coming from external # sources, or if it has never been synchronized with this # tool. - current_recids = perform_request_search(c=CFG_SITE_NAME, - p1=set_spec, - f1=CFG_OAI_SET_FIELD, - m1="e", ap=0) + current_recids = get_recids_for_set_spec(set_spec) nb_current_recids = len(current_recids) else: # Get the records that are *currently* exported for this # setSpec - current_recids = perform_request_search(c=CFG_SITE_NAME, - p1=set_spec, - f1=CFG_OAI_SET_FIELD, - m1="e", ap=0, op1="a", - p2="oai:*", - f2=CFG_OAI_ID_FIELD, - m2="e") + current_recids = search_unit_in_bibxxx(p=set_spec, f=CFG_OAI_SET_FIELD, type='e') nb_current_recids = len(current_recids) # Get the records that *should* be in this set according to # the admin defined settings, and compute how many should be # added or removed should_recids = get_recids_for_set_spec(set_spec) - repository_recids_after_update = repository_recids_after_update.union(should_recids) + repository_recids_after_update |= should_recids - nb_add_recids = len(HitSet(should_recids).difference(HitSet(current_recids))) - nb_remove_recids = len(HitSet(current_recids).difference(HitSet(should_recids))) + nb_add_recids = len(should_recids - current_recids) + nb_remove_recids = len(current_recids - should_recids) nb_should_recids = len(should_recids) - nb_recids_after_update = len(repository_recids_after_update) # Adapt setName and setSpec strings lengths set_spec_str = set_spec if len(set_spec_str) > set_spec_max_length : set_spec_str = "%s.." % set_spec_str[:set_spec_max_length] set_name_str = get_set_name_for_set_spec(set_spec) if len(set_name_str) > set_name_max_length : set_name_str = "%s.." % set_name_str[:set_name_max_length] row = " " + set_spec_str + \ " " * ((set_spec_max_length + 2) - len(set_spec_str)) + set_name_str + \ " " * ((set_name_max_length + 2) - len(set_name_str)) + \ " " * (7 - len(str(nb_current_recids))) + str(nb_current_recids) if verbose > 1: row += \ " " * max(9 - len(str(nb_add_recids)), 0) + '+' + str(nb_add_recids) + \ " " * max(7 - len(str(nb_remove_recids)), 0) + '-' + str(nb_remove_recids) + " = " +\ " " * max(7 - len(str(nb_should_recids)), 0) + str(nb_should_recids) - write_message(row) + local_write_message(row) - write_message("=" * 80) + local_write_message("=" * 80) footer = " Total(**)" + " " * (set_spec_max_length + set_name_max_length - 7) + \ " " * (9 - len(repository_size_s)) + repository_size_s if verbose > 1: - footer += ' ' * (28 - len(str(nb_recids_after_update))) + str(nb_recids_after_update) - write_message(footer) + footer += ' ' * (28 - len(str(len(repository_recids_after_update)))) + str(len(repository_recids_after_update)) + local_write_message(footer) if verbose > 1: - write_message(' *The "after update" columns show the repository after you run this tool.') + local_write_message(' *The "after update" columns show the repository after you run this tool.') else: - write_message(' *"Volume" is indicative if repository is out of sync. Use --detailed-report.') - write_message('**The "total" is not the sum of the above numbers, but the union of the records.') + local_write_message(' *"Volume" is indicative if repository is out of sync. 
Use --detailed-report.') + local_write_message('**The "total" is not the sum of the above numbers, but the union of the records.') def repository_size(): - "Read repository size" - return len(perform_request_search(p1="oai:*", - f1=CFG_OAI_ID_FIELD, - m1="e", - ap=0)) + """Read repository size""" + return len(search_unit_in_bibxxx(p="*", f=CFG_OAI_SET_FIELD, type="e")) ### MAIN ### - def oairepositoryupdater_task(): """Main business logic code of oai_archive""" no_upload = task_get_option("no_upload") report = task_get_option("report") if report > 1: print_repository_status(verbose=report) return True + initial_snapshot = {} + for set_spec in all_set_specs(): + initial_snapshot[set_spec] = get_set_definitions(set_spec) + write_message("Initial set snapshot: %s" % pformat(initial_snapshot), verbose=2) + task_update_progress("Fetching records to process") - # Build the list of records to be processed, that is, search for - # the records that match one of the search queries defined in OAI - # Repository admin interface. - recids_for_set = {} # Remember exactly which record belongs to which set - recids = HitSet() # "Flat" set of the recids_for_set values + recids_with_oaiid = search_unit_in_bibxxx(p='*', f=CFG_OAI_ID_FIELD, type='e') + write_message("%s recids have an OAI ID" % len(recids_with_oaiid), verbose=2) + + all_current_recids = search_unit_in_bibxxx(p='*', f=CFG_OAI_SET_FIELD, type='e') + no_more_exported_recids = intbitset(all_current_recids) + write_message("%s recids are currently exported" % (len(all_current_recids)), verbose=2) + + all_affected_recids = intbitset() + all_should_recids = intbitset() + recids_for_set = {} for set_spec in all_set_specs(): - task_sleep_now_if_required(can_stop_too=True) - _recids = get_recids_for_set_spec(set_spec) - recids_for_set[set_spec] = _recids - recids = recids.union(_recids) - - # Also get the list of records that are currently exported through - # OAI and that might need to be refreshed - oai_recids = perform_request_search(c=CFG_SITE_NAME, - p1='oai:%s:*' % CFG_OAI_ID_PREFIX, - f1=CFG_OAI_ID_FIELD, - m1="e", ap=0) - recids = recids.union(HitSet(oai_recids)) + if not set_spec: + set_spec = CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC + should_recids = get_recids_for_set_spec(set_spec) + recids_for_set[set_spec] = should_recids + no_more_exported_recids -= should_recids + all_should_recids |= should_recids + current_recids = search_unit_in_bibxxx(p=set_spec, f=CFG_OAI_SET_FIELD, type='e') + write_message("%s recids should be in %s. 
Currently %s are in %s" % (len(should_recids), set_spec, len(current_recids), set_spec), verbose=2)
+        to_add = should_recids - current_recids
+        write_message("%s recids should be added to %s" % (len(to_add), set_spec), verbose=2)
+        to_remove = current_recids - should_recids
+        write_message("%s recids should be removed from %s" % (len(to_remove), set_spec), verbose=2)
+        affected_recids = to_add | to_remove
+        write_message("%s recids should hence be updated for %s" % (len(affected_recids), set_spec), verbose=2)
+        all_affected_recids |= affected_recids
+
+    missing_oaiid = all_should_recids - recids_with_oaiid
+    write_message("%s recids are missing an oaiid" % len(missing_oaiid))
+    write_message("%s recids should no longer be exported" % len(no_more_exported_recids))
+
+    ## Let's add records with missing OAI ID
+    all_affected_recids |= missing_oaiid | no_more_exported_recids
+    write_message("%s recids should be updated" % (len(all_affected_recids)), verbose=2)
+
+    if not all_affected_recids:
+        write_message("Nothing to do!")
+        return True
 
     # Prepare to save results in a tmp file
     (fd, filename) = mkstemp(dir=CFG_TMPDIR,
                              prefix='oairepository_' + \
                              time.strftime("%Y%m%d_%H%M%S_",
                                            time.localtime()))
     oai_out = os.fdopen(fd, "w")
-    oai_out.write('<collection xmlns="http://www.loc.gov/MARC21/slim">')
-    has_updated_records = False
+    oai_out.write("<collection xmlns=\"http://www.loc.gov/MARC21/slim\">")
+
+    tot = 0
 
     # Iterate over the recids
-    i = 0
-    for recid in recids:
-        i += 1
+    for i, recid in enumerate(all_affected_recids):
         task_sleep_now_if_required(can_stop_too=True)
         task_update_progress("Done %s out of %s records." % \
-                             (i, len(recids)))
+                             (i, len(all_affected_recids)))
+
+        write_message("Elaborating recid %s" % recid, verbose=3)
+        record = get_record(recid)
+        if not record:
+            write_message("Record %s seems empty. Let's skip it." % recid, verbose=3)
+            continue
+        new_record = {}
 
         # Check if an OAI identifier is already in the record or
         # not.
- oai_id_entry = "oai:%s:%s\n" % \ - (CFG_OAI_ID_FIELD[5:6], CFG_OAI_ID_PREFIX, recid) - already_has_oai_id = True - oai_ids = [_oai_id for _oai_id in \ - get_fieldvalues(recid, CFG_OAI_ID_FIELD) \ - if _oai_id.strip() != ''] - if len(oai_ids) == 0: - already_has_oai_id = False + assign_oai_id_entry = False + oai_id_entry = record_get_field_value(record, tag=CFG_OAI_ID_FIELD[:3], ind1=CFG_OAI_ID_FIELD[3], ind2=CFG_OAI_ID_FIELD[4], code=CFG_OAI_ID_FIELD[5]) + if not oai_id_entry: + assign_oai_id_entry = True + oai_id_entry = "oai:%s:%s" % (CFG_OAI_ID_PREFIX, recid) + write_message("Setting new oai_id %s for record %s" % (oai_id_entry, recid), verbose=3) + else: + write_message("Already existing oai_id %s for record %s" % (oai_id_entry, recid), verbose=3) # Get the sets to which this record already belongs according # to the metadata - current_oai_sets = set(\ - [_oai_set for _oai_set in \ - get_fieldvalues(recid, CFG_OAI_SET_FIELD) \ - if _oai_set.strip() != '']) + current_oai_sets = set(record_get_field_values(record, tag=CFG_OAI_SET_FIELD[:3], ind1=CFG_OAI_SET_FIELD[3], ind2=CFG_OAI_SET_FIELD[4], code=CFG_OAI_SET_FIELD[5])) + write_message("Record %s currently belongs to these oai_sets: %s" % (recid, ", ".join(current_oai_sets)), verbose=3) + + current_previous_oai_sets = set(record_get_field_values(record, tag=CFG_OAI_PREVIOUS_SET_FIELD[:3], ind1=CFG_OAI_PREVIOUS_SET_FIELD[3], ind2=CFG_OAI_PREVIOUS_SET_FIELD[4], code=CFG_OAI_PREVIOUS_SET_FIELD[5])) + write_message("Record %s currently doesn't belong anymore to these oai_sets: %s" % (recid, ", ".join(current_previous_oai_sets)), verbose=3) # Get the sets that should be in this record according to # settings - updated_oai_sets = set(\ - [_set for _set, _recids in recids_for_set.iteritems() - if recid in _recids if _set]) + updated_oai_sets = set(_set for _set, _recids in recids_for_set.iteritems() + if recid in _recids) + write_message("Record %s now belongs to these oai_sets: %s" % (recid, ", ".join(updated_oai_sets)), verbose=3) + + updated_previous_oai_sets = set(_set for _set in (current_previous_oai_sets - updated_oai_sets) | + (current_oai_sets - updated_oai_sets)) + write_message("Record %s now doesn't belong anymore to these oai_sets: %s" % (recid, ", ".join(updated_previous_oai_sets)), verbose=3) # Ok, we have the old sets and the new sets. If they are equal # and oai ID does not need to be added, then great, nothing to # change . Otherwise apply the new sets. - if current_oai_sets == updated_oai_sets and already_has_oai_id: + if current_oai_sets == updated_oai_sets and not assign_oai_id_entry: + write_message("Nothing has changed for record %s, let's move on!" 
% recid, verbose=3) continue # Jump to next recid - has_updated_records = True - - # Generate the xml sets entry - oai_set_entry = '\n'.join(["%s" % \ - (CFG_OAI_SET_FIELD[5:6], _oai_set) \ - for _oai_set in updated_oai_sets if \ - _oai_set]) + \ - "\n" - - # Also get all the datafields with tag and indicator matching - # CFG_OAI_SET_FIELD[:5] and CFG_OAI_ID_FIELD[:5] but with - # subcode != CFG_OAI_SET_FIELD[5:6] and subcode != - # CFG_OAI_SET_FIELD[5:6], so that we can preserve these values - other_data = marcxml_filter_out_tags(recid, [CFG_OAI_SET_FIELD, - CFG_OAI_ID_FIELD]) - - if CFG_OAI_ID_FIELD[0:5] == CFG_OAI_SET_FIELD[0:5]: - # Put set and OAI ID in the same datafield - oai_out.write("\n") - oai_out.write("%s" - "\n" % recid) - oai_out.write(DATAFIELD_ID_HEAD) - oai_out.write("\n") - #if oai_id_entry: - oai_out.write(oai_id_entry) - #if oai_set_entry: - oai_out.write(oai_set_entry) - oai_out.write("\n") - oai_out.write(other_data) - oai_out.write("\n") - else: - oai_out.write("\n") - oai_out.write("%s" - "\n" % recid) - oai_out.write(DATAFIELD_ID_HEAD) - oai_out.write("\n") - oai_out.write(oai_id_entry) - oai_out.write("\n") - oai_out.write(DATAFIELD_SET_HEAD) - oai_out.write("\n") - oai_out.write(oai_set_entry) - oai_out.write("\n") - oai_out.write(other_data) - oai_out.write("\n") - - oai_out.write('') + write_message("Something has changed for record %s, let's update it!" % recid, verbose=3) + subfields = [(CFG_OAI_ID_FIELD[5], oai_id_entry)] + for oai_set in updated_oai_sets: + subfields.append((CFG_OAI_SET_FIELD[5], oai_set)) + for oai_set in updated_previous_oai_sets: + subfields.append((CFG_OAI_PREVIOUS_SET_FIELD[5], oai_set)) + + record_add_field(new_record, tag="001", controlfield_value=str(recid)) + record_add_field(new_record, tag=CFG_OAI_ID_FIELD[:3], ind1=CFG_OAI_ID_FIELD[3], ind2=CFG_OAI_ID_FIELD[4], subfields=subfields) + oai_out.write(record_xml_output(new_record)) + tot += 1 + if tot == CFG_OAI_REPOSITORY_MARCXML_SIZE: + oai_out.write("") + oai_out.close() + write_message("Wrote to file %s" % filename) + if not no_upload: + task_low_level_submission('bibupload', 'oairepository', '-c', filename, '-n') + # Prepare to save results in a tmp file + (fd, filename) = mkstemp(dir=CFG_TMPDIR, + prefix='oairepository_' + \ + time.strftime("%Y%m%d_%H%M%S_", + time.localtime())) + oai_out = os.fdopen(fd, "w") + oai_out.write("") + tot = 0 + task_sleep_now_if_required(can_stop_too=True) + + oai_out.write("") oai_out.close() write_message("Wrote to file %s" % filename) if not no_upload: task_sleep_now_if_required(can_stop_too=True) - if has_updated_records: - command = "%s/bibupload -c %s -u oairepository" % (CFG_BINDIR, filename) - os.system(command) + if tot > 0: + task_low_level_submission('bibupload', 'oairepository', '-c', filename, '-n') else: os.remove(filename) return True -def marcxml_filter_out_tags(recid, fields): - """ - Returns the fields of record 'recid' that share the same tag and - indicators as those specified in 'fields', but for which the - subfield is different. This is nice to emulate a bibupload -c that - corrects only specific subfields. - - Parameters: - recid - *int* the id of the record to process - - fields - *list(str)* the list of fields that we want to filter - out. 
Eg ['909COp', '909COo'] """ - out = '' - - record = get_record(recid) - - # Delete subfields that we want to replace - for field in fields: - record_delete_subfield(record, - tag=field[0:3], - ind1=field[3:4], - ind2=field[4:5], - subfield_code=field[5:6]) - - # Select only datafields that share tag + indicators - processed_tags_and_ind = [] - for field in fields: - if not field[0:5] in processed_tags_and_ind: - # Ensure that we do not process twice the same datafields - processed_tags_and_ind.append(field[0:5]) - for datafield in record.get(field[0:3], []): - if datafield[1] == field[3:4].replace('_', ' ') and \ - datafield[2] == field[4:5].replace('_', ' ') and \ - datafield[0]: - out += field_xml_output(datafield, field[0:3]) + '\n' - - return out - ######################### def main(): """Main that constructs all the bibtask.""" # if there is any -r or --report option (or other similar options) # in the arguments, just print the status and exit (do not run # through BibSched...) + if (CFG_OAI_ID_FIELD[:5] != CFG_OAI_SET_FIELD[:5]) or \ + (CFG_OAI_ID_FIELD[:5] != CFG_OAI_PREVIOUS_SET_FIELD[:5]): + print >> sys.stderr, """\ +ERROR: since Invenio 1.0 the OAI ID and the OAI Set must be stored in the same +field. Please revise your configuration for the variables + CFG_OAI_ID_FIELD (currently set to %s) + CFG_OAI_SET_FIELD (currently set to %s) + CFG_OAI_PREVIOUS_SET_FIELD (currently set to %s)""" % ( + CFG_OAI_ID_FIELD, + CFG_OAI_SET_FIELD, + CFG_OAI_PREVIOUS_SET_FIELD + ) + sys.exit(1) mode = -1 if '-d' in sys.argv[1:] or '--detailed-report' in sys.argv[1:]: mode = 2 elif '-r' in sys.argv[1:] or '--report' in sys.argv[1:]: mode = 1 if mode != -1: - def write_message(*args): + def local_write_message(*args): """Overload BibTask function so that it does not need to run in BibSched environment""" sys.stdout.write(args[0] + '\n') - print_repository_status(write_message=write_message, - verbose=mode) + print_repository_status(local_write_message=local_write_message, verbose=mode) return task_init(authorization_action='runoairepository', authorization_msg="OAI Archive Task Submission", description="Examples:\n" " Expose records according to sets defined in OAI Repository admin interface\n" " $ oairepositoryupdater \n" " Expose records according to sets defined in OAI Repository admin interface and update them every day\n" " $ oairepositoryupdater -s24\n" " Print OAI repository status\n" " $ oairepositoryupdater -r\n" " Print OAI repository detailed status\n" " $ oairepositoryupdater -d\n\n", help_specific_usage="Options:\n" " -r --report\t\tOAI repository status\n" " -d --detailed-report\t\tOAI repository detailed status\n" " -n --no-process\tDo not upload the modifications\n", - version=__revision__, specific_params=("rdn", [ "report", "detailed-report", "no-process"]), task_submit_elaborate_specific_parameter_fnc= task_submit_elaborate_specific_parameter, task_run_fnc=oairepositoryupdater_task) -def task_submit_elaborate_specific_parameter(key, value, opts, args): +def task_submit_elaborate_specific_parameter(key, _value, _opts, _args): """Elaborate specific CLI parameters of oairepositoryupdater""" if key in ("-r", "--report"): task_set_option("report", 1) elif key in ("-d", "--detailed-report"): task_set_option("report", 2) elif key in ("-n", "--no-process"): task_set_option("no_upload", 1) else: return False return True ### okay, here we go: if __name__ == '__main__': main() diff --git a/modules/bibharvest/lib/oai_repository_webinterface.py
b/modules/bibharvest/lib/oai_repository_webinterface.py index 936eea99a..f58b578d2 100644 --- a/modules/bibharvest/lib/oai_repository_webinterface.py +++ b/modules/bibharvest/lib/oai_repository_webinterface.py @@ -1,154 +1,148 @@ ## This file is part of Invenio. ## Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """Invenio OAI provider interface, compliant with OAI-PMH/2.0""" __revision__ = "$Id$" import os -import urllib import time -from invenio import webinterface_handler_config as apache +import cStringIO +from invenio import webinterface_handler_config as apache from invenio import oai_repository_server -from invenio.config import CFG_CACHEDIR, CFG_OAI_SLEEP +from invenio.errorlib import register_exception +from invenio.config import CFG_CACHEDIR, CFG_OAI_SLEEP, CFG_DEVEL_SITE, \ + CFG_ETCDIR from invenio.webinterface_handler import wash_urlargd, WebInterfaceDirectory +CFG_VALIDATE_RESPONSES = False +OAI_PMH_VALIDATOR = None + +if CFG_DEVEL_SITE: + try: + from lxml import etree + OAI_PMH_VALIDATOR = etree.XMLSchema(etree.parse(open(os.path.join(CFG_ETCDIR, 'bibharvest', 'OAI-PMH.xsd')))) + CFG_VALIDATE_RESPONSES = True + except ImportError: + pass + class WebInterfaceOAIProviderPages(WebInterfaceDirectory): """Defines the set of /oai2d OAI provider pages.""" _exports = [''] def __call__(self, req, form): - "OAI repository interface" + """OAI repository interface""" # Clean input arguments. The protocol specifies that an error # has to be returned if the same argument is specified several # times. Eg: # oai2d?verb=ListIdentifiers&metadataPrefix=marcxml&metadataPrefix=marcxml # So keep the arguments as list for now so that check_argd can # return an error if needed (check_argd also transforms these # lists into strings) argd = wash_urlargd(form, {'verb': (list, []), 'metadataPrefix': (list, []), 'from': (list, []), 'until': (list, []), 'set': (list, []), 'identifier': (list, []), 'resumptionToken': (list, []), }) + if CFG_VALIDATE_RESPONSES: + req.track_writings = True + ## wash_urlargd(..) function cleaned everything, but also added ## unwanted parameters. Remove them now for param in argd.keys(): if not param in form and param != 'verb': del argd[param] ## wash_urlargd(..) function also removed unknown parameters ## that we would like to keep in order to send back an error ## as required by the protocol. But we do not need that value, ## so set it to empty string. for param in form.keys(): if param not in argd.keys(): argd[param] = '' ## But still remove 'ln' parameter that was automatically added. 
if argd.has_key('ln'): del argd['ln'] ## check request for OAI compliancy ## also transform all the list arguments into string - oai_error = oai_repository_server.check_argd(argd) + oai_errors = oai_repository_server.check_argd(argd) ## check availability (OAI requests for Identify, ListSets and ## ListMetadataFormats are served immediately, otherwise we ## shall wait for CFG_OAI_SLEEP seconds between requests): - if os.path.exists("%s/RTdata/RTdata" % CFG_CACHEDIR) and (argd['verb'] not in ["Identify", "ListMetadataFormats", "ListSets"]): + if os.path.exists("%s/RTdata/RTdata" % CFG_CACHEDIR) and (argd['verb'] not in ["Identify", "ListMetadataFormats", "ListSets"] and not argd.get('resumptionToken')): time_gap = int(time.time() - os.path.getmtime("%s/RTdata/RTdata" % CFG_CACHEDIR)) if(time_gap < CFG_OAI_SLEEP): req.headers_out["Status-Code"] = "503" req.headers_out["Retry-After"] = "%d" % (CFG_OAI_SLEEP - time_gap) req.status = apache.HTTP_SERVICE_UNAVAILABLE return "Retry after %d seconds" % (CFG_OAI_SLEEP - time_gap) command = "touch %s/RTdata/RTdata" % CFG_CACHEDIR os.system(command) - ## construct args (argd string equivalent) for the - ## oai_repository_server business logic (later it may be good if it - ## takes argd directly): - args = urllib.urlencode(argd) ## create OAI response - req.content_type = "text/xml" req.send_http_header() - if oai_error == "": - + if not oai_errors: ## OAI Identify - if argd['verb'] == "Identify": - req.write(oai_repository_server.oaiidentify(args, script_url=req.uri)) - + req.write(oai_repository_server.oai_identify(argd)) ## OAI ListSets - elif argd['verb'] == "ListSets": - req.write(oai_repository_server.oailistsets(args)) - - - ## OAI ListIdentifiers - - elif argd['verb'] == "ListIdentifiers": - req.write(oai_repository_server.oailistidentifiers(args)) - - - ## OAI ListRecords - - elif argd['verb'] == "ListRecords": - req.write(oai_repository_server.oailistrecords(args)) + req.write(oai_repository_server.oai_list_sets(argd)) + ## OAI ListIdentifiers or OAI ListRecords + elif argd['verb'] in ("ListIdentifiers", "ListRecords"): + oai_repository_server.oai_list_records_or_identifiers(req, argd) ## OAI GetRecord - elif argd['verb'] == "GetRecord": - req.write(oai_repository_server.oaigetrecord(args)) - + req.write(oai_repository_server.oai_get_record(argd)) ## OAI ListMetadataFormats - elif argd['verb'] == "ListMetadataFormats": - req.write(oai_repository_server.oailistmetadataformats(args)) - + req.write(oai_repository_server.oai_list_metadata_formats(argd)) ## Unknown verb - else: - req.write(oai_repository_server.oai_error("badVerb","Illegal OAI verb")) - - ## OAI error - else: - req.write(oai_repository_server.oai_header(args,"")) - req.write(oai_error) - req.write(oai_repository_server.oai_footer("")) - + req.write(oai_repository_server.oai_error(argd, oai_errors)) + + if CFG_VALIDATE_RESPONSES: + req.track_writings = False + try: + OAI_PMH_VALIDATOR.assertValid(etree.parse(cStringIO.StringIO(req.what_was_written))) + except etree.DocumentInvalid: + register_exception(req=req, alert_admin=True) + raise return "\n" ## Return the same page wether we ask for /oai2d?verb or /oai2d/?verb index = __call__ diff --git a/modules/bibrank/doc/hacking/bibrank-api.webdoc b/modules/bibrank/doc/hacking/bibrank-api.webdoc index 3fa9a488d..d0757e88b 100644 --- a/modules/bibrank/doc/hacking/bibrank-api.webdoc +++ b/modules/bibrank/doc/hacking/bibrank-api.webdoc @@ -1,137 +1,137 @@ ## -*- mode: html; coding: utf-8; -*- ## This file is part of Invenio. 
## Copyright (C) 2007, 2008, 2010, 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
     Invenio BibRank Record Sorter can be called from within your Python
     programs via a high-level API and a mid-level API.
     
     1. High-level API
     
        Description:
     
           The high-level access to the BibRank Record Sorter is provided
           by exactly the same function as called from the web interface
           when users submit their queries, if a rank method has been
           selected. This should guarantee exactly the same behaviour if
           the same parameters are given.
     
           There are three things to note: (i) When a search is done, the
    -      search engine is sending a HitSet containing all the records that
    -      matches the query. Since only records in the HitSet are ranked, a
    -      HitSet must be created containing wished records to rank and be sent
     +      search engine is sending an intbitset containing all the records that
     +      match the query. Since only records in the intbitset are ranked, an
     +      intbitset must be created containing the records to rank and be sent
           as a parameter to the function. (ii) Some rank methods may choose to
    -      ignore the HitSet, like the "Similar Records" function. (iii) In case
    +      ignore the intbitset, like the "Similar Records" function. (iii) In case
           of an error ranking the records, the returned data is different.
     
        Signature:
     
           def rank_records(rank_method_code, rank_limit_relevance,
                            hitset_global, pattern, verbose=0):
            """
            rank_method_code - 'jif', 'wrd' or other methods
            rank_limit_relevance - a number defining the threshold above which
            records are ranked; may be ignored by the rank method.
            hitset_global - search engine hits; if all records should be
    -       ranked, fill the HitSet with ones.
    +       ranked, fill the intbitset with ones.
            pattern - search engine query or record ID; must be a
            list, e.g. ['CERN', 'fermilab'] or ['recid:12345']
            verbose - verbose level (0-9) defining how much debug information
            should be shown
     
            output if successful:
            list of records - [123, 321, 12451, 123, 12, 4]
            list of rank values - ascending, same length as the list of
            records [0, 10, 20, 30, 40, 100]
            prefix - text to show before the rank value, '<--' hides rank
            value, defined in config file.
            postfix - text to show after the rank value, '-!>' hides rank
            value, defined in config file.
            verbose_output - the debug text depending on the verbose level.
     
            output if error:
            list of records - is None
            list of rank values - is None
            prefix - Contains headline of error
            postfix - Error message or error box if exception.
            verbose_output - the debug text depending on the verbose level.
            """
     
     
        Examples:
     
           >>> # import the function:
           >>> from invenio.bibrank_record_sorter import rank_records
           >>> # rank all records with the words 'higgs boson' according to the method "wrd"
           >>> rank_records('wrd', 0, a_hitset, ['higgs', 'boson'], 0)
           >>> # find similar records to the record 12345, hitset is here ignored because of 'similar records'
           >>> rank_records('wrd', 0, a_hitset, ['recid:12345'], 0)
           >>> # rank all records based on jif value
           >>> rank_records('jif', 0, a_hitset, [], 0)
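
           Note: the examples above assume that a_hitset has already been
           built. A minimal sketch of building one (record IDs 1, 2 and 5
           are chosen here purely for illustration):

           >>> from invenio.intbitset import intbitset
           >>> a_hitset = intbitset()
           >>> a_hitset += (1, 2, 5)
           >>> # or, to rank all records, fill the intbitset with ones:
           >>> a_hitset = intbitset(trailing_bits=1)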
     
     2. Mid-level API
     
        Description:
     
           Using the mid-level API, you can directly call the various ranking
           methods. The functions will not return data in a way the search
           engine understands. Neither will they check that the correct
           function is being called, though they do return an error if a wrong
           code/function is used.
     
        Signatures:
           def combine_method(rank_method_code, pattern, hitset, rank_limit_relevance,verbose):
           -This method calls each method mentioned in the config file and adds the results together
     
           def find_similar(rank_method_code, recID, hitset, rank_limit_relevance,verbose):
           -This method finds similar records based on the one given in the recID field. The recID field
            must be an integer value. hitset is ignored. rank_method_code has to be 'wrd'.
     
           def word_frequency(rank_method_code, lwords, hitset, rank_limit_relevance,verbose):
           -This method ranks records based on the list of words in the lwords field. rank_method_code has to be
            'wrd'. Only records in hitset are ranked.
     
           def rank_by_method(rank_method_code, lwords, hitset, rank_limit_relevance,verbose):
           -All other rank methods use this function together with data from the rnkMETHODDATA table
            (a dictionary of {recid: (text, value)}) to rank the data. Only records in the hitset are ranked.
     
           These mid-level API functions demand that the function create_rnkmethod_cache() has been called
           first, since it loads the needed config options (see the sketch after the examples below).
           All the rank methods return the same data:
           ([[recid, value],[recid, value]], prefix, postfix, verbose_output)
     
        Examples:
     
           >>> # import the function:
           >>> from invenio.bibrank_record_sorter import find_similar
           >>> # find records similar to 12345, hitset must be full
           >>> find_similar('wrd', 12345, hitset, 0, 0)
           >>> # rank records according to a method called jif, using the single_tag...based method.
           >>> # the list of words is here ignored, only the records in the hitset are used.
           >>> rank_by_method('jif',['higgs'], hitset, 0, 0)
           >>> # rank records containing ['higgs', 'boson'] using word similarity ('wrd')
           >>> word_similarity('wrd',['higgs', 'boson'], hitset, 0, 0)
           >>> # rank records using various methods, which methods to use is read from the config file.
           >>> combine_method('cmb', ['higgs','boson'], hitset, 0, 0)
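
           Putting it together, a minimal end-to-end sketch for the mid-level
           API. It assumes a configured 'wrd' method, an existing record 12345,
           and that create_rnkmethod_cache() is importable from the same module:

           >>> from invenio.bibrank_record_sorter import create_rnkmethod_cache, find_similar
           >>> from invenio.intbitset import intbitset
           >>> # load the config options needed by the rank methods:
           >>> create_rnkmethod_cache()
           >>> # find_similar wants a full hitset:
           >>> hitset = intbitset(trailing_bits=1)
           >>> results, prefix, postfix, verbose_output = find_similar('wrd', 12345, hitset, 0, 0)
           >>> # results is a list of [recid, value] pairs, ready for display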
     
    diff --git a/modules/bibrank/lib/bibrank_record_sorter_tests.py b/modules/bibrank/lib/bibrank_record_sorter_tests.py index 95e9c694c..e7fc70ec0 100644 --- a/modules/bibrank/lib/bibrank_record_sorter_tests.py +++ b/modules/bibrank/lib/bibrank_record_sorter_tests.py @@ -1,53 +1,53 @@ # -*- coding: utf-8 -*- ## ## This file is part of Invenio. ## Copyright (C) 2004, 2005, 2006, 2007, 2008, 2010, 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """Unit tests for the ranking engine.""" __revision__ = "$Id$" import unittest from invenio import bibrank_record_sorter -from invenio.search_engine import HitSet +from invenio.intbitset import intbitset from invenio.testutils import make_test_suite, run_test_suite class TestListSetOperations(unittest.TestCase): """Test list set operations.""" def test_record_sorter(self): """bibrank record sorter - sorting records""" - hitset = HitSet() + hitset = intbitset() hitset += (1,2,5) - hitset2 = HitSet() + hitset2 = intbitset() hitset2.add(5) rec_termcount = {1: 1, 2: 1, 5: 1} (res1, res2) = bibrank_record_sorter.sort_record_relevance({1: 50, 2:30, 3:70,4:10},rec_termcount,hitset, 50,0) self.assertEqual(([(1, 71), (3, 100)], list(hitset2)), (res1, list(res2))) def test_calculate_record_relevance(self): """bibrank record sorter - calculating relevances""" - hitset = HitSet() + hitset = intbitset() hitset += (1,2,5) self.assertEqual(({1: 7, 2: 7, 5: 5}, {1: 1, 2: 1, 5: 1}), bibrank_record_sorter.calculate_record_relevance(("testterm", 2.0), {"Gi":(0, 50.0), 1: (3, 4.0), 2: (4, 5.0), 5: (1, 3.5)}, hitset, {}, {}, 0, None)) TEST_SUITE = make_test_suite(TestListSetOperations,) if __name__ == "__main__": run_test_suite(TEST_SUITE) diff --git a/modules/bibrank/lib/bibrank_tag_based_indexer.py b/modules/bibrank/lib/bibrank_tag_based_indexer.py index 63d2cc12e..e81c22c63 100644 --- a/modules/bibrank/lib/bibrank_tag_based_indexer.py +++ b/modules/bibrank/lib/bibrank_tag_based_indexer.py @@ -1,546 +1,546 @@ # -*- coding: utf-8 -*- ## Ranking of records using different parameters and methods. ## This file is part of Invenio. ## Copyright (C) 2004, 2005, 2006, 2007, 2008, 2010, 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
import os import sys import time import ConfigParser from invenio.config import \ CFG_SITE_LANG, \ CFG_ETCDIR, \ CFG_PREFIX -from invenio.search_engine import perform_request_search, HitSet +from invenio.search_engine import perform_request_search from invenio.bibrank_citation_indexer import get_citation_weight, print_missing, get_cit_dict, insert_into_cit_db from invenio.bibrank_downloads_indexer import * from invenio.dbquery import run_sql, serialize_via_marshal, deserialize_via_marshal, \ wash_table_column_name, get_table_update_time from invenio.errorlib import register_exception from invenio.bibtask import task_get_option, write_message, task_sleep_now_if_required from invenio.bibindex_engine import create_range_list from invenio.intbitset import intbitset options = {} def remove_auto_cites(dic): """Remove auto-cites and dedupe.""" for key in dic.keys(): new_list = dic.fromkeys(dic[key]).keys() try: new_list.remove(key) except ValueError: pass dic[key] = new_list return dic def citation_repair_exec(): """Repair citation ranking method""" ## repair citations for rowname in ["citationdict","reversedict"]: ## get dic dic = get_cit_dict(rowname) ## repair write_message("Repairing %s" % rowname) dic = remove_auto_cites(dic) ## store healthy citation dic insert_into_cit_db(dic, rowname) return def download_weight_filtering_user_repair_exec (): """Repair download weight filtering user ranking method""" write_message("Repairing for this ranking method is not defined. Skipping.") return def download_weight_total_repair_exec(): """Repair download weight total ranking method""" write_message("Repairing for this ranking method is not defined. Skipping.") return def file_similarity_by_times_downloaded_repair_exec(): """Repair file similarity by times downloaded ranking method""" write_message("Repairing for this ranking method is not defined. Skipping.") return def single_tag_rank_method_repair_exec(): """Repair single tag ranking method""" write_message("Repairing for this ranking method is not defined. Skipping.") return def citation_exec(rank_method_code, name, config): """Rank method for citation analysis""" #first check if this is a specific task begin_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) if task_get_option("cmd") == "print-missing": num = task_get_option("num") print_missing(num) else: dict = get_citation_weight(rank_method_code, config) if dict: intoDB(dict, begin_date, rank_method_code) else: write_message("no need to update the indexes for citations") def download_weight_filtering_user(run): return bibrank_engine(run) def download_weight_total(run): return bibrank_engine(run) def file_similarity_by_times_downloaded(run): return bibrank_engine(run) def download_weight_filtering_user_exec (rank_method_code, name, config): """Ranking by number of downloads per User. 
Only one full-text download is taken into account for one specific user IP address""" begin_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time1 = time.time() dic = fromDB(rank_method_code) last_updated = get_lastupdated(rank_method_code) keys = new_downloads_to_index(last_updated) filter_downloads_per_hour(keys, last_updated) dic = get_download_weight_filtering_user(dic, keys) intoDB(dic, begin_date, rank_method_code) time2 = time.time() return {"time":time2-time1} def download_weight_total_exec(rank_method_code, name, config): """Ranking by total number of downloads without checking the user IP: if a user downloads the same full-text document 3 times, it is counted as 3 downloads""" begin_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time1 = time.time() dic = fromDB(rank_method_code) last_updated = get_lastupdated(rank_method_code) keys = new_downloads_to_index(last_updated) filter_downloads_per_hour(keys, last_updated) dic = get_download_weight_total(dic, keys) intoDB(dic, begin_date, rank_method_code) time2 = time.time() return {"time":time2-time1} def file_similarity_by_times_downloaded_exec(rank_method_code, name, config): """Update the dictionary {recid:[(recid, nb page similarity), ()..]}""" begin_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time1 = time.time() dic = fromDB(rank_method_code) last_updated = get_lastupdated(rank_method_code) keys = new_downloads_to_index(last_updated) filter_downloads_per_hour(keys, last_updated) dic = get_file_similarity_by_times_downloaded(dic, keys) intoDB(dic, begin_date, rank_method_code) time2 = time.time() return {"time":time2-time1} def single_tag_rank_method_exec(rank_method_code, name, config): """Creating the rank method data""" begin_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) rnkset = {} rnkset_old = fromDB(rank_method_code) rnkset_new = single_tag_rank(config) rnkset = union_dicts(rnkset_old, rnkset_new) intoDB(rnkset, begin_date, rank_method_code) def single_tag_rank(config): """Connect the given tag with the data from the kb file given""" write_message("Loading knowledgebase file", verbose=9) kb_data = {} records = [] write_message("Reading knowledgebase file: %s" % \ config.get(config.get("rank_method", "function"), "kb_src")) input = open(config.get(config.get("rank_method", "function"), "kb_src"), 'r') data = input.readlines() for line in data: if not line[0:1] == "#": kb_data[string.strip((string.split(string.strip(line), "---"))[0])] = (string.split(string.strip(line), "---"))[1] write_message("Number of lines read from knowledgebase file: %s" % len(kb_data)) tag = config.get(config.get("rank_method", "function"), "tag") tags = config.get(config.get("rank_method", "function"), "check_mandatory_tags").split(", ") if tags == ['']: tags = "" records = [] for (recids, recide) in options["recid_range"]: task_sleep_now_if_required(can_stop_too=True) write_message("......Processing records #%s-%s" % (recids, recide)) recs = run_sql("SELECT id_bibrec, value FROM bib%sx, bibrec_bib%sx WHERE tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s" % (tag[0:2], tag[0:2]), (tag, recids, recide)) - valid = HitSet(trailing_bits=1) + valid = intbitset(trailing_bits=1) valid.discard(0) for key in tags: - newset = HitSet() + newset = intbitset() newset += [recid[0] for recid in (run_sql("SELECT id_bibrec FROM bib%sx, bibrec_bib%sx WHERE id_bibxxx=id AND tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s" % (tag[0:2], tag[0:2]), (key, recids, recide)))]
valid.intersection_update(newset) if tags: recs = filter(lambda x: x[0] in valid, recs) records = records + list(recs) write_message("Number of records found with the necessary tags: %s" % len(records)) records = filter(lambda x: x[0] in options["validset"], records) rnkset = {} for key, value in records: if kb_data.has_key(value): if not rnkset.has_key(key): rnkset[key] = float(kb_data[value]) else: if kb_data.has_key(rnkset[key]) and float(kb_data[value]) > float((rnkset[key])[1]): rnkset[key] = float(kb_data[value]) else: rnkset[key] = 0 write_message("Number of records available in rank method: %s" % len(rnkset)) return rnkset def get_lastupdated(rank_method_code): """Get the last time the rank method was updated""" res = run_sql("SELECT rnkMETHOD.last_updated FROM rnkMETHOD WHERE name=%s", (rank_method_code, )) if res: return res[0][0] else: raise Exception("Is this the first run? Please do a complete update.") def intoDB(dict, date, rank_method_code): """Insert the rank method data into the database""" mid = run_sql("SELECT id from rnkMETHOD where name=%s", (rank_method_code, )) del_rank_method_codeDATA(rank_method_code) serdata = serialize_via_marshal(dict); midstr = str(mid[0][0]); run_sql("INSERT INTO rnkMETHODDATA(id_rnkMETHOD, relevance_data) VALUES (%s,%s)", (midstr, serdata,)) run_sql("UPDATE rnkMETHOD SET last_updated=%s WHERE name=%s", (date, rank_method_code)) # FIXME: the following is a workaround for the citation indexer # memory troubles, when Apache WSGI daemon processes may end up # doubling the memory after citation dictionary is updated; # therefore let us restart the WSGI daemon application after the # citation indexer finished, which relieves this problem. The # restart is done via touching invenio.wsgi file. The proper fix # for this problem would be strict separation between citation # indexer updating dicts and citation searcher loading dicts. if rank_method_code == 'citation': os.system('touch ' + os.path.join(CFG_PREFIX, 'var', 'www-wsgi', 'invenio.wsgi')) def fromDB(rank_method_code): """Get the data for a rank method""" id = run_sql("SELECT id from rnkMETHOD where name=%s", (rank_method_code, )) res = run_sql("SELECT relevance_data FROM rnkMETHODDATA WHERE id_rnkMETHOD=%s", (id[0][0], )) if res: return deserialize_via_marshal(res[0][0]) else: return {} def del_rank_method_codeDATA(rank_method_code): """Delete the data for a rank method""" id = run_sql("SELECT id from rnkMETHOD where name=%s", (rank_method_code, )) run_sql("DELETE FROM rnkMETHODDATA WHERE id_rnkMETHOD=%s", (id[0][0], )) def del_recids(rank_method_code, range_rec): """Delete some records from the rank method""" id = run_sql("SELECT id from rnkMETHOD where name=%s", (rank_method_code, )) res = run_sql("SELECT relevance_data FROM rnkMETHODDATA WHERE id_rnkMETHOD=%s", (id[0][0], )) if res: rec_dict = deserialize_via_marshal(res[0][0]) write_message("Old size: %s" % len(rec_dict)) for (recids, recide) in range_rec: for i in range(int(recids), int(recide)): if rec_dict.has_key(i): del rec_dict[i] write_message("New size: %s" % len(rec_dict)) intoDB(rec_dict, begin_date, rank_method_code) else: write_message("Create before deleting!") def union_dicts(dict1, dict2): "Returns union of the two dicts." 
union_dict = {} for (key, value) in dict1.iteritems(): union_dict[key] = value for (key, value) in dict2.iteritems(): union_dict[key] = value return union_dict def rank_method_code_statistics(rank_method_code): """Print statistics""" method = fromDB(rank_method_code) max = ('', -999999) maxcount = 0 min = ('', 999999) mincount = 0 for (recID, value) in method.iteritems(): if value < min and value > 0: min = value if value > max: max = value for (recID, value) in method.iteritems(): if value == min: mincount += 1 if value == max: maxcount += 1 write_message("Showing statistic for selected method") write_message("Method name: %s" % getName(rank_method_code)) write_message("Short name: %s" % rank_method_code) write_message("Last run: %s" % get_lastupdated(rank_method_code)) write_message("Number of records: %s" % len(method)) write_message("Lowest value: %s - Number of records: %s" % (min, mincount)) write_message("Highest value: %s - Number of records: %s" % (max, maxcount)) write_message("Divided into 10 sets:") for i in range(1, 11): setcount = 0 distinct_values = {} lower = -1.0 + ((float(max + 1) / 10)) * (i - 1) upper = -1.0 + ((float(max + 1) / 10)) * i for (recID, value) in method.iteritems(): if value >= lower and value <= upper: setcount += 1 distinct_values[value] = 1 write_message("Set %s (%s-%s) %s Distinct values: %s" % (i, lower, upper, len(distinct_values), setcount)) def check_method(rank_method_code): write_message("Checking rank method...") if len(fromDB(rank_method_code)) == 0: write_message("Rank method not yet executed, please run it to create the necessary data.") else: if len(add_recIDs_by_date(rank_method_code)) > 0: write_message("Records modified, update recommended") else: write_message("No records modified, update not necessary") def bibrank_engine(run): """Run the indexing task. Return 1 in case of success and 0 in case of failure. """ try: import psyco psyco.bind(single_tag_rank) psyco.bind(single_tag_rank_method_exec) psyco.bind(serialize_via_marshal) psyco.bind(deserialize_via_marshal) except StandardError, e: pass startCreate = time.time() try: options["run"] = [] options["run"].append(run) for rank_method_code in options["run"]: task_sleep_now_if_required(can_stop_too=True) cfg_name = getName(rank_method_code) write_message("Running rank method: %s." 
% cfg_name) file = CFG_ETCDIR + "/bibrank/" + rank_method_code + ".cfg" config = ConfigParser.ConfigParser() try: config.readfp(open(file)) except StandardError, e: write_message("Cannot find configurationfile: %s" % file, sys.stderr) raise StandardError cfg_short = rank_method_code cfg_function = config.get("rank_method", "function") + "_exec" cfg_repair_function = config.get("rank_method", "function") + "_repair_exec" cfg_name = getName(cfg_short) options["validset"] = get_valid_range(rank_method_code) if task_get_option("collection"): l_of_colls = string.split(task_get_option("collection"), ", ") recIDs = perform_request_search(c=l_of_colls) recIDs_range = [] for recID in recIDs: recIDs_range.append([recID, recID]) options["recid_range"] = recIDs_range elif task_get_option("id"): options["recid_range"] = task_get_option("id") elif task_get_option("modified"): options["recid_range"] = add_recIDs_by_date(rank_method_code, task_get_option("modified")) elif task_get_option("last_updated"): options["recid_range"] = add_recIDs_by_date(rank_method_code) else: write_message("No records specified, updating all", verbose=2) min_id = run_sql("SELECT min(id) from bibrec")[0][0] max_id = run_sql("SELECT max(id) from bibrec")[0][0] options["recid_range"] = [[min_id, max_id]] if task_get_option("quick") == "no": write_message("Recalculate parameter not used, parameter ignored.", verbose=9) if task_get_option("cmd") == "del": del_recids(cfg_short, options["recid_range"]) elif task_get_option("cmd") == "add": func_object = globals().get(cfg_function) func_object(rank_method_code, cfg_name, config) elif task_get_option("cmd") == "stat": rank_method_code_statistics(rank_method_code) elif task_get_option("cmd") == "check": check_method(rank_method_code) elif task_get_option("cmd") == "print-missing": func_object = globals().get(cfg_function) func_object(rank_method_code, cfg_name, config) elif task_get_option("cmd") == "repair": func_object = globals().get(cfg_repair_function) func_object() else: write_message("Invalid command found processing %s" % rank_method_code, sys.stderr) raise StandardError except StandardError, e: write_message("\nException caught: %s" % e, sys.stderr) register_exception() raise StandardError if task_get_option("verbose"): showtime((time.time() - startCreate)) return 1 def get_valid_range(rank_method_code): """Return a range of records""" write_message("Getting records from collections enabled for rank method.", verbose=9) res = run_sql("SELECT collection.name FROM collection, collection_rnkMETHOD, rnkMETHOD WHERE collection.id=id_collection and id_rnkMETHOD=rnkMETHOD.id and rnkMETHOD.name=%s", (rank_method_code, )) l_of_colls = [] for coll in res: l_of_colls.append(coll[0]) if len(l_of_colls) > 0: recIDs = perform_request_search(c=l_of_colls) else: recIDs = [] - valid = HitSet() + valid = intbitset() valid += recIDs return valid def add_recIDs_by_date(rank_method_code, dates=""): """Return recID range from records modified between DATES[0] and DATES[1]. If DATES is not set, then add records modified since the last run of the ranking method RANK_METHOD_CODE. 
""" if not dates: try: dates = (get_lastupdated(rank_method_code), '') except Exception: dates = ("0000-00-00 00:00:00", '') if dates[0] is None: dates = ("0000-00-00 00:00:00", '') query = """SELECT b.id FROM bibrec AS b WHERE b.modification_date >= %s""" if dates[1]: query += " and b.modification_date <= %s" query += " ORDER BY b.id ASC""" if dates[1]: res = run_sql(query, (dates[0], dates[1])) else: res = run_sql(query, (dates[0], )) alist = create_range_list([row[0] for row in res]) if not alist: write_message("No new records added since last time method was run") return alist def getName(rank_method_code, ln=CFG_SITE_LANG, type='ln'): """Returns the name of the method if it exists""" try: rnkid = run_sql("SELECT id FROM rnkMETHOD where name=%s", (rank_method_code, )) if rnkid: rnkid = str(rnkid[0][0]) res = run_sql("SELECT value FROM rnkMETHODNAME where type=%s and ln=%s and id_rnkMETHOD=%s", (type, ln, rnkid)) if not res: res = run_sql("SELECT value FROM rnkMETHODNAME WHERE ln=%s and id_rnkMETHOD=%s and type=%s", (CFG_SITE_LANG, rnkid, type)) if not res: return rank_method_code return res[0][0] else: raise Exception except Exception: write_message("Cannot run rank method, either given code for method is wrong, or it has not been added using the webinterface.") raise Exception def single_tag_rank_method(run): return bibrank_engine(run) def showtime(timeused): """Show time used for method""" write_message("Time used: %d second(s)." % timeused, verbose=9) def citation(run): return bibrank_engine(run) # Hack to put index based sorting here, but this is very similar to tag #based method and should re-use a lot of this code, so better to have here #than separate # def index_term_count_exec(rank_method_code, name, config): """Creating the rank method data""" write_message("Recreating index weighting data") begin_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) # we must recalculate these every time for all records, since the # weighting of a record is determined by the index entries of _other_ # records rnkset = calculate_index_term_count(config) intoDB(rnkset, begin_date, rank_method_code) def calculate_index_term_count(config): """Calculate the weight of a record set based on number of enries of a tag from the record in another index...useful for authority files""" records = [] if config.has_section("index_term_count"): index = config.get("index_term_count","index_table_name") tag = config.get("index_term_count","index_term_value_from_tag") # check against possible SQL injection: dummy = get_table_update_time(index) tag = wash_table_column_name(tag) else: raise Exception("Config file " + config + " does not have index_term_count section") return() task_sleep_now_if_required(can_stop_too=True) write_message("......Processing all records") query = "SELECT id_bibrec, value FROM bib%sx, bibrec_bib%sx WHERE tag=%%s AND id_bibxxx=id" % \ (tag[0:2], tag[0:2]) # we checked that tag is safe records = list(run_sql(query, (tag,))) write_message("Number of records found with the necessary tags: %s" % len(records)) rnkset = {} for key, value in records: hits = 0 if len(value): query = "SELECT hitlist from %s where term = %%s" % index # we checked that index is a table row = run_sql(query, (value,)) if row and row[0] and row[0][0]: #has to be prepared for corrupted data! 
try: hits = len(intbitset(row[0][0])) except: hits = 0 rnkset[key] = hits write_message("Number of records available in rank method: %s" % len(rnkset)) return rnkset def index_term_count(run): return bibrank_engine(run) diff --git a/modules/bibupload/lib/bibupload.py b/modules/bibupload/lib/bibupload.py index 26906b70e..adae6d851 100644 --- a/modules/bibupload/lib/bibupload.py +++ b/modules/bibupload/lib/bibupload.py @@ -1,2164 +1,2175 @@ # -*- coding: utf-8 -*- ## ## This file is part of Invenio. ## Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ BibUpload: Receive MARC XML file and update the appropriate database tables according to options. """ __revision__ = "$Id$" import os import re import sys import time from zlib import compress import socket import marshal import copy import tempfile import urlparse import urllib2 from invenio.config import CFG_OAI_ID_FIELD, \ CFG_BIBUPLOAD_REFERENCE_TAG, \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG, \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG, \ CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG, \ CFG_BIBUPLOAD_STRONG_TAGS, \ CFG_BIBUPLOAD_CONTROLLED_PROVENANCE_TAGS, \ CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE, \ - CFG_SITE_URL, CFG_SITE_RECORD + CFG_SITE_URL, CFG_SITE_RECORD, \ + CFG_OAI_PROVENANCE_ALTERED_SUBFIELD from invenio.jsonutils import json, CFG_JSON_AVAILABLE from invenio.bibupload_config import CFG_BIBUPLOAD_CONTROLFIELD_TAGS, \ CFG_BIBUPLOAD_SPECIAL_TAGS from invenio.dbquery import run_sql, \ Error from invenio.bibrecord import create_records, \ record_add_field, \ record_delete_field, \ record_xml_output, \ record_get_field_instances, \ record_get_field_values, \ field_get_subfield_values, \ field_get_subfield_instances, \ record_modify_subfield, \ record_delete_subfield_from, \ record_delete_fields, \ record_add_subfield_into, \ record_find_field, \ record_extract_oai_id from invenio.search_engine import get_record from invenio.dateutils import convert_datestruct_to_datetext from invenio.errorlib import register_exception from invenio.intbitset import intbitset from invenio.config import CFG_WEBSUBMIT_FILEDIR from invenio.bibtask import task_init, write_message, \ task_set_option, task_get_option, task_get_task_param, task_update_status, \ task_update_progress, task_sleep_now_if_required, fix_argv_paths from invenio.bibdocfile import BibRecDocs, file_strip_ext, normalize_format, \ get_docname_from_url, check_valid_url, download_url, \ KEEP_OLD_VALUE, decompose_bibdocfile_url, InvenioWebSubmitFileError, \ bibdocfile_url_p, CFG_BIBDOCFILE_AVAILABLE_FLAGS, guess_format_from_url from invenio.search_engine import search_pattern #Statistic variables stat = {} stat['nb_records_to_upload'] = 0 stat['nb_records_updated'] = 0 stat['nb_records_inserted'] = 0 stat['nb_errors'] = 0 stat['nb_holdingpen'] = 0 stat['exectime'] = time.localtime() _WRITING_RIGHTS = None ## 
Let's set a reasonable timeout for URL request (e.g. FFT) socket.setdefaulttimeout(40) _re_find_001 = re.compile('\\s*(\\d*)\\s*', re.S) def bibupload_pending_recids(): """This function embed a bit of A.I. and is more a hack than an elegant algorithm. It should be updated in case bibupload/bibsched are modified in incompatible ways. This function return the intbitset of all the records that are being (or are scheduled to be) touched by other bibuploads. """ options = run_sql("""SELECT arguments FROM schTASK WHERE status<>'DONE' AND proc='bibupload' AND (status='RUNNING' OR status='CONTINUING' OR status='WAITING' OR status='SCHEDULED' OR status='ABOUT TO STOP' OR status='ABOUT TO SLEEP')""") ret = intbitset() xmls = [] if options: for arguments in options: arguments = marshal.loads(arguments[0]) for argument in arguments[1:]: if argument.startswith('/'): # XMLs files are recognizable because they're absolute # files... xmls.append(argument) for xmlfile in xmls: # Let's grep for the 001 try: xml = open(xmlfile).read() ret += [int(group[1]) for group in _re_find_001.findall(xml)] except: continue return ret ### bibupload engine functions: def bibupload(record, opt_tag=None, opt_mode=None, opt_stage_to_start_from=1, opt_notimechange=0, oai_rec_id = "", pretend=False): """Main function: process a record and fit it in the tables bibfmt, bibrec, bibrec_bibxxx, bibxxx with proper record metadata. Return (error_code, recID) of the processed record. """ assert(opt_mode in ('insert', 'replace', 'replace_or_insert', 'reference', 'correct', 'append', 'format', 'holdingpen', 'delete')) error = None # If there are special tags to proceed check if it exists in the record if opt_tag is not None and not(record.has_key(opt_tag)): msg = " Failed: Tag not found, enter a valid tag to update." write_message(msg, verbose=1, stream=sys.stderr) return (1, -1, msg) # Extraction of the Record Id from 001, SYSNO or OAIID tags: rec_id = retrieve_rec_id(record, opt_mode, pretend=pretend) if rec_id == -1: msg = " Failed: either the record already exists and insert was " \ "requested or the record does not exists and " \ "replace/correct/append has been used" write_message(msg, verbose=1, stream=sys.stderr) return (1, -1, msg) elif rec_id > 0: write_message(" -Retrieve record ID (found %s): DONE." % rec_id, verbose=2) if not record.has_key('001'): # Found record ID by means of SYSNO or OAIID, and the # input MARCXML buffer does not have this 001 tag, so we # should add it now: error = record_add_field(record, '001', controlfield_value=rec_id) if error is None: msg = " Failed: Error during adding the 001 controlfield " \ "to the record" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) else: error = None write_message(" -Added tag 001: DONE.", verbose=2) write_message(" -Check if the xml marc file is already in the database: DONE" , verbose=2) # Reference mode check if there are reference tag if opt_mode == 'reference': error = extract_tag_from_record(record, CFG_BIBUPLOAD_REFERENCE_TAG) if error is None: msg = " Failed: No reference tags has been found..." 
write_message(msg, verbose=1, stream=sys.stderr) return (1, -1, msg) else: error = None write_message(" -Check if reference tags exist: DONE", verbose=2) record_deleted_p = False if opt_mode == 'insert' or \ (opt_mode == 'replace_or_insert') and rec_id is None: insert_mode_p = True # Insert the record into the bibrec databases to have a recordId rec_id = create_new_record(pretend=pretend) write_message(" -Creation of a new record id (%d): DONE" % rec_id, verbose=2) # we add the record Id control field to the record error = record_add_field(record, '001', controlfield_value=rec_id) if error is None: msg = " Failed: Error during adding the 001 controlfield " \ "to the record" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) else: error = None elif opt_mode != 'insert' and opt_mode != 'format' and \ opt_stage_to_start_from != 5: insert_mode_p = False # Update Mode # Retrieve the old record to update rec_old = get_record(rec_id) + record_had_altered_bit = record_get_field_values(rec_old, CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4], CFG_OAI_PROVENANCE_ALTERED_SUBFIELD) # Also save a copy to restore previous situation in case of errors original_record = get_record(rec_id) if rec_old is None: msg = " Failed during the creation of the old record!" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) else: write_message(" -Retrieve the old record to update: DONE", verbose=2) # In Replace mode, take over old strong tags if applicable: if opt_mode == 'replace' or \ opt_mode == 'replace_or_insert': copy_strong_tags_from_old_record(record, rec_old) # Delete tags to correct in the record if opt_mode == 'correct' or opt_mode == 'reference': delete_tags_to_correct(record, rec_old, opt_tag) write_message(" -Delete the old tags to correct in the old record: DONE", verbose=2) # Delete tags specified if in delete mode if opt_mode == 'delete': record = delete_tags(record, rec_old) write_message(" -Delete specified tags in the old record: DONE", verbose=2) # Append new tag to the old record and update the new record with the old_record modified if opt_mode == 'append' or opt_mode == 'correct' or \ opt_mode == 'reference': record = append_new_tag_to_old_record(record, rec_old, opt_tag, opt_mode) write_message(" -Append new tags to the old record: DONE", verbose=2) + # if record_had_altered_bit, this must be set to true, since the + # record has been altered. 
+ if record_had_altered_bit: + oai_provenance_fields = record_get_field_instances(record, CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4]) + for oai_provenance_field in oai_provenance_fields: + for i, (code, dummy_value) in enumerate(oai_provenance_field[0]): + if code == CFG_OAI_PROVENANCE_ALTERED_SUBFIELD: + oai_provenance_field[0][i] = (code, 'true') + # now we clear all the rows from bibrec_bibxxx from the old # record (they will be populated later (if needed) during # stage 4 below): delete_bibrec_bibxxx(rec_old, rec_id, pretend=pretend) record_deleted_p = True write_message(" -Clean bibrec_bibxxx: DONE", verbose=2) write_message(" -Stage COMPLETED", verbose=2) try: if not record_is_valid(record): msg = "ERROR: record is not valid" write_message(msg, verbose=1, stream=sys.stderr) return (1, -1, msg) # Have a look if we have FMT tags write_message("Stage 1: Start (Insert of FMT tags if exist).", verbose=2) if opt_stage_to_start_from <= 1 and \ extract_tag_from_record(record, 'FMT') is not None: record = insert_fmt_tags(record, rec_id, opt_mode, pretend=pretend) if record is None: msg = " Stage 1 failed: Error while inserting FMT tags" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) elif record == 0: # Mode format finished stat['nb_records_updated'] += 1 return (0, int(rec_id), "") write_message(" -Stage COMPLETED", verbose=2) else: write_message(" -Stage NOT NEEDED", verbose=2) # Have a look if we have FFT tags write_message("Stage 2: Start (Process FFT tags if exist).", verbose=2) record_had_FFT = False if opt_stage_to_start_from <= 2 and \ extract_tag_from_record(record, 'FFT') is not None: record_had_FFT = True if not writing_rights_p(): write_message(" Stage 2 failed: Error no rights to write fulltext files", verbose=1, stream=sys.stderr) task_update_status("ERROR") sys.exit(1) try: record = elaborate_fft_tags(record, rec_id, opt_mode, pretend=pretend) except Exception, e: register_exception() msg = " Stage 2 failed: Error while elaborating FFT tags: %s" % e write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) if record is None: msg = " Stage 2 failed: Error while elaborating FFT tags" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) write_message(" -Stage COMPLETED", verbose=2) else: write_message(" -Stage NOT NEEDED", verbose=2) # Have a look if we have FFT tags write_message("Stage 2B: Start (Synchronize 8564 tags).", verbose=2) has_bibdocs = run_sql("SELECT count(id_bibdoc) FROM bibrec_bibdoc JOIN bibdoc ON id_bibdoc=id WHERE id_bibrec=%s AND status<>'DELETED'", (rec_id, ))[0][0] > 0 if opt_stage_to_start_from <= 2 and (has_bibdocs or record_had_FFT or extract_tag_from_record(record, '856') is not None): try: record = synchronize_8564(rec_id, record, record_had_FFT, pretend=pretend) except Exception, e: register_exception(alert_admin=True) msg = " Stage 2B failed: Error while synchronizing 8564 tags: %s" % e write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) if record is None: msg = " Stage 2B failed: Error while synchronizing 8564 tags" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) write_message(" -Stage COMPLETED", verbose=2) else: write_message(" -Stage NOT NEEDED", verbose=2) # Update of the BibFmt write_message("Stage 3: Start (Update bibfmt).", verbose=2) if opt_stage_to_start_from <= 3: # format the single record as xml rec_xml_new = record_xml_output(record) # Update 
bibfmt with the format xm of this record if opt_mode != 'format': error = update_bibfmt_format(rec_id, rec_xml_new, 'xm', pretend=pretend) if error == 1: msg = " Failed: error during update_bibfmt_format 'xm'" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) if CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE: error = update_bibfmt_format(rec_id, marshal.dumps(record), 'recstruct', pretend=pretend) if error == 1: msg = " Failed: error during update_bibfmt_format 'recstruct'" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) # archive MARCXML format of this record for version history purposes: error = archive_marcxml_for_history(rec_id, pretend=pretend) if error == 1: msg = " Failed to archive MARCXML for history" write_message(msg, verbose=1, stream=sys.stderr) return (1, int(rec_id), msg) else: write_message(" -Archived MARCXML for history : DONE", verbose=2) write_message(" -Stage COMPLETED", verbose=2) # Update the database MetaData write_message("Stage 4: Start (Update the database with the metadata).", verbose=2) if opt_stage_to_start_from <= 4: if opt_mode in ('insert', 'replace', 'replace_or_insert', 'append', 'correct', 'reference', 'delete'): update_database_with_metadata(record, rec_id, oai_rec_id, pretend=pretend) record_deleted_p = False else: write_message(" -Stage NOT NEEDED in mode %s" % opt_mode, verbose=2) write_message(" -Stage COMPLETED", verbose=2) else: write_message(" -Stage NOT NEEDED", verbose=2) # Finally we update the bibrec table with the current date write_message("Stage 5: Start (Update bibrec table with current date).", verbose=2) if opt_stage_to_start_from <= 5 and \ opt_notimechange == 0 and \ not insert_mode_p: now = convert_datestruct_to_datetext(time.localtime()) write_message(" -Retrieved current localtime: DONE", verbose=2) update_bibrec_modif_date(now, rec_id, pretend=pretend) write_message(" -Stage COMPLETED", verbose=2) else: write_message(" -Stage NOT NEEDED", verbose=2) # Increase statistics if insert_mode_p: stat['nb_records_inserted'] += 1 else: stat['nb_records_updated'] += 1 # Upload of this record finish write_message("Record "+str(rec_id)+" DONE", verbose=1) return (0, int(rec_id), "") finally: if record_deleted_p: ## BibUpload has failed living the record deleted. We should ## back the original record then. update_database_with_metadata(original_record, rec_id, oai_rec_id, pretend=pretend) write_message(" Restored original record", verbose=1, stream=sys.stderr) def record_is_valid(record): """ Check if the record is valid. Currently this simply checks if the record has exactly one rec_id. @param record: the record @type record: recstruct @return: True if the record is valid @rtype: bool """ rec_ids = record_get_field_values(record, tag="001") if len(rec_ids) != 1: write_message(" The record is not valid: it has not a single rec_id: %s" % (rec_ids), stream=sys.stderr) return False return True def find_record_ids_by_oai_id(oaiId): """ A method finding the records identifier provided the oai identifier returns a list of identifiers matching a given oai identifier """ # Is this record already in invenio (matching by oaiid) if oaiId: recids = search_pattern(p=oaiId, f=CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG, m='e') # Is this record already in invenio (matching by reportnumber i.e. # particularly 037. 
        # Idea: to avoid double insertions)
        repnumber = oaiId.split(":")[-1]
        if repnumber:
            recids |= search_pattern(p=repnumber,
                                     f="reportnumber",
                                     m='e')
        # Is this record already in invenio (matching by reportnumber i.e.
        # particularly 037. Idea: to avoid double insertions)
        repnumber = "arXiv:" + oaiId.split(":")[-1]
        recids |= search_pattern(p=repnumber,
                                 f="reportnumber",
                                 m='e')
        return recids
    else:
        return intbitset()

def insert_record_into_holding_pen(record, oai_id, pretend=False):
    query = "INSERT INTO bibHOLDINGPEN (oai_id, changeset_date, changeset_xml, id_bibrec) VALUES (%s, NOW(), %s, %s)"
    xml_record = record_xml_output(record)
    # determine the identifier of the record
    bibrec_ids = find_record_ids_by_oai_id(oai_id)
    if len(bibrec_ids) > 0:
        bibrec_id = bibrec_ids.pop()
    else:
        # id not found by using the oai_id, let's use a wider search based
        # on any information we might have.
        bibrec_id = retrieve_rec_id(record, 'holdingpen', pretend=pretend)
        if bibrec_id is None:
            bibrec_id = 0
    if not pretend:
        run_sql(query, (oai_id, xml_record, bibrec_id))
    # record_id is logged as 0! (We are not inserting into the main database.)
    log_record_uploading(oai_id, task_get_task_param('task_id', 0), 0, 'H', pretend=pretend)
    stat['nb_holdingpen'] += 1

def print_out_bibupload_statistics():
    """Print the statistics of the process."""
    out = "Task stats: %(nb_input)d input records, %(nb_updated)d updated, " \
          "%(nb_inserted)d inserted, %(nb_errors)d errors, " \
          "%(nb_holdingpen)d inserted to holding pen. " \
          "Time %(nb_sec).2f sec." % {
              'nb_input': stat['nb_records_to_upload'],
              'nb_updated': stat['nb_records_updated'],
              'nb_inserted': stat['nb_records_inserted'],
              'nb_errors': stat['nb_errors'],
              'nb_holdingpen': stat['nb_holdingpen'],
              'nb_sec': time.time() - time.mktime(stat['exectime'])}
    write_message(out)

def open_marc_file(path):
    """Open a file and return its data."""
    try:
        # open the file containing the marc document
        marc_file = open(path, 'r')
        marc = marc_file.read()
        marc_file.close()
    except IOError, erro:
        write_message("Error: %s" % erro, verbose=1, stream=sys.stderr)
        write_message("Exiting.", stream=sys.stderr)
        task_update_status("ERROR")
        sys.exit(1)
    return marc

def xml_marc_to_records(xml_marc):
    """Create the records."""
    # Creation of the records from the XML MARC in argument
    recs = create_records(xml_marc, 1, 1)
    if recs == []:
        write_message("Error: Cannot parse MARCXML file.",
                      verbose=1, stream=sys.stderr)
        write_message("Exiting.", stream=sys.stderr)
        task_update_status("ERROR")
        sys.exit(1)
    elif recs[0][0] is None:
        write_message("Error: MARCXML file has wrong format: %s" % recs,
                      verbose=1, stream=sys.stderr)
        write_message("Exiting.", stream=sys.stderr)
        task_update_status("ERROR")
        sys.exit(1)
    else:
        recs = map((lambda x: x[0]), recs)
        return recs

def find_record_format(rec_id, format):
    """Look whether record REC_ID is formatted in FORMAT, i.e. whether
    FORMAT exists in the bibfmt table for this record.

    Return the number of times it is formatted: 0 if not, 1 if yes,
    2 if found more than once (should never occur).
    """
    out = 0
    query = """SELECT COUNT(id) FROM bibfmt WHERE id_bibrec=%s AND format=%s"""
    params = (rec_id, format)
    res = []
    try:
        res = run_sql(query, params)
        out = res[0][0]
    except Error, error:
        write_message(" Error during find_record_format() : %s " % error,
                      verbose=1, stream=sys.stderr)
    return out

def find_record_from_recid(rec_id):
    """
    Try to find record in the database from the REC_ID number.
    Return record ID if found, None otherwise.
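
    Illustrative behaviour (a sketch with made-up recids, not a doctest
    from the original module):

        find_record_from_recid(42)     # -> 42, when bibrec row 42 exists
        find_record_from_recid(999999) # -> None, when no such row exists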
""" try: res = run_sql("SELECT id FROM bibrec WHERE id=%s", (rec_id,)) except Error, error: write_message(" Error during find_record_bibrec() : %s " % error, verbose=1, stream=sys.stderr) if res: return res[0][0] else: return None def find_record_from_sysno(sysno): """ Try to find record in the database from the external SYSNO number. Return record ID if found, None otherwise. """ bibxxx = 'bib'+CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[0:2]+'x' bibrec_bibxxx = 'bibrec_' + bibxxx try: res = run_sql("""SELECT bb.id_bibrec FROM %(bibrec_bibxxx)s AS bb, %(bibxxx)s AS b WHERE b.tag=%%s AND b.value=%%s AND bb.id_bibxxx=b.id""" % \ {'bibxxx': bibxxx, 'bibrec_bibxxx': bibrec_bibxxx}, (CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG, sysno,)) except Error, error: write_message(" Error during find_record_from_sysno(): %s " % error, verbose=1, stream=sys.stderr) if res: return res[0][0] else: return None def find_records_from_extoaiid(extoaiid, extoaisrc=None): """ Try to find records in the database from the external EXTOAIID number. Return list of record ID if found, None otherwise. """ assert(CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:5] == CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG[:5]) bibxxx = 'bib'+CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[0:2]+'x' bibrec_bibxxx = 'bibrec_' + bibxxx try: write_message(' Looking for extoaiid="%s" with extoaisrc="%s"' % (extoaiid, extoaisrc), verbose=9) id_bibrecs = intbitset(run_sql("""SELECT bb.id_bibrec FROM %(bibrec_bibxxx)s AS bb, %(bibxxx)s AS b WHERE b.tag=%%s AND b.value=%%s AND bb.id_bibxxx=b.id""" % \ {'bibxxx': bibxxx, 'bibrec_bibxxx': bibrec_bibxxx}, (CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG, extoaiid,))) write_message(' Partially found %s for extoaiid="%s"' % (id_bibrecs, extoaiid), verbose=9) ret = intbitset() for id_bibrec in id_bibrecs: record = get_record(id_bibrec) instances = record_get_field_instances(record, CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[0:3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4]) write_message(' recid %s -> instances "%s"' % (id_bibrec, instances), verbose=9) for instance in instances: this_extoaisrc = field_get_subfield_values(instance, CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG[5]) this_extoaisrc = this_extoaisrc and this_extoaisrc[0] or None this_extoaiid = field_get_subfield_values(instance, CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5]) this_extoaiid = this_extoaiid and this_extoaiid[0] or None write_message(" this_extoaisrc -> %s, this_extoaiid -> %s" % (this_extoaisrc, this_extoaiid), verbose=9) if this_extoaiid == extoaiid: write_message(' recid %s -> provenance "%s"' % (id_bibrec, this_extoaisrc), verbose=9) if this_extoaisrc == extoaisrc: write_message('Found recid %s for extoaiid="%s" with provenance="%s"' % (id_bibrec, extoaiid, extoaisrc), verbose=9) ret.add(id_bibrec) break if this_extoaisrc is None: write_message('WARNING: Found recid %s for extoaiid="%s" that doesn\'t specify any provenance, while input record does.' % (id_bibrec, extoaiid), stream=sys.stderr) if extoaisrc is None: write_message('WARNING: Found recid %s for extoaiid="%s" that specify a provenance (%s), while input record does not have a provenance.' % (id_bibrec, extoaiid, this_extoaisrc), stream=sys.stderr) return ret except Error, error: write_message(" Error during find_records_from_extoaiid(): %s " % error, verbose=1, stream=sys.stderr) raise def find_record_from_oaiid(oaiid): """ Try to find record in the database from the OAI ID number and OAI SRC. Return record ID if found, None otherwise. 
""" bibxxx = 'bib'+CFG_OAI_ID_FIELD[0:2]+'x' bibrec_bibxxx = 'bibrec_' + bibxxx try: res = run_sql("""SELECT bb.id_bibrec FROM %(bibrec_bibxxx)s AS bb, %(bibxxx)s AS b WHERE b.tag=%%s AND b.value=%%s AND bb.id_bibxxx=b.id""" % \ {'bibxxx': bibxxx, 'bibrec_bibxxx': bibrec_bibxxx}, (CFG_OAI_ID_FIELD, oaiid,)) except Error, error: write_message(" Error during find_record_from_oaiid(): %s " % error, verbose=1, stream=sys.stderr) if res: return res[0][0] else: return None def extract_tag_from_record(record, tag_number): """ Extract the tag_number for record.""" # first step verify if the record is not already in the database if record: return record.get(tag_number, None) return None def retrieve_rec_id(record, opt_mode, pretend=False): """Retrieve the record Id from a record by using tag 001 or SYSNO or OAI ID tag. opt_mod is the desired mode.""" rec_id = None # 1st step: we look for the tag 001 tag_001 = extract_tag_from_record(record, '001') if tag_001 is not None: # We extract the record ID from the tag rec_id = tag_001[0][3] # if we are in insert mode => error if opt_mode == 'insert': write_message(" Failed: tag 001 found in the xml" \ " submitted, you should use the option replace," \ " correct or append to replace an existing" \ " record. (-h for help)", verbose=1, stream=sys.stderr) return -1 else: # we found the rec id and we are not in insert mode => continue # we try to match rec_id against the database: if find_record_from_recid(rec_id) is not None: # okay, 001 corresponds to some known record return int(rec_id) elif opt_mode in ('replace', 'replace_or_insert'): if task_get_option('force'): # we found the rec_id but it's not in the system and we are # requested to replace records. Therefore we create on the fly # a empty record allocating the recid. write_message(" Warning: tag 001 found in the xml with" " value %(rec_id)s, but rec_id %(rec_id)s does" " not exist. Since the mode replace was" " requested the rec_id %(rec_id)s is allocated" " on-the-fly." % {"rec_id" : rec_id}, stream=sys.stderr) return create_new_record(rec_id=rec_id, pretend=pretend) else: # Since --force was not used we are going to raise an error write_message(" Failed: tag 001 found in the xml" " submitted with value %(rec_id)s. The" " corresponding record however does not" " exists. If you want to really create" " such record, please use the --force" " parameter when calling bibupload." % { "rec_id": rec_id}, stream=sys.stderr) return -1 else: # The record doesn't exist yet. We shall have try to check # the SYSNO or OAI id later. write_message(" -Tag 001 value not found in database.", verbose=9) rec_id = None else: write_message(" -Tag 001 not found in the xml marc file.", verbose=9) if rec_id is None: # 2nd step we look for the SYSNO sysnos = record_get_field_values(record, CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[0:3], CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[3:4] or "", CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[4:5] or "", CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[5:6]) if sysnos: sysno = sysnos[0] # there should be only one external SYSNO write_message(" -Checking if SYSNO " + sysno + \ " exists in the database", verbose=9) # try to find the corresponding rec id from the database rec_id = find_record_from_sysno(sysno) if rec_id is not None: # rec_id found pass else: # The record doesn't exist yet. We will try to check # external and internal OAI ids later. 
write_message(" -Tag SYSNO value not found in database.", verbose=9) rec_id = None else: write_message(" -Tag SYSNO not found in the xml marc file.", verbose=9) if rec_id is None: # 2nd step we look for the external OAIID extoai_fields = record_get_field_instances(record, CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[0:3], CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3:4] or "", CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5] != "_" and \ CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4:5] or "") if extoai_fields: for field in extoai_fields: extoaiid = field_get_subfield_values(field, CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5:6]) extoaisrc = field_get_subfield_values(field, CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG[5:6]) if extoaiid: extoaiid = extoaiid[0] if extoaisrc: extoaisrc = extoaisrc[0] else: extoaisrc = None write_message(" -Checking if EXTOAIID %s (%s) exists in the database" % (extoaiid, extoaisrc), verbose=9) # try to find the corresponding rec id from the database try: rec_ids = find_records_from_extoaiid(extoaiid, extoaisrc) except Error, e: write_message(e, verbose=1, stream=sys.stderr) return -1 if rec_ids: # rec_id found rec_id = rec_ids.pop() break else: # The record doesn't exist yet. We will try to check # OAI id later. write_message(" -Tag EXTOAIID value not found in database.", verbose=9) rec_id = None else: write_message(" -Tag EXTOAIID not found in the xml marc file.", verbose=9) if rec_id is None: # 4th step we look for the OAI ID oaiidvalues = record_get_field_values(record, CFG_OAI_ID_FIELD[0:3], CFG_OAI_ID_FIELD[3:4] != "_" and \ CFG_OAI_ID_FIELD[3:4] or "", CFG_OAI_ID_FIELD[4:5] != "_" and \ CFG_OAI_ID_FIELD[4:5] or "", CFG_OAI_ID_FIELD[5:6]) if oaiidvalues: oaiid = oaiidvalues[0] # there should be only one OAI ID write_message(" -Check if local OAI ID " + oaiid + \ " exist in the database", verbose=9) # try to find the corresponding rec id from the database rec_id = find_record_from_oaiid(oaiid) if rec_id is not None: # rec_id found pass else: write_message(" -Tag OAI ID value not found in database.", verbose=9) rec_id = None else: write_message(" -Tag SYSNO not found in the xml marc file.", verbose=9) # Now we should have detected rec_id from SYSNO or OAIID # tags. (None otherwise.) if rec_id: if opt_mode == 'insert': write_message(" Failed : Record found in the database," \ " you should use the option replace," \ " correct or append to replace an existing" \ " record. (-h for help)", verbose=1, stream=sys.stderr) return -1 else: if opt_mode != 'insert' and \ opt_mode != 'replace_or_insert': write_message(" Failed : Record not found in the database."\ " Please insert the file before updating it."\ " (-h for help)", verbose=1, stream=sys.stderr) return -1 return rec_id and int(rec_id) or None ### Insert functions def create_new_record(rec_id=None, pretend=False): """ Create new record in the database @param rec_id: if specified the new record will have this rec_id. @type rec_id: int @return: the allocated rec_id @rtype: int @note: in case of errors will be returned None """ if rec_id is not None: try: rec_id = int(rec_id) except (ValueError, TypeError), error: write_message(" Error during the creation_new_record function : %s " % error, verbose=1, stream=sys.stderr) return None if run_sql("SELECT id FROM bibrec WHERE id=%s", (rec_id, )): write_message(" Error during the creation_new_record function : the requested rec_id %s already exists." 
                          % rec_id)
            return None
    if pretend:
        if rec_id:
            return rec_id
        else:
            return run_sql("SELECT max(id)+1 FROM bibrec")[0][0]
    try:
        if rec_id is not None:
            return run_sql("INSERT INTO bibrec (id, creation_date, modification_date) VALUES (%s, NOW(), NOW())",
                           (rec_id, ))
        else:
            return run_sql("INSERT INTO bibrec (creation_date, modification_date) VALUES (NOW(), NOW())")
    except Error, error:
        write_message(" Error during the create_new_record function : %s "
                      % error, verbose=1, stream=sys.stderr)
        return None

def insert_bibfmt(id_bibrec, marc, format,
                  modification_date='1970-01-01 00:00:00', pretend=False):
    """Insert the format in the table bibfmt."""
    # compress the marc value
    pickled_marc = compress(marc)
    try:
        time.strptime(modification_date, "%Y-%m-%d %H:%M:%S")
    except ValueError:
        modification_date = '1970-01-01 00:00:00'
    query = """INSERT INTO bibfmt (id_bibrec, format, last_updated, value)
               VALUES (%s, %s, %s, %s)"""
    try:
        if not pretend:
            row_id = run_sql(query, (id_bibrec, format,
                                     modification_date, pickled_marc))
            return row_id
        else:
            return 1
    except Error, error:
        write_message(" Error during the insert_bibfmt function : %s "
                      % error, verbose=1, stream=sys.stderr)
    return None

def insert_record_bibxxx(tag, value, pretend=False):
    """Insert the record into bibxxx."""
    # determine into which table one should insert the record
    table_name = 'bib' + tag[0:2] + 'x'
    # check if the tag, value combination exists in the table
    query = """SELECT id,value FROM %s """ % table_name
    query += """ WHERE tag=%s AND value=%s"""
    params = (tag, value)
    try:
        res = run_sql(query, params)
    except Error, error:
        write_message(" Error during the insert_record_bibxxx function : %s "
                      % error, verbose=1, stream=sys.stderr)
    # Note: compare now the found values one by one and look for
    # string binary equality (e.g. to respect lowercase/uppercase
    # match), regardless of the charset etc settings.  Ideally we
    # could use a BINARY operator in the above SELECT statement, but
    # we would have to check compatibility on various MySQLdb versions
    # etc; this approach checks all matched values in Python, not in
    # MySQL, which is less cool, but more conservative, so it should
    # work better on most setups.
    for row in res:
        row_id = row[0]
        row_value = row[1]
        if row_value == value:
            return (table_name, row_id)
    # We got here only when the tag,value combination was not found,
    # so it is now necessary to insert the tag,value combination into
    # the bibxxx table as new.
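    # (Illustrative example, not part of the original code: for tag
    # '100__a' the derived table is 'bib10x', so the statement built
    # below amounts to
    #
    #     INSERT INTO bib10x (tag, value) VALUES ('100__a', 'Ellis, J.')
    #
    # where the author value is made up.)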
query = """INSERT INTO %s """ % table_name query += """ (tag, value) values (%s , %s)""" params = (tag, value) try: if not pretend: row_id = run_sql(query, params) else: return (table_name, 1) except Error, error: write_message(" Error during the insert_record_bibxxx function : %s " % error, verbose=1, stream=sys.stderr) return (table_name, row_id) def insert_record_bibrec_bibxxx(table_name, id_bibxxx, field_number, id_bibrec, pretend=False): """Insert the record into bibrec_bibxxx""" # determine into which table one should insert the record full_table_name = 'bibrec_'+ table_name # insert the proper row into the table query = """INSERT INTO %s """ % full_table_name query += """(id_bibrec,id_bibxxx, field_number) values (%s , %s, %s)""" params = (id_bibrec, id_bibxxx, field_number) try: if not pretend: res = run_sql(query, params) else: return 1 except Error, error: write_message(" Error during the insert_record_bibrec_bibxxx" " function 2nd query : %s " % error, verbose=1, stream=sys.stderr) return res def synchronize_8564(rec_id, record, record_had_FFT, pretend=False): """ Synchronize 8564_ tags and BibDocFile tables. This function directly manipulate the record parameter. @type rec_id: positive integer @param rec_id: the record identifier. @param record: the record structure as created by bibrecord.create_record @type record_had_FFT: boolean @param record_had_FFT: True if the incoming bibuploaded-record used FFT @return: the manipulated record (which is also modified as a side effect) """ def merge_marc_into_bibdocfile(field, pretend=False): """ Internal function that reads a single field and store its content in BibDocFile tables. @param field: the 8564_ field containing a BibDocFile URL. """ write_message('Merging field: %s' % (field, ), verbose=9) url = field_get_subfield_values(field, 'u')[:1] or field_get_subfield_values(field, 'q')[:1] description = field_get_subfield_values(field, 'y')[:1] comment = field_get_subfield_values(field, 'z')[:1] if url: recid, docname, format = decompose_bibdocfile_url(url[0]) if recid != rec_id: write_message("INFO: URL %s is not pointing to a fulltext owned by this record (%s)" % (url, recid), stream=sys.stderr) else: try: bibdoc = BibRecDocs(recid).get_bibdoc(docname) if description and not pretend: bibdoc.set_description(description[0], format) if comment and not pretend: bibdoc.set_comment(comment[0], format) except InvenioWebSubmitFileError: ## Apparently the referenced docname doesn't exist anymore. ## Too bad. Let's skip it. write_message("WARNING: docname %s does not seem to exist for record %s. Has it been renamed outside FFT?" % (docname, recid), stream=sys.stderr) def merge_bibdocfile_into_marc(field, subfields): """ Internal function that reads BibDocFile table entries referenced by the URL in the given 8564_ field and integrate the given information directly with the provided subfields. @param field: the 8564_ field containing a BibDocFile URL. @param subfields: the subfields corresponding to the BibDocFile URL generated after BibDocFile tables. """ write_message('Merging subfields %s into field %s' % (subfields, field), verbose=9) subfields = dict(subfields) ## We make a copy not to have side-effects subfield_to_delete = [] for subfield_position, (code, value) in enumerate(field_get_subfield_instances(field)): ## For each subfield instance already existing... 
if code in subfields: ## ...We substitute it with what is in BibDocFile tables record_modify_subfield(record, '856', code, subfields[code], subfield_position, field_position_global=field[4]) del subfields[code] else: ## ...We delete it otherwise subfield_to_delete.append(subfield_position) subfield_to_delete.sort() for counter, position in enumerate(subfield_to_delete): ## FIXME: Very hackish algorithm. Since deleting a subfield ## will alterate the position of following subfields, we ## are taking note of this and adjusting further position ## by using a counter. record_delete_subfield_from(record, '856', position - counter, field_position_global=field[4]) subfields = subfields.items() subfields.sort() for code, value in subfields: ## Let's add non-previously existing subfields record_add_subfield_into(record, '856', code, value, field_position_global=field[4]) def get_bibdocfile_managed_info(): """ Internal function to eturns a dictionary of BibDocFile URL -> wanna-be subfields. @rtype: mapping @return: BibDocFile URL -> wanna-be subfields dictionary """ ret = {} bibrecdocs = BibRecDocs(rec_id) latest_files = bibrecdocs.list_latest_files(list_hidden=False) for afile in latest_files: url = afile.get_url() ret[url] = {'u' : url} description = afile.get_description() comment = afile.get_comment() subformat = afile.get_subformat() if description: ret[url]['y'] = description if comment: ret[url]['z'] = comment if subformat: ret[url]['x'] = subformat return ret write_message("Synchronizing MARC of recid '%s' with:\n%s" % (rec_id, record), verbose=9) tags856s = record_get_field_instances(record, '856', '%', '%') write_message("Original 856%% instances: %s" % tags856s, verbose=9) tags8564s_to_add = get_bibdocfile_managed_info() write_message("BibDocFile instances: %s" % tags8564s_to_add, verbose=9) positions_tags8564s_to_remove = [] for local_position, field in enumerate(tags856s): if field[1] == '4' and field[2] == ' ': write_message('Analysing %s' % (field, ), verbose=9) for url in field_get_subfield_values(field, 'u') + field_get_subfield_values(field, 'q'): if url in tags8564s_to_add: if record_had_FFT: merge_bibdocfile_into_marc(field, tags8564s_to_add[url]) else: merge_marc_into_bibdocfile(field, pretend=pretend) del tags8564s_to_add[url] break elif bibdocfile_url_p(url) and decompose_bibdocfile_url(url)[0] == rec_id: positions_tags8564s_to_remove.append(local_position) write_message("%s to be deleted and re-synchronized" % (field, ), verbose=9) break record_delete_fields(record, '856', positions_tags8564s_to_remove) tags8564s_to_add = tags8564s_to_add.values() tags8564s_to_add.sort() for subfields in tags8564s_to_add: subfields = subfields.items() subfields.sort() record_add_field(record, '856', '4', ' ', subfields=subfields) write_message('Final record: %s' % record, verbose=9) return record def elaborate_fft_tags(record, rec_id, mode, pretend=False): """ Process FFT tags that should contain $a with file pathes or URLs to get the fulltext from. This function enriches record with proper 8564 URL tags, downloads fulltext files and stores them into var/data structure where appropriate. CFG_BIBUPLOAD_WGET_SLEEP_TIME defines time to sleep in seconds in between URL downloads. Note: if an FFT tag contains multiple $a subfields, we upload them into different 856 URL tags in the metadata. See regression test case test_multiple_fft_insert_via_http(). """ # Let's define some handy sub procedure. 
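    # (Illustrative example, not part of the original code: a typical FFT
    # field processed here could look like the following MARCXML, where
    # the path and docname are made up:
    #
    #     <datafield tag="FFT" ind1=" " ind2=" ">
    #       <subfield code="a">/tmp/fulltext.pdf</subfield>
    #       <subfield code="n">fulltext</subfield>
    #       <subfield code="t">Main</subfield>
    #     </datafield>
    #
    # $a carries the location, $n the docname, $t the doctype, exactly as
    # parsed by the loop below.)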
def _add_new_format(bibdoc, url, format, docname, doctype, newname, description, comment, flags, pretend=False): """Adds a new format for a given bibdoc. Returns True when everything's fine.""" write_message('Add new format to %s url: %s, format: %s, docname: %s, doctype: %s, newname: %s, description: %s, comment: %s, flags: %s' % (repr(bibdoc), url, format, docname, doctype, newname, description, comment, flags), verbose=9) try: if not url: # Not requesting a new url. Just updating comment & description return _update_description_and_comment(bibdoc, docname, format, description, comment, flags, pretend=pretend) try: if not pretend: bibdoc.add_file_new_format(url, description=description, comment=comment, flags=flags) except StandardError, e: write_message("('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s') not inserted because format already exists (%s)." % (url, format, docname, doctype, newname, description, comment, flags, e), stream=sys.stderr) raise except Exception, e: write_message("Error in adding '%s' as a new format because of: %s" % (url, e), stream=sys.stderr) raise return True def _add_new_version(bibdoc, url, format, docname, doctype, newname, description, comment, flags, pretend=False): """Adds a new version for a given bibdoc. Returns True when everything's fine.""" write_message('Add new version to %s url: %s, format: %s, docname: %s, doctype: %s, newname: %s, description: %s, comment: %s, flags: %s' % (repr(bibdoc), url, format, docname, doctype, newname, description, comment, flags)) try: if not url: return _update_description_and_comment(bibdoc, docname, format, description, comment, flags, pretend=pretend) try: if not pretend: bibdoc.add_file_new_version(url, description=description, comment=comment, flags=flags) except StandardError, e: write_message("('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s') not inserted because '%s'." % (url, format, docname, doctype, newname, description, comment, flags, e), stream=sys.stderr) raise except Exception, e: write_message("Error in adding '%s' as a new version because of: %s" % (url, e), stream=sys.stderr) raise return True def _update_description_and_comment(bibdoc, docname, format, description, comment, flags, pretend=False): """Directly update comments and descriptions.""" write_message('Just updating description and comment for %s with format %s with description %s, comment %s and flags %s' % (docname, format, description, comment, flags), verbose=9) try: if not pretend: bibdoc.set_description(description, format) bibdoc.set_comment(comment, format) for flag in CFG_BIBDOCFILE_AVAILABLE_FLAGS: if flag in flags: bibdoc.set_flag(flag, format) else: bibdoc.unset_flag(flag, format) except StandardError, e: write_message("('%s', '%s', '%s', '%s', '%s') description and comment not updated because '%s'." % (docname, format, description, comment, flags, e)) raise return True if mode == 'delete': raise StandardError('FFT tag specified but bibupload executed in --delete mode') tuple_list = extract_tag_from_record(record, 'FFT') if tuple_list: # FFT Tags analysis write_message("FFTs: "+str(tuple_list), verbose=9) docs = {} # docnames and their data for fft in record_get_field_instances(record, 'FFT', ' ', ' '): # Let's discover the type of the document # This is a legacy field and will not be enforced any particular # check on it. doctype = field_get_subfield_values(fft, 't') if doctype: doctype = doctype[0] else: # Default is Main doctype = 'Main' # Let's discover the url. 
url = field_get_subfield_values(fft, 'a') if url: url = url[0] try: check_valid_url(url) except StandardError, e: raise StandardError, "fft '%s' specifies in $a a location ('%s') with problems: %s" % (fft, url, e) else: url = '' # Let's discover the description description = field_get_subfield_values(fft, 'd') if description != []: description = description[0] else: if mode == 'correct' and doctype != 'FIX-MARC': ## If the user require to correct, and do not specify ## a description this means she really want to ## modify the description. description = '' else: description = KEEP_OLD_VALUE # Let's discover the desired docname to be created/altered name = field_get_subfield_values(fft, 'n') if name: ## Let's remove undesired extensions name = file_strip_ext(name[0] + '.pdf') else: if url: name = get_docname_from_url(url) elif mode != 'correct' and doctype != 'FIX-MARC': raise StandardError, "Warning: fft '%s' doesn't specifies either a location in $a or a docname in $n" % str(fft) else: continue # Let's discover the desired new docname in case we want to change it newname = field_get_subfield_values(fft, 'm') if newname: newname = file_strip_ext(newname[0] + '.pdf') else: newname = name # Let's discover the desired format format = field_get_subfield_values(fft, 'f') if format: format = normalize_format(format[0]) else: if url: format = guess_format_from_url(url) else: format = "" # Let's discover the icon icon = field_get_subfield_values(fft, 'x') if icon != []: icon = icon[0] if icon != KEEP_OLD_VALUE: try: check_valid_url(icon) except StandardError, e: raise StandardError, "fft '%s' specifies in $x an icon ('%s') with problems: %s" % (fft, icon, e) else: icon = '' # Let's discover the comment comment = field_get_subfield_values(fft, 'z') if comment != []: comment = comment[0] else: if mode == 'correct' and doctype != 'FIX-MARC': ## See comment on description comment = '' else: comment = KEEP_OLD_VALUE # Let's discover the restriction restriction = field_get_subfield_values(fft, 'r') if restriction != []: restriction = restriction[0] else: if mode == 'correct' and doctype != 'FIX-MARC': ## See comment on description restriction = '' else: restriction = KEEP_OLD_VALUE version = field_get_subfield_values(fft, 'v') if version: version = version[0] else: version = '' flags = field_get_subfield_values(fft, 'o') for flag in flags: if flag not in CFG_BIBDOCFILE_AVAILABLE_FLAGS: raise StandardError, "fft '%s' specifies a non available flag: %s" % (fft, flag) if docs.has_key(name): # new format considered (doctype2, newname2, restriction2, version2, urls) = docs[name] if doctype2 != doctype: raise StandardError, "fft '%s' specifies a different doctype from previous fft with docname '%s'" % (str(fft), name) if newname2 != newname: raise StandardError, "fft '%s' specifies a different newname from previous fft with docname '%s'" % (str(fft), name) if restriction2 != restriction: raise StandardError, "fft '%s' specifies a different restriction from previous fft with docname '%s'" % (str(fft), name) if version2 != version: raise StandardError, "fft '%x' specifies a different version than the previous fft with docname '%s'" % (str(fft), name) for (url2, format2, description2, comment2, flags2) in urls: if format == format2: raise StandardError, "fft '%s' specifies a second file '%s' with the same format '%s' from previous fft with docname '%s'" % (str(fft), url, format, name) if url or format: urls.append((url, format, description, comment, flags)) if icon: urls.append((icon, 
icon[len(file_strip_ext(icon)):] + ';icon', description, comment, flags)) else: if url or format: docs[name] = (doctype, newname, restriction, version, [(url, format, description, comment, flags)]) if icon: docs[name][4].append((icon, icon[len(file_strip_ext(icon)):] + ';icon', description, comment, flags)) elif icon: docs[name] = (doctype, newname, restriction, version, [(icon, icon[len(file_strip_ext(icon)):] + ';icon', description, comment, flags)]) else: docs[name] = (doctype, newname, restriction, version, []) write_message('Result of FFT analysis:\n\tDocs: %s' % (docs,), verbose=9) # Let's remove all FFT tags record_delete_field(record, 'FFT', ' ', ' ') # Preprocessed data elaboration bibrecdocs = BibRecDocs(rec_id) ## Let's pre-download all the URLs to see if, in case of mode 'correct' or 'append' ## we can avoid creating a new revision. for docname, (doctype, newname, restriction, version, urls) in docs.items(): downloaded_urls = [] try: bibdoc = bibrecdocs.get_bibdoc(docname) except InvenioWebSubmitFileError: ## A bibdoc with the given docname does not exists. ## So there is no chance we are going to revise an existing ## format with an identical file :-) bibdoc = None new_revision_needed = False for url, format, description, comment, flags in urls: if url: try: downloaded_url = download_url(url, format) write_message("%s saved into %s" % (url, downloaded_url), verbose=9) except Exception, err: write_message("Error in downloading '%s' because of: %s" % (url, err), stream=sys.stderr) raise if mode == 'correct' and bibdoc is not None and not new_revision_needed: downloaded_urls.append((downloaded_url, format, description, comment, flags)) if not bibdoc.check_file_exists(downloaded_url): new_revision_needed = True else: write_message("WARNING: %s is already attached to bibdoc %s for recid %s" % (url, docname, rec_id), stream=sys.stderr) elif mode == 'append' and bibdoc is not None: if not bibdoc.check_file_exists(downloaded_url): downloaded_urls.append((downloaded_url, format, description, comment, flags)) else: write_message("WARNING: %s is already attached to bibdoc %s for recid %s" % (url, docname, rec_id), stream=sys.stderr) else: downloaded_urls.append((downloaded_url, format, description, comment, flags)) else: downloaded_urls.append(('', format, description, comment, flags)) if mode == 'correct' and bibdoc is not None and not new_revision_needed: ## Since we don't need a new revision (because all the files ## that are being uploaded are different) ## we can simply remove the urls but keep the other information write_message("No need to add a new revision for docname %s for recid %s" % (docname, rec_id), verbose=2) docs[docname] = (doctype, newname, restriction, version, [('', format, description, comment, flags) for (dummy, format, description, comment, flags) in downloaded_urls]) for downloaded_url, dummy, dummy, dummy, dummy in downloaded_urls: ## Let's free up some space :-) if downloaded_url and os.path.exists(downloaded_url): os.remove(downloaded_url) else: if downloaded_urls or mode != 'append': docs[docname] = (doctype, newname, restriction, version, downloaded_urls) else: ## In case we are in append mode and there are no urls to append ## we discard the whole FFT del docs[docname] if mode == 'replace': # First we erase previous bibdocs if not pretend: for bibdoc in bibrecdocs.list_bibdocs(): bibdoc.delete() bibrecdocs.build_bibdoc_list() for docname, (doctype, newname, restriction, version, urls) in docs.iteritems(): write_message("Elaborating olddocname: '%s', 
newdocname: '%s', doctype: '%s', restriction: '%s', urls: '%s', mode: '%s'" % (docname, newname, doctype, restriction, urls, mode), verbose=9) if mode in ('insert', 'replace'): # new bibdocs, new docnames, new marc if newname in bibrecdocs.get_bibdoc_names(): write_message("('%s', '%s') not inserted because docname already exists." % (newname, urls), stream=sys.stderr) raise StandardError try: if not pretend: bibdoc = bibrecdocs.add_bibdoc(doctype, newname) bibdoc.set_status(restriction) else: bibdoc = None except Exception, e: write_message("('%s', '%s', '%s') not inserted because: '%s'." % (doctype, newname, urls, e), stream=sys.stderr) raise StandardError for (url, format, description, comment, flags) in urls: assert(_add_new_format(bibdoc, url, format, docname, doctype, newname, description, comment, flags, pretend=pretend)) elif mode == 'replace_or_insert': # to be thought as correct_or_insert for bibdoc in bibrecdocs.list_bibdocs(): if bibdoc.get_docname() == docname: if doctype not in ('PURGE', 'DELETE', 'EXPUNGE', 'REVERT', 'FIX-ALL', 'FIX-MARC', 'DELETE-FILE'): if newname != docname: try: if not pretend: bibdoc.change_name(newname) ## Let's refresh the list of bibdocs. bibrecdocs.build_bibdoc_list() except StandardError, e: write_message(e, stream=sys.stderr) raise found_bibdoc = False for bibdoc in bibrecdocs.list_bibdocs(): if bibdoc.get_docname() == newname: found_bibdoc = True if doctype == 'PURGE': if not pretend: bibdoc.purge() elif doctype == 'DELETE': if not pretend: bibdoc.delete() elif doctype == 'EXPUNGE': if not pretend: bibdoc.expunge() elif doctype == 'FIX-ALL': if not pretend: bibrecdocs.fix(docname) elif doctype == 'FIX-MARC': pass elif doctype == 'DELETE-FILE': if urls: for (url, format, description, comment, flags) in urls: if not pretend: bibdoc.delete_file(format, version) elif doctype == 'REVERT': try: if not pretend: bibdoc.revert(version) except Exception, e: write_message('(%s, %s) not correctly reverted: %s' % (newname, version, e), stream=sys.stderr) raise else: if restriction != KEEP_OLD_VALUE: if not pretend: bibdoc.set_status(restriction) # Since the docname already existed we have to first # bump the version by pushing the first new file # then pushing the other files. if urls: (first_url, first_format, first_description, first_comment, first_flags) = urls[0] other_urls = urls[1:] assert(_add_new_version(bibdoc, first_url, first_format, docname, doctype, newname, first_description, first_comment, first_flags, pretend=pretend)) for (url, format, description, comment, flags) in other_urls: assert(_add_new_format(bibdoc, url, format, docname, doctype, newname, description, comment, flags, pretend=pretend)) ## Let's refresh the list of bibdocs. bibrecdocs.build_bibdoc_list() if not found_bibdoc: if not pretend: bibdoc = bibrecdocs.add_bibdoc(doctype, newname) for (url, format, description, comment, flags) in urls: assert(_add_new_format(bibdoc, url, format, docname, doctype, newname, description, comment, flags)) elif mode == 'correct': for bibdoc in bibrecdocs.list_bibdocs(): if bibdoc.get_docname() == docname: if doctype not in ('PURGE', 'DELETE', 'EXPUNGE', 'REVERT', 'FIX-ALL', 'FIX-MARC', 'DELETE-FILE'): if newname != docname: try: if not pretend: bibdoc.change_name(newname) ## Let's refresh the list of bibdocs. 
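                                ## (Illustrative note, not part of the
                                ## original code: BibRecDocs caches its
                                ## bibdoc list, so after change_name() the
                                ## cache is rebuilt below; otherwise the
                                ## following get_docname() == newname
                                ## comparisons would still see the old name.)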
bibrecdocs.build_bibdoc_list() except StandardError, e: write_message('Error in renaming %s to %s: %s' % (docname, newname, e), stream=sys.stderr) raise found_bibdoc = False for bibdoc in bibrecdocs.list_bibdocs(): if bibdoc.get_docname() == newname: found_bibdoc = True if doctype == 'PURGE': if not pretend: bibdoc.purge() elif doctype == 'DELETE': if not pretend: bibdoc.delete() elif doctype == 'EXPUNGE': if not pretend: bibdoc.expunge() elif doctype == 'FIX-ALL': if not pretend: bibrecdocs.fix(newname) elif doctype == 'FIX-MARC': pass elif doctype == 'DELETE-FILE': if urls: for (url, format, description, comment, flags) in urls: if not pretend: bibdoc.delete_file(format, version) elif doctype == 'REVERT': try: if not pretend: bibdoc.revert(version) except Exception, e: write_message('(%s, %s) not correctly reverted: %s' % (newname, version, e), stream=sys.stderr) raise else: if restriction != KEEP_OLD_VALUE: if not pretend: bibdoc.set_status(restriction) if urls: (first_url, first_format, first_description, first_comment, first_flags) = urls[0] other_urls = urls[1:] assert(_add_new_version(bibdoc, first_url, first_format, docname, doctype, newname, first_description, first_comment, first_flags, pretend=pretend)) for (url, format, description, comment, flags) in other_urls: assert(_add_new_format(bibdoc, url, format, docname, doctype, newname, description, comment, flags, pretend=pretend)) ## Let's refresh the list of bibdocs. bibrecdocs.build_bibdoc_list() if not found_bibdoc: if doctype in ('PURGE', 'DELETE', 'EXPUNGE', 'FIX-ALL', 'FIX-MARC', 'DELETE-FILE', 'REVERT'): write_message("('%s', '%s', '%s') not performed because '%s' docname didn't existed." % (doctype, newname, urls, docname), stream=sys.stderr) raise StandardError else: if not pretend: bibdoc = bibrecdocs.add_bibdoc(doctype, newname) for (url, format, description, comment, flags) in urls: assert(_add_new_format(bibdoc, url, format, docname, doctype, newname, description, comment, flags)) elif mode == 'append': try: found_bibdoc = False for bibdoc in bibrecdocs.list_bibdocs(): if bibdoc.get_docname() == docname: found_bibdoc = True for (url, format, description, comment, flags) in urls: assert(_add_new_format(bibdoc, url, format, docname, doctype, newname, description, comment, flags, pretend=pretend)) if not found_bibdoc: try: if not pretend: bibdoc = bibrecdocs.add_bibdoc(doctype, docname) bibdoc.set_status(restriction) for (url, format, description, comment, flags) in urls: assert(_add_new_format(bibdoc, url, format, docname, doctype, newname, description, comment, flags)) except Exception, e: register_exception() write_message("('%s', '%s', '%s') not appended because: '%s'." 
% (doctype, newname, urls, e), stream=sys.stderr) raise except: register_exception() raise return record def insert_fmt_tags(record, rec_id, opt_mode, pretend=False): """Process and insert FMT tags""" fmt_fields = record_get_field_instances(record, 'FMT') if fmt_fields: for fmt_field in fmt_fields: # Get the d, f, g subfields of the FMT tag try: d_value = field_get_subfield_values(fmt_field, "d")[0] except IndexError: d_value = "" try: f_value = field_get_subfield_values(fmt_field, "f")[0] except IndexError: f_value = "" try: g_value = field_get_subfield_values(fmt_field, "g")[0] except IndexError: g_value = "" # Update the format if not pretend: res = update_bibfmt_format(rec_id, g_value, f_value, d_value, pretend=pretend) if res == 1: write_message(" Failed: Error during update_bibfmt", verbose=1, stream=sys.stderr) # If we are in format mode, we only care about the FMT tag if opt_mode == 'format': return 0 # We delete the FMT Tag of the record record_delete_field(record, 'FMT') write_message(" -Delete field FMT from record : DONE", verbose=2) return record elif opt_mode == 'format': write_message(" Failed: Format updated failed : No tag FMT found", verbose=1, stream=sys.stderr) return None else: return record ### Update functions def update_bibrec_modif_date(now, bibrec_id, pretend=False): """Update the date of the record in bibrec table """ query = """UPDATE bibrec SET modification_date=%s WHERE id=%s""" params = (now, bibrec_id) try: if not pretend: run_sql(query, params) write_message(" -Update record modification date : DONE" , verbose=2) except Error, error: write_message(" Error during update_bibrec_modif_date function : %s" % error, verbose=1, stream=sys.stderr) def update_bibfmt_format(id_bibrec, format_value, format_name, modification_date=None, pretend=False): """Update the format in the table bibfmt""" if modification_date is None: modification_date = time.strftime('%Y-%m-%d %H:%M:%S') else: try: time.strptime(modification_date, "%Y-%m-%d %H:%M:%S") except ValueError: modification_date = '1970-01-01 00:00:00' # We check if the format is already in bibFmt nb_found = find_record_format(id_bibrec, format_name) if nb_found == 1: # we are going to update the format # compress the format_value value pickled_format_value = compress(format_value) # update the format: query = """UPDATE bibfmt SET last_updated=%s, value=%s WHERE id_bibrec=%s AND format=%s""" params = (modification_date, pickled_format_value, id_bibrec, format_name) try: if not pretend: row_id = run_sql(query, params) if not pretend and row_id is None: write_message(" Failed: Error during update_bibfmt_format function", verbose=1, stream=sys.stderr) return 1 else: write_message(" -Update the format %s in bibfmt : DONE" % format_name , verbose=2) return 0 except Error, error: write_message(" Error during the update_bibfmt_format function : %s " % error, verbose=1, stream=sys.stderr) elif nb_found > 1: write_message(" Failed: Same format %s found several time in bibfmt for the same record." 
% format_name, verbose=1, stream=sys.stderr) return 1 else: # Insert the format information in BibFMT res = insert_bibfmt(id_bibrec, format_value, format_name, modification_date, pretend=pretend) if res is None: write_message(" Failed: Error during insert_bibfmt", verbose=1, stream=sys.stderr) return 1 else: write_message(" -Insert the format %s in bibfmt : DONE" % format_name , verbose=2) return 0 def archive_marcxml_for_history(recID, pretend=False): """ Archive current MARCXML format of record RECID from BIBFMT table into hstRECORD table. Useful to keep MARCXML history of records. Return 0 if everything went fine. Return 1 otherwise. """ try: res = run_sql("SELECT id_bibrec, value, last_updated FROM bibfmt WHERE format='xm' AND id_bibrec=%s", (recID,)) if res and not pretend: run_sql("""INSERT INTO hstRECORD (id_bibrec, marcxml, job_id, job_name, job_person, job_date, job_details) VALUES (%s,%s,%s,%s,%s,%s,%s)""", (res[0][0], res[0][1], task_get_task_param('task_id', 0), 'bibupload', task_get_task_param('user','UNKNOWN'), res[0][2], 'mode: ' + task_get_option('mode','UNKNOWN') + '; file: ' + task_get_option('file_path','UNKNOWN') + '.')) except Error, error: write_message(" Error during archive_marcxml_for_history: %s " % error, verbose=1, stream=sys.stderr) return 1 return 0 def update_database_with_metadata(record, rec_id, oai_rec_id = "oai", pretend=False): """Update the database tables with the record and the record id given in parameter""" for tag in record.keys(): # check if tag is not a special one: if tag not in CFG_BIBUPLOAD_SPECIAL_TAGS: # for each tag there is a list of tuples representing datafields tuple_list = record[tag] # this list should contain the elements of a full tag [tag, ind1, ind2, subfield_code] tag_list = [] tag_list.append(tag) for single_tuple in tuple_list: # these are the contents of a single tuple subfield_list = single_tuple[0] ind1 = single_tuple[1] ind2 = single_tuple[2] # append the ind's to the full tag if ind1 == '' or ind1 == ' ': tag_list.append('_') else: tag_list.append(ind1) if ind2 == '' or ind2 == ' ': tag_list.append('_') else: tag_list.append(ind2) datafield_number = single_tuple[4] if tag in CFG_BIBUPLOAD_SPECIAL_TAGS: # nothing to do for special tags (FFT, FMT) pass elif tag in CFG_BIBUPLOAD_CONTROLFIELD_TAGS and tag != "001": value = single_tuple[3] # get the full tag full_tag = ''.join(tag_list) # update the tables write_message(" insertion of the tag "+full_tag+" with the value "+value, verbose=9) # insert the tag and value into into bibxxx (table_name, bibxxx_row_id) = insert_record_bibxxx(full_tag, value, pretend=pretend) #print 'tname, bibrow', table_name, bibxxx_row_id; if table_name is None or bibxxx_row_id is None: write_message(" Failed : during insert_record_bibxxx", verbose=1, stream=sys.stderr) # connect bibxxx and bibrec with the table bibrec_bibxxx res = insert_record_bibrec_bibxxx(table_name, bibxxx_row_id, datafield_number, rec_id, pretend=pretend) if res is None: write_message(" Failed : during insert_record_bibrec_bibxxx", verbose=1, stream=sys.stderr) else: # get the tag and value from the content of each subfield for subfield in subfield_list: subtag = subfield[0] value = subfield[1] tag_list.append(subtag) # get the full tag full_tag = ''.join(tag_list) # update the tables write_message(" insertion of the tag "+full_tag+" with the value "+value, verbose=9) # insert the tag and value into into bibxxx (table_name, bibxxx_row_id) = insert_record_bibxxx(full_tag, value, pretend=pretend) if table_name is None or 
bibxxx_row_id is None: write_message(" Failed : during insert_record_bibxxx", verbose=1, stream=sys.stderr) # connect bibxxx and bibrec with the table bibrec_bibxxx res = insert_record_bibrec_bibxxx(table_name, bibxxx_row_id, datafield_number, rec_id, pretend=pretend) if res is None: write_message(" Failed : during insert_record_bibrec_bibxxx", verbose=1, stream=sys.stderr) # remove the subtag from the list tag_list.pop() tag_list.pop() tag_list.pop() tag_list.pop() write_message(" -Update the database with metadata : DONE", verbose=2) log_record_uploading(oai_rec_id, task_get_task_param('task_id', 0), rec_id, 'P', pretend=pretend) def append_new_tag_to_old_record(record, rec_old, opt_tag, opt_mode): """Append new tags to a old record""" def _append_tag(tag): # Reference mode append only reference tag if opt_mode == 'reference': if tag == CFG_BIBUPLOAD_REFERENCE_TAG: for single_tuple in record[tag]: # We retrieve the information of the tag subfield_list = single_tuple[0] ind1 = single_tuple[1] ind2 = single_tuple[2] # We add the datafield to the old record write_message(" Adding tag: %s ind1=%s ind2=%s code=%s" % (tag, ind1, ind2, subfield_list), verbose=9) newfield_number = record_add_field(rec_old, tag, ind1, ind2, subfields=subfield_list) if newfield_number is None: write_message(" Error when adding the field"+tag, verbose=1, stream=sys.stderr) else: if tag in CFG_BIBUPLOAD_CONTROLFIELD_TAGS: if tag == '001': pass else: # if it is a controlfield,just access the value for single_tuple in record[tag]: controlfield_value = single_tuple[3] # add the field to the old record newfield_number = record_add_field(rec_old, tag, controlfield_value=controlfield_value) if newfield_number is None: write_message(" Error when adding the field"+tag, verbose=1, stream=sys.stderr) else: # For each tag there is a list of tuples representing datafields for single_tuple in record[tag]: # We retrieve the information of the tag subfield_list = single_tuple[0] ind1 = single_tuple[1] ind2 = single_tuple[2] if '%s%s%s' % (tag, ind1 == ' ' and '_' or ind1, ind2 == ' ' and '_' or ind2) in (CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:5], CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG[:5]): ## We don't want to append the external identifier ## if it is already existing. if record_find_field(rec_old, tag, single_tuple)[0] is not None: write_message(" Not adding tag: %s ind1=%s ind2=%s subfields=%s: it's already there" % (tag, ind1, ind2, subfield_list), verbose=9) continue # We add the datafield to the old record write_message(" Adding tag: %s ind1=%s ind2=%s subfields=%s" % (tag, ind1, ind2, subfield_list), verbose=9) newfield_number = record_add_field(rec_old, tag, ind1, ind2, subfields=subfield_list) if newfield_number is None: write_message(" Error when adding the field"+tag, verbose=1, stream=sys.stderr) if opt_tag is not None: _append_tag(opt_tag) else: # Go through each tag in the appended record for tag in record: _append_tag(tag) return rec_old def copy_strong_tags_from_old_record(record, rec_old): """ Look for strong tags in RECORD and REC_OLD. If no strong tags are found in RECORD, then copy them over from REC_OLD. This function modifies RECORD structure on the spot. 
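
    Sketch (with a hypothetical configuration, as CFG_BIBUPLOAD_STRONG_TAGS
    is site-specific): if it contains '909C5' and the incoming RECORD has
    no 909C5 field while REC_OLD does, every old 909C5 instance is copied
    back into RECORD, so that a plain replace cannot silently wipe such
    curated fields.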
""" for strong_tag in CFG_BIBUPLOAD_STRONG_TAGS: if not record_get_field_instances(record, strong_tag): strong_tag_old_field_instances = record_get_field_instances(rec_old, strong_tag) if strong_tag_old_field_instances: for strong_tag_old_field_instance in strong_tag_old_field_instances: sf_vals, fi_ind1, fi_ind2, controlfield, dummy = strong_tag_old_field_instance record_add_field(record, strong_tag, fi_ind1, fi_ind2, controlfield, sf_vals) return ### Delete functions def delete_tags(record, rec_old): """ Returns a record structure with all the fields in rec_old minus the fields in record. @param record: The record containing tags to delete. @type record: record structure @param rec_old: The original record. @type rec_old: record structure @return: The modified record. @rtype: record structure """ returned_record = copy.deepcopy(rec_old) for tag, fields in record.iteritems(): if tag in ('001', ): continue for field in fields: local_position = record_find_field(returned_record, tag, field)[1] if local_position is not None: record_delete_field(returned_record, tag, field_position_local=local_position) return returned_record def delete_tags_to_correct(record, rec_old, opt_tag): """ Delete tags from REC_OLD which are also existing in RECORD. When deleting, pay attention not only to tags, but also to indicators, so that fields with the same tags but different indicators are not deleted. """ ## Some fields are controlled via provenance information. ## We should re-add saved fields at the end. fields_to_readd = {} for tag in CFG_BIBUPLOAD_CONTROLLED_PROVENANCE_TAGS: if tag[:3] in record: tmp_field_instances = record_get_field_instances(record, tag[:3], tag[3], tag[4]) ## Let's discover the provenance that will be updated provenances_to_update = [] for instance in tmp_field_instances: for code, value in instance[0]: if code == tag[5]: if value not in provenances_to_update: provenances_to_update.append(value) break else: ## The provenance is not specified. ## let's add the special empty provenance. if '' not in provenances_to_update: provenances_to_update.append('') potential_fields_to_readd = record_get_field_instances(rec_old, tag[:3], tag[3], tag[4]) ## Let's take all the field corresponding to tag ## Let's save apart all the fields that should be updated, but ## since they have a different provenance not mentioned in record ## they should be preserved. fields = [] for sf_vals, ind1, ind2, dummy_cf, dummy_line in potential_fields_to_readd: for code, value in sf_vals: if code == tag[5]: if value not in provenances_to_update: fields.append(sf_vals) break else: if '' not in provenances_to_update: ## Empty provenance, let's protect in any case fields.append(sf_vals) fields_to_readd[tag] = fields # browse through all the tags from the MARCXML file: for tag in record: # do we have to delete only a special tag or any tag? if opt_tag is None or opt_tag == tag: # check if the tag exists in the old record too: if tag in rec_old and tag != '001': # the tag does exist, so delete all record's tag+ind1+ind2 combinations from rec_old for dummy_sf_vals, ind1, ind2, dummy_cf, field_number in record[tag]: write_message(" Delete tag: " + tag + " ind1=" + ind1 + " ind2=" + ind2, verbose=9) record_delete_field(rec_old, tag, ind1, ind2) ## Ok, we readd necessary fields! 
for tag, fields in fields_to_readd.iteritems(): for sf_vals in fields: write_message(" Adding tag: " + tag[:3] + " ind1=" + tag[3] + " ind2=" + tag[4] + " code=" + str(sf_vals), verbose=9) record_add_field(rec_old, tag[:3], tag[3], tag[4], subfields=sf_vals) def delete_bibrec_bibxxx(record, id_bibrec, pretend=False): """Delete the database record from the table bibxxx given in parameters""" # we clear all the rows from bibrec_bibxxx from the old record for tag in record.keys(): if tag not in CFG_BIBUPLOAD_SPECIAL_TAGS: # for each name construct the bibrec_bibxxx table name table_name = 'bibrec_bib'+tag[0:2]+'x' # delete all the records with proper id_bibrec query = """DELETE FROM `%s` where id_bibrec = %s""" params = (table_name, id_bibrec) if not pretend: try: run_sql(query % params) except Error, error: write_message(" Error during the delete_bibrec_bibxxx function : %s " % error, verbose=1, stream=sys.stderr) def main(): """Main that construct all the bibtask.""" task_init(authorization_action='runbibupload', authorization_msg="BibUpload Task Submission", description="""Receive MARC XML file and update appropriate database tables according to options. Examples: $ bibupload -i input.xml """, help_specific_usage=""" -a, --append\t\tnew fields are appended to the existing record -c, --correct\t\tfields are replaced by the new ones in the existing record -f, --format\t\ttakes only the FMT fields into account. Does not update -i, --insert\t\tinsert the new record in the database -r, --replace\t\tthe existing record is entirely replaced by the new one -z, --reference\tupdate references (update only 999 fields) -d, --delete\t\tspecified fields are deleted in existing record -S, --stage=STAGE\tstage to start from in the algorithm (0: always done; 1: FMT tags; \t\t\t2: FFT tags; 3: BibFmt; 4: Metadata update; 5: time update) -n, --notimechange\tdo not change record last modification date when updating -o, --holdingpen\tInsert record into holding pen instead of the normal database --pretend\t\tdo not really insert/append/correct/replace the input file --force\t\twhen --replace, use provided 001 tag values, even if the matching \t\t\trecord does not exist (thus allocating it on-the-fly) --callback-url\tSend via a POST request a JSON-serialized answer (see admin guide), in \t\t\torder to provide a feedback to an external service about the outcome of the operation. """, version=__revision__, specific_params=("ircazdS:fno", [ "insert", "replace", "correct", "append", "reference", "delete", "stage=", "format", "notimechange", "holdingpen", "pretend", "force", "callback-url=" ]), task_submit_elaborate_specific_parameter_fnc=task_submit_elaborate_specific_parameter, task_run_fnc=task_run_core) def task_submit_elaborate_specific_parameter(key, value, opts, args): """ Given the string key it checks it's meaning, eventually using the value. Usually it fills some key in the options dict. It must return True if it has elaborated the key, False, if it doesn't know that key. 
eg: if key in ['-n', '--number']: task_get_option(\1) = value return True return False """ # No time change option if key in ("-n", "--notimechange"): task_set_option('notimechange', 1) # Insert mode option elif key in ("-i", "--insert"): if task_get_option('mode') == 'replace': # if also replace found, then set to replace_or_insert task_set_option('mode', 'replace_or_insert') else: task_set_option('mode', 'insert') fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) # Replace mode option elif key in ("-r", "--replace"): if task_get_option('mode') == 'insert': # if also insert found, then set to replace_or_insert task_set_option('mode', 'replace_or_insert') else: task_set_option('mode', 'replace') fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) # Holding pen mode option elif key in ("-o", "--holdingpen"): write_message("Holding pen mode", verbose=3) task_set_option('mode', 'holdingpen') fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) # Correct mode option elif key in ("-c", "--correct"): task_set_option('mode', 'correct') fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) # Append mode option elif key in ("-a", "--append"): task_set_option('mode', 'append') fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) # Reference mode option elif key in ("-z", "--reference"): task_set_option('mode', 'reference') fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) elif key in ("-d", "--delete"): task_set_option('mode', 'delete') fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) # Format mode option elif key in ("-f", "--format"): task_set_option('mode', 'format') fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) elif key in ("--pretend",): task_set_option('pretend', True) fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) elif key in ("--force",): task_set_option('force', True) fix_argv_paths([args[0]]) task_set_option('file_path', os.path.abspath(args[0])) # Stage elif key in ("-S", "--stage"): try: value = int(value) except ValueError: print >> sys.stderr, """The value specified for --stage must be a valid integer, not %s""" % value return False if not (0 <= value <= 5): print >> sys.stderr, """The value specified for --stage must be comprised between 0 and 5""" return False task_set_option('stage_to_start_from', value) elif key in ("--callback-url", ): task_set_option('callback_url', value) else: return False return True def task_submit_check_options(): """ Reimplement this method for having the possibility to check options before submitting the task, in order for example to provide default values. It must return False if there are errors in the options. """ if task_get_option('mode') is None: write_message("Please specify at least one update/insert mode!") return False if task_get_option('file_path') is None: write_message("Missing filename! 
-h for help.") return False return True def writing_rights_p(): """Return True in case bibupload has the proper rights to write in the fulltext file folder.""" global _WRITING_RIGHTS if _WRITING_RIGHTS is not None: return _WRITING_RIGHTS try: if not os.path.exists(CFG_WEBSUBMIT_FILEDIR): os.makedirs(CFG_WEBSUBMIT_FILEDIR) fd, filename = tempfile.mkstemp(suffix='.txt', prefix='test', dir=CFG_WEBSUBMIT_FILEDIR) test = os.fdopen(fd, 'w') test.write('TEST') test.close() if open(filename).read() != 'TEST': raise IOError("Can not successfully write and readback %s" % filename) os.remove(filename) except: register_exception(alert_admin=True) return False return True def post_results_to_callback_url(results, callback_url): if not CFG_JSON_AVAILABLE: from warnings import warn warn("--callback-url used but simplejson/json not available") return json_results = json.dumps(results) ## :///?# scheme, netloc, path, query, fragment = urlparse.urlsplit(callback_url) ## See: http://stackoverflow.com/questions/111945/is-there-any-way-to-do-http-put-in-python if scheme == 'http': opener = urllib2.build_opener(urllib2.HTTPHandler) elif scheme == 'https': opener = urllib2.build_opener(urllib2.HTTPSHandler) else: raise ValueError("Scheme not handled %s for callback_url %s" % (scheme, callback_url)) request = urllib2.Request(callback_url, data=json_results) request.add_header('Content-Type', 'application/json') request.get_method = lambda: 'POST' return opener.open(request) def task_run_core(): """ Reimplement to add the body of the task.""" error = 0 write_message("Input file '%s', input mode '%s'." % (task_get_option('file_path'), task_get_option('mode'))) write_message("STAGE 0:", verbose=2) if task_get_option('file_path') is not None: write_message("start preocessing", verbose=3) task_update_progress("Reading XML input") recs = xml_marc_to_records(open_marc_file(task_get_option('file_path'))) stat['nb_records_to_upload'] = len(recs) write_message(" -Open XML marc: DONE", verbose=2) task_sleep_now_if_required(can_stop_too=True) write_message("Entering records loop", verbose=3) callback_url = task_get_option('callback_url') results_for_callback = {'results': []} if recs is not None: # We proceed each record by record for record in recs: record_id = record_extract_oai_id(record) task_sleep_now_if_required(can_stop_too=True) if task_get_option("mode") == "holdingpen": #inserting into the holding pen write_message("Inserting into holding pen", verbose=3) insert_record_into_holding_pen(record, record_id) else: write_message("Inserting into main database", verbose=3) error = bibupload( record, opt_tag=task_get_option('tag'), opt_mode=task_get_option('mode'), opt_stage_to_start_from=task_get_option('stage_to_start_from'), opt_notimechange=task_get_option('notimechange'), oai_rec_id=record_id, pretend=task_get_option('pretend')) if error[0] == 1: if record: write_message(record_xml_output(record), stream=sys.stderr) else: write_message("Record could not have been parsed", stream=sys.stderr) stat['nb_errors'] += 1 if callback_url: results_for_callback['results'].append({'recid': error[1], 'success': False, 'error_message': error[2]}) elif error[0] == 2: if record: write_message(record_xml_output(record), stream=sys.stderr) else: write_message("Record could not have been parsed", stream=sys.stderr) if callback_url: results_for_callback['results'].append({'recid': error[1], 'success': False, 'error_message': error[2]}) elif error[0] == 0: if callback_url: from invenio.search_engine import print_record 
results_for_callback['results'].append({'recid': error[1], 'success': True, "marcxml": print_record(error[1], 'xm'), 'url': "%s/%s/%s" % (CFG_SITE_URL, CFG_SITE_RECORD, error[1])}) else: if callback_url: results_for_callback['results'].append({'recid': error[1], 'success': False, 'error_message': error[2]}) task_update_progress("Done %d out of %d." % \ (stat['nb_records_inserted'] + \ stat['nb_records_updated'], stat['nb_records_to_upload'])) else: write_message(" Error bibupload failed: No record found", verbose=1, stream=sys.stderr) callback_url = task_get_option("callback_url") if callback_url: post_results_to_callback_url(results_for_callback, callback_url) if task_get_task_param('verbose') >= 1: # Print out the statistics print_out_bibupload_statistics() # Check whether there were errors return not stat['nb_errors'] >= 1 def log_record_uploading(oai_rec_id, task_id, bibrec_id, insertion_db, pretend=False): if oai_rec_id != "" and oai_rec_id != None: query = """UPDATE oaiHARVESTLOG SET date_inserted=NOW(), inserted_to_db=%s, id_bibrec=%s WHERE oai_id = %s AND bibupload_task_id = %s ORDER BY date_harvested LIMIT 1""" try: if not pretend: run_sql(query, (str(insertion_db), str(bibrec_id), str(oai_rec_id), str(task_id), )) except Error, error: write_message(" Error during the log_record_uploading function: %s" % error, verbose=1, stream=sys.stderr) if __name__ == "__main__": main() diff --git a/modules/bibupload/lib/bibupload_config.py b/modules/bibupload/lib/bibupload_config.py index 690e5a2e0..641cf8cef 100644 --- a/modules/bibupload/lib/bibupload_config.py +++ b/modules/bibupload/lib/bibupload_config.py @@ -1,31 +1,30 @@ # -*- coding: utf-8 -*- ## ## This file is part of Invenio. ## Copyright (C) 2006, 2007, 2008, 2010, 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ BibUpload Engine configuration. """ __revision__ = "$Id$" CFG_BIBUPLOAD_CONTROLFIELD_TAGS = ['001', '002', '003', '004', '005', '006', '007', '008'] CFG_BIBUPLOAD_SPECIAL_TAGS = ['FMT', 'FFT'] - diff --git a/modules/miscutil/lib/htmlutils.py b/modules/miscutil/lib/htmlutils.py index 06fece685..1b3a95e25 100644 --- a/modules/miscutil/lib/htmlutils.py +++ b/modules/miscutil/lib/htmlutils.py @@ -1,636 +1,786 @@ # -*- coding: utf-8 -*- ## ## This file is part of Invenio. ## Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details.
## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """HTML utilities.""" __revision__ = "$Id$" from HTMLParser import HTMLParser from invenio.config import CFG_SITE_URL, \ CFG_MATHJAX_HOSTING, \ CFG_SITE_LANG, \ CFG_WEBDIR -from invenio.textutils import indent_text +from invenio.textutils import indent_text, encode_for_xml import re import cgi import os try: from BeautifulSoup import BeautifulSoup CFG_BEAUTIFULSOUP_INSTALLED = True except ImportError: CFG_BEAUTIFULSOUP_INSTALLED = False try: import tidy CFG_TIDY_INSTALLED = True except ImportError: CFG_TIDY_INSTALLED = False # List of allowed tags (tags that won't create any XSS risk) -cfg_html_buffer_allowed_tag_whitelist = ('a', +CFG_HTML_BUFFER_ALLOWED_TAG_WHITELIST = ('a', 'p', 'br', 'blockquote', 'strong', 'b', 'u', 'i', 'em', 'ul', 'ol', 'li', 'sub', 'sup', 'div', 'strike') # List of allowed attributes. Be cautious, some attributes may be risky: #
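# (Illustrative, not part of the original comment: attributes that must stay
# outside the whitelist include event handlers such as "onload"/"onclick",
# and "style" values that can invoke scripts, as exercised by the washer
# tests later in this patch.)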

-cfg_html_buffer_allowed_attribute_whitelist = ('href', 'name', 'class') +CFG_HTML_BUFFER_ALLOWED_ATTRIBUTE_WHITELIST = ('href', 'name', 'class') ## precompile some often-used regexp for speed reasons: -re_html = re.compile("(?s)<[^>]*>|&#?\w+;") +RE_HTML = re.compile("(?s)<[^>]*>|&#?\w+;") def nmtoken_from_string(text): """ Returns a Nmtoken from a string. It is useful to produce XHTML valid values for the 'name' attribute of an anchor. CAUTION: the function is not injective: two different texts may lead to the same result. This is improbable on a single page. Nmtoken is the type that is a mixture of characters supported in attributes such as 'name' in HTML 'a' tag. For example, should be transformed to using this function. http://www.w3.org/TR/2000/REC-xml-20001006#NT-Nmtoken Also note that this function filters more characters than specified by the definition of Nmtoken ('CombiningChar' and 'Extender' charsets are filtered out). """ text = text.replace('-', '--') return ''.join( [( ((not char.isalnum() and not char in ['.', '-', '_', ':']) and str(ord(char))) or char) for char in text] ) def escape_html(text, escape_quotes=False): """Escape all HTML tags, avoiding XSS attacks. < => &lt; > => &gt; & => &amp; @param text: text to be escaped from HTML tags @param escape_quotes: if True, escape any quote mark to its HTML entity: " => &quot; ' => &#34; """ text = text.replace('&', '&amp;') text = text.replace('<', '&lt;') text = text.replace('>', '&gt;') if escape_quotes: text = text.replace('"', '&quot;') text = text.replace("'", '&#34;') return text class HTMLWasher(HTMLParser): """ Creates a washer for HTML, avoiding XSS attacks. See wash function for details on parameters. Usage:: from invenio.htmlutils import HTMLWasher washer = HTMLWasher() escaped_text = washer.wash(unescaped_text) Examples:: a.wash('Spam and <blink>eggs</blink>') => 'Spam and eggs' a.wash('Spam and <blink>eggs</blink>', True) => 'Spam and &lt;blink&gt;eggs&lt;/blink&gt;' a.wash('Spam and eggs') => 'Spam and eggs' a.wash('Spam and eggs') =>'Spam and eggs' a.wash('Spam and poilu') =>'Spam and eggs' """ silent = False def __init__(self): """ Constructor; initializes washer """ HTMLParser.__init__(self) self.result = '' self.nb = 0 self.previous_nbs = [] self.previous_type_lists = [] self.url = '' self.render_unallowed_tags = False self.allowed_tag_whitelist = \ - cfg_html_buffer_allowed_tag_whitelist + CFG_HTML_BUFFER_ALLOWED_TAG_WHITELIST self.allowed_attribute_whitelist = \ - cfg_html_buffer_allowed_attribute_whitelist + CFG_HTML_BUFFER_ALLOWED_ATTRIBUTE_WHITELIST # javascript: self.re_js = re.compile( ".*(j|j|J)"\ "\s*(a|a|A)"\ "\s*(v|v|V)"\ "\s*(a|a|A)"\ "\s*(s|s|S)"\ "\s*(c|c|C)"\ "\s*(r|r|R)"\ "\s*(i|Ã|I)"\ "\s*(p|p|P)"\ "\s*(t|p|T)"\ "\s*(:|:).*", re.IGNORECASE | re.DOTALL) # vbscript: self.re_vb = re.compile( ".*(v|v|V)"\ "\s*(b|b|B)"\ "\s*(s|s|S)"\ "\s*(c|c|C)"\ "\s*(r|r|R)"\ "\s*(i|Ã|I)"\ "\s*(p|p|P)"\ "\s*(t|p|T)"\ "\s*(:|:).*", re.IGNORECASE | re.DOTALL) def wash(self, html_buffer, render_unallowed_tags=False, - allowed_tag_whitelist=cfg_html_buffer_allowed_tag_whitelist, + allowed_tag_whitelist=CFG_HTML_BUFFER_ALLOWED_TAG_WHITELIST, allowed_attribute_whitelist=\ - cfg_html_buffer_allowed_attribute_whitelist + CFG_HTML_BUFFER_ALLOWED_ATTRIBUTE_WHITELIST): """ Wash HTML buffer, escaping XSS attacks. @param html_buffer: text to escape @param render_unallowed_tags: if True, print unallowed tags escaping < and >. Else, only print content of unallowed tags.
@param allowed_tag_whitelist: list of allowed tags @param allowed_attribute_whitelist: list of allowed attributes """ self.reset() self.result = '' self.nb = 0 self.previous_nbs = [] self.previous_type_lists = [] self.url = '' self.render_unallowed_tags = render_unallowed_tags self.allowed_tag_whitelist = allowed_tag_whitelist self.allowed_attribute_whitelist = allowed_attribute_whitelist self.feed(html_buffer) self.close() return self.result def handle_starttag(self, tag, attrs): """Function called for new opening tags""" if tag.lower() in self.allowed_tag_whitelist: self.result += '<' + tag for (attr, value) in attrs: if attr.lower() in self.allowed_attribute_whitelist: self.result += ' %s="%s"' % \ (attr, self.handle_attribute_value(value)) self.result += '>' else: if self.render_unallowed_tags: self.result += '&lt;' + cgi.escape(tag) for (attr, value) in attrs: self.result += ' %s="%s"' % \ (attr, cgi.escape(value, True)) self.result += '&gt;' elif tag == 'style' or tag == 'script': # In that case we want to remove content too self.silent = True def handle_data(self, data): """Function called for text nodes""" if not self.silent: # let's check whether data contains a link import string if string.find(str(data),'http://') == -1: self.result += cgi.escape(data, True) else: if self.url: if self.url <> data: self.url = '' self.result += '(' + cgi.escape(data, True) + ')' def handle_endtag(self, tag): """Function called for ending of tags""" if tag.lower() in self.allowed_tag_whitelist: self.result += '</' + tag + '>' else: if self.render_unallowed_tags: self.result += '&lt;/' + cgi.escape(tag) + '&gt;' if tag == 'style' or tag == 'script': self.silent = False def handle_startendtag(self, tag, attrs): """Function called for empty tags (e.g.
    )""" if tag.lower() in self.allowed_tag_whitelist: self.result += '<' + tag for (attr, value) in attrs: if attr.lower() in self.allowed_attribute_whitelist: self.result += ' %s="%s"' % \ (attr, self.handle_attribute_value(value)) self.result += ' />' else: if self.render_unallowed_tags: self.result += '<' + cgi.escape(tag) for (attr, value) in attrs: self.result += ' %s="%s"' % \ (attr, cgi.escape(value, True)) self.result += ' />' def handle_attribute_value(self, value): """Check attribute. Especially designed for avoiding URLs in the form: javascript:myXSSFunction();""" if self.re_js.match(value) or self.re_vb.match(value): return '' return value def handle_charref(self, name): """Process character references of the form "&#ref;". Return it as it is.""" self.result += '&#' + name + ';' def handle_entityref(self, name): """Process a general entity reference of the form "&name;". Return it as it is.""" self.result += '&' + name + ';' def tidy_html(html_buffer, cleaning_lib='utidylib'): """ Tidy up the input HTML using one of the installed cleaning libraries. @param html_buffer: the input HTML to clean up @type html_buffer: string @param cleaning_lib: chose the preferred library to clean the HTML. One of: - utidylib - beautifulsoup @return: a cleaned version of the input HTML @note: requires uTidylib or BeautifulSoup to be installed. If the chosen library is missing, the input X{html_buffer} is returned I{as is}. """ if CFG_TIDY_INSTALLED and cleaning_lib == 'utidylib': options = dict(output_xhtml=1, show_body_only=1, merge_divs=0, wrap=0) try: output = str(tidy.parseString(html_buffer, **options)) except: output = html_buffer elif CFG_BEAUTIFULSOUP_INSTALLED and cleaning_lib == 'beautifulsoup': try: output = str(BeautifulSoup(html_buffer).prettify()) except: output = html_buffer else: output = html_buffer return output def get_mathjax_header(https=False): """ Return the snippet of HTML code to put in HTML HEAD tag, in order to enable MathJax support. @param https: when using the CDN, whether to use the HTTPS URL rather than the HTTP one. @type https: bool @note: with new releases of MathJax, update this function toghether with $MJV variable in the root Makefile.am """ if CFG_MATHJAX_HOSTING.lower() == 'cdn': if https: mathjax_path = "https://d3eoax9i5htok0.cloudfront.net/mathjax/1.1-latest" else: mathjax_path = "http://cdn.mathjax.org/mathjax/1.1-latest" else: mathjax_path = "/MathJax" return """ """ % { 'mathjax_path': mathjax_path } def is_html_text_editor_installed(): """ Returns True if the wysiwyg editor (CKeditor) is installed """ return os.path.exists(os.path.join(CFG_WEBDIR, 'ckeditor', 'ckeditor.js')) ckeditor_available = is_html_text_editor_installed() def get_html_text_editor(name, id=None, content='', textual_content=None, width='300px', height='200px', enabled=True, file_upload_url=None, toolbar_set="Basic", custom_configurations_path='/ckeditor/invenio-ckeditor-config.js', ln=CFG_SITE_LANG): """ Returns a wysiwyg editor (CKEditor) to embed in html pages. Fall back to a simple textarea when the library is not installed, or when the user's browser is not compatible with the editor, or when 'enable' is False, or when javascript is not enabled. NOTE that the output also contains a hidden field named 'editor_type' that contains the kind of editor used, 'textarea' or 'ckeditor'. Based on 'editor_type' you might want to take different actions, like replace CRLF with
when editor_type equals 'textarea', but not when editor_type equals 'ckeditor'. @param name: *str* the name attribute of the returned editor @param id: *str* the id attribute of the returned editor (when applicable) @param content: *str* the default content of the editor. @param textual_content: *str* a content formatted for the case where the wysiwyg editor is not available for the user. When not specified, use value of 'content' @param width: *str* width of the editor in an html compatible unit: Eg: '400px', '50%'. @param height: *str* height of the editor in an html compatible unit: Eg: '400px', '50%'. @param enabled: *bool* if the wysiwyg editor is returned (True) or if a simple textarea is returned (False) @param file_upload_url: *str* the URL used to upload new files via the editor upload panel. You have to implement the handler for your own use. The URL handler will get form variables 'File' as POST for the uploaded file, and 'Type' as GET for the type of file ('file', 'image', 'flash', 'media') When value is not given, the file upload is disabled. @param toolbar_set: *str* the name of the toolbar layout to use. CKeditor comes by default with 'Basic' and 'Default'. To define other sets, customize the config file in /opt/cds-invenio/var/www/ckeditor/invenio-ckconfig.js @param custom_configurations_path: *str* value for the CKeditor config variable 'CustomConfigurationsPath', which allows one to specify the path of a file that contains a custom configuration for the editor. The path is relative to /opt/invenio/var/www/ @return: the HTML markup of the editor """ if textual_content is None: textual_content = content editor = '' if enabled and ckeditor_available: # Prepare upload path settings file_upload_script = '' if file_upload_url is not None: file_upload_script = ''', filebrowserLinkUploadUrl: '%(file_upload_url)s', filebrowserImageUploadUrl: '%(file_upload_url)s?type=Image', filebrowserFlashUploadUrl: '%(file_upload_url)s?type=Flash' ''' % {'file_upload_url': file_upload_url} # Prepare code to instantiate an editor editor += ''' + ''' % \ {'textual_content': cgi.escape(textual_content), 'html_content': content, 'width': width, 'height': height, 'name': name, 'id': id or name, 'custom_configurations_path': custom_configurations_path, 'toolbar': toolbar_set, 'file_upload_script': file_upload_script, 'CFG_SITE_URL': CFG_SITE_URL, 'ln': ln} else: # CKeditor is not installed textarea = '' \ % {'content': cgi.escape(textual_content), 'width': width, 'height': height, 'name': name, 'id': id and ('id="%s"' % id) or ''} editor += textarea editor += '' return editor def remove_html_markup(text, replacechar=' '): """ Remove HTML markup from text. @param text: Input text. @type text: string. @param replacechar: By which character should we replace HTML markup. Usually, a single space or an empty string are nice values. @type replacechar: string @return: Input text with HTML markup removed. @rtype: string """ - return re_html.sub(replacechar, text) + return RE_HTML.sub(replacechar, text) + + +class EscapedString(str): + """ + This class is a stub used by the MLClass machinery in order + to distinguish native strings from strings that do not need to be + escaped. + """ + pass + +class EscapedHTMLString(EscapedString): + """ + This class automatically escapes a non-escaped string used to initialize + it, using the HTML escaping method (i.e. cgi.escape).
+ """ + def __new__(cls, original_string='', escape_quotes=False): + if isinstance(original_string, EscapedString): + escaped_string = str(original_string) + else: + if original_string and not str(original_string).strip(): + escaped_string = ' ' + else: + escaped_string = cgi.escape(str(original_string), escape_quotes) + obj = str.__new__(cls, escaped_string) + obj.original_string = original_string + obj.escape_quotes = escape_quotes + return obj + + def __repr__(self): + return 'EscapedHTMLString(%s, %s)' % (repr(self.original_string), repr(self.escape_quotes)) + + def __add__(self, rhs): + return EscapedHTMLString(EscapedString(str(self) + str(rhs))) + +class EscapedXMLString(EscapedString): + """ + This class automatically escape a non-escaped string used to initialize + it, using the XML escaping method (i.e. encode_for_xml). + """ + def __new__(cls, original_string='', escape_quotes=False): + if isinstance(original_string, EscapedString): + escaped_string = str(original_string) + else: + if original_string and not str(original_string).strip(): + escaped_string = ' ' + else: + escaped_string = encode_for_xml(str(original_string), wash=True, quote=escape_quotes) + obj = str.__new__(cls, escaped_string) + obj.original_string = original_string + obj.escape_quotes = escape_quotes + return obj + + def __repr__(self): + return 'EscapedXMLString(%s, %s)' % (repr(self.original_string), repr(self.escape_quotes)) + + def __add__(self, rhs): + return EscapedXMLString(EscapedString(str(self) + str(rhs))) -def create_html_tag(tag, body=None, escape_body=False, escape_attr=True, indent=0, attrs=None, **other_attrs): +def create_tag(tag, escaper=EscapedHTMLString, opening_only=False, body=None, escape_body=False, escape_attr=True, indent=0, attrs=None, **other_attrs): """ - Create an HTML tag. + Create an XML/HTML tag. - This function create a full HTML tag, putting toghether an + This function create a full XML/HTML tag, putting toghether an optional inner body and a dictionary of attributes. >>> print create_html_tag ("select", create_html_tag("h1", ... "hello", other_attrs={'class': "foo"})) @param tag: the tag (e.g. "select", "body", "h1"...). @type tag: string @param body: some text/HTML to put in the body of the tag (this body will be indented WRT the tag). @type body: string @param escape_body: wether the body (if any) must be escaped. @type escape_body: boolean @param escape_attr: wether the attribute values (if any) must be escaped. @type escape_attr: boolean @param indent: number of level of indentation for the tag. @type indent: integer @param attrs: map of attributes to add to the tag. @type attrs: dict @return: the HTML tag. 
@rtype: string """ if attrs is None: attrs = {} - attrs.update(other_attrs) + for key, value in other_attrs.iteritems(): + if value is not None: + if key.endswith('_'): + attrs[key[:-1]] = value + else: + attrs[key] = value out = "<%s" % tag for key, value in attrs.iteritems(): if escape_attr: - value = escape_html(value, escape_quotes=True) + value = escaper(value, escape_quotes=True) out += ' %s="%s"' % (key, value) - if body: - out += ">\n" - if escape_body: - body = escape_html(body) - out += indent_text(body, 1) - out += "" % tag - else: + if body is not None: + if callable(body) and body.__name__ == 'handle_body': + body = body() + out += ">" + if escape_body and not isinstance(body, EscapedString): + body = escaper(body) + out += body + if not opening_only: + out += "" % tag + elif not opening_only: out += " />" - out = indent_text(out, indent) - out = out[:-1] # Let's remove trailing new line - return out + if indent: + out = indent_text(out, indent)[:-1] + return EscapedString(out) -def create_html_select(options, selected=None, attrs=None, **other_attrs): +class MLClass(object): + """ + Swiss army knife to generate XML or HTML strings a la carte. + + >>> from invenio.htmlutils import X, H + >>> X.foo()() + ... '' + >>> X.foo(bar='baz')() + ... '' + >>> X.foo(bar='baz&pi')() + ... '' + >>> X.foo("", bar='baz') + ... '' + >>> X.foo(bar='baz')(X.body()) + ... '' + >>> X.foo(bar='baz')("") ## automatic escaping + ... '<body />' + >>> X.foo()(X.p(), X.p()) ## magic concatenation + ... '

' + >>> X.foo(class_='bar')() ## protected keywords... + ... '' + >>> X["xml-bar"]()() + ... '' + """ + + def __init__(self, escaper): + self.escaper = escaper + + def __getattr__(self, tag): + def tag_creator(body=None, opening_only=False, escape_body=False, escape_attr=True, indent=0, attrs=None, **other_attrs): + if body: + return create_tag(tag, body=body, opening_only=opening_only, escape_body=escape_body, escape_attr=escape_attr, indent=indent, attrs=attrs, **other_attrs) + else: + def handle_body(*other_bodies): + full_body = None + if other_bodies: + full_body = "" + for body in other_bodies: + if callable(body) and body.__name__ == 'handle_body': + full_body += body() + elif isinstance(body, EscapedString): + full_body += body + else: + full_body += self.escaper(str(body)) + return create_tag(tag, body=full_body, opening_only=opening_only, escape_body=escape_body, escape_attr=escape_attr, indent=indent, attrs=attrs, **other_attrs) + return handle_body + return tag_creator + + __getitem__ = __getattr__ + + +H = MLClass(EscapedHTMLString) +X = MLClass(EscapedXMLString) + +def create_html_select(options, name=None, selected=None, disabled=None, multiple=False, attrs=None, **other_attrs): """ Create an HTML select box. >>> print create_html_select(["foo", "bar"], selected="bar", name="baz") - >>> print create_html_select({"foo": "oof", "bar": "rab"}, selected="bar", name="baz") + >>> print create_html_select([("foo", "oof"), ("bar", "rab")], selected="bar", name="baz") - @param options: this can either be a sequence of strings or a map of - C{key->value}. In the former case, the C{select} tag will contain - a list of C{option} tags (in alphabetical order), where the - C{value} attribute is set to C{value}. In the latter case, the - C{value} attribute will be set to the C{key}, while the body + @param options: this can either be a sequence of strings, or a sequence + of (key, value) pairs or a map of C{key->value}. In the former case, the C{select} + tag will contain a list of C{option} tags (in alphabetical order), + where the C{value} attribute is not specified. In the latter case, + the C{value} attribute will be set to the C{key}, while the body of the C{option} will be set to C{value}. @type options: sequence or map - @param selected: optional key/value to select by default. In case - a map has been used for options, C{selected} must be set to an - existing C{key}, otherwise it must be set to an existing - C{value}. - @type selected: string + @param name: the name of the form element. + @type name: string + @param selected: optional key(s)/value(s) to select by default. In case + a map has been used for options. + @type selected: string (or list of string) + @param disabled: optional key(s)/value(s) to disable. + @type disabled: string (or list of string) + @param multiple: whether a multiple select box must be created. + @type multiple: bool @param attrs: optional attributes to create the select tag. @type attrs: dict @param other_attrs: other optional attributes. @return: the HTML output. @rtype: string @note: the values and keys will be escaped for HTML. @note: it is important that parameter C{value} is always specified, in case some browser plugins play with the markup, e.g. when translating the page.
""" body = [] - try: + if selected is None: + selected = [] + elif isinstance(selected, (str, unicode)): + selected = [selected] + if disabled is None: + disabled = [] + elif isinstance(disabled, (str, unicode)): + disabled = [disabled] + if name is not None and multiple and not name.endswith('[]'): + name += "[]" + if isinstance(options, dict): items = options.items() items.sort(lambda item1, item2: cmp(item1[1], item2[1])) - for key, value in items: - option_attrs = key == selected and {"selected": "selected"} or {} - body.append(create_html_tag("option", body=value, escape_body=True, value=key, attrs=option_attrs)) - except AttributeError: - options.sort() - for value in options: - option_attrs = value == selected and {"selected": "selected"} or {} - body.append(create_html_tag("option", body=value, escape_body=True, value=value, attrs=option_attrs)) - return create_html_tag("select", body='\n'.join(body), attrs=attrs, **other_attrs) + elif isinstance(options, (list, tuple)): + options = list(options) + items = [] + for item in options: + if isinstance(item, (str, unicode)): + items.append((item, item)) + elif isinstance(item, (tuple, list)) and len(item) == 2: + items.append(tuple(item)) + else: + raise ValueError('Item "%s" of incompatible type: %s' % (item, type(item))) + else: + raise ValueError('Options of incompatible type: %s' % type(options)) + for key, value in items: + option_attrs = {} + if key in selected: + option_attrs['selected'] = 'selected' + if key in disabled: + option_attrs['disabled'] = 'disabled' + body.append(create_tag("option", body=value, escape_body=True, value=key, attrs=option_attrs)) + if attrs is None: + attrs = {} + if name is not None: + attrs['name'] = name + if multiple: + attrs['multiple'] = 'multiple' + return create_tag("select", body='\n'.join(body), attrs=attrs, **other_attrs) class _LinkGetter(HTMLParser): """ Hidden class that, by deriving from HTMLParser, will intercept all tags and retrieve the corresponding href attribute. All URLs are available in the urls attribute of the class. """ def __init__(self): HTMLParser.__init__(self) self.urls = set() def handle_starttag(self, tag, attrs): if tag == 'a': for (name, value) in attrs: if name == 'href': self.urls.add(value) def get_links_in_html_page(html): """ @param html: the HTML text to parse @type html: str @return: the list of URLs that were referenced via tags. @rtype: set of str """ parser = _LinkGetter() parser.feed(html) return parser.urls diff --git a/modules/miscutil/lib/htmlutils_tests.py b/modules/miscutil/lib/htmlutils_tests.py index eeee301c2..fa51720e3 100644 --- a/modules/miscutil/lib/htmlutils_tests.py +++ b/modules/miscutil/lib/htmlutils_tests.py @@ -1,243 +1,241 @@ # -*- coding: utf-8 -*- ## ## This file is part of Invenio. ## Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
"""Unit tests for htmlutils library.""" __revision__ = "$Id$" import unittest from invenio.htmlutils import HTMLWasher, nmtoken_from_string, \ remove_html_markup, create_html_select, \ CFG_TIDY_INSTALLED, \ CFG_BEAUTIFULSOUP_INSTALLED, tidy_html from invenio.testutils import make_test_suite, run_test_suite class XSSEscapingTest(unittest.TestCase): """Test functions related to the prevention of XSS attacks.""" def __init__(self, methodName='test'): self.washer = HTMLWasher() unittest.TestCase.__init__(self, methodName) def test_forbidden_formatting_tags(self): """htmlutils - washing of tags altering formatting of a page (e.g. )""" test_str = """""" self.assertEqual(self.washer.wash(html_buffer=test_str), '') self.assertEqual(self.washer.wash(html_buffer=test_str, render_unallowed_tags=True), '</html></body></pre>') def test_forbidden_script_tags(self): """htmlutils - washing of tags defining scripts (e.g. """ self.assertEqual(self.washer.wash(html_buffer=test_str), '') self.assertEqual(self.washer.wash(html_buffer=test_str, render_unallowed_tags=True), '<script>malicious_function();</script>') def test_forbidden_attributes(self): """htmlutils - washing of forbidden attributes in allowed tags (e.g. onLoad)""" # onload test_str = """

    """ self.assertEqual(self.washer.wash(html_buffer=test_str), '

    ') # tricky: css calling a javascript test_str = """

    """ self.assertEqual(self.washer.wash(html_buffer=test_str), '

    ') def test_fake_url(self): """htmlutils - washing of fake URLs which execute scripts""" test_str = """link""" self.assertEqual(self.washer.wash(html_buffer=test_str), 'link') # Pirates could encode ascii values, or use uppercase letters... test_str = """link""" self.assertEqual(self.washer.wash(html_buffer=test_str), 'link') # MSIE treats 'java\ns\ncript:' the same way as 'javascript:' # Here we test with: # j # avas # crIPt : test_str = """link""" self.assertEqual(self.washer.wash(html_buffer=test_str), 'link') class CharactersEscapingTest(unittest.TestCase): """Test functions related to escaping reserved or forbidden characters """ def test_convert_string_to_nmtoken(self): """htmlutils - converting string to Nmtoken""" # TODO: possibly extend this test to include 'extenders' and # 'combining characters' as defined in # http://www.w3.org/TR/2000/REC-xml-20001006#NT-Nmtoken ascii_str = "".join([chr(i) for i in range(0, 256)]) nmtoken = nmtoken_from_string(ascii_str) for char in nmtoken: self.assert_(char in ['.', '-', '_', ':'] or char.isalnum()) class HTMLWashingTest(unittest.TestCase): """Test functions related to general washing of HTML source""" def __init__(self, methodName='test'): self.washer = HTMLWasher() unittest.TestCase.__init__(self, methodName) def test_wash_html(self): """htmlutils - washing HTML tags""" # Simple test case test_str = 'Spam and eggs' self.assertEqual(self.washer.wash(html_buffer=test_str), 'Spam and eggs') # Show 'escaped' tags test_str = 'Spam and eggs' self.assertEqual(self.washer.wash(html_buffer=test_str, render_unallowed_tags=True), 'Spam and <blink>eggs</blink>') # Keep entity and character references test_str = ' a < b > c ÷' self.assertEqual(self.washer.wash(html_buffer=test_str), ' a < b > c ÷') # Remove content of bar' self.assertEqual(self.washer.wash(html_buffer=test_str), 'bar') test_str = 'bar' self.assertEqual(self.washer.wash(html_buffer=test_str), 'bar') # Remove content of styled text' self.assertEqual(self.washer.wash(html_buffer=test_str), 'styled text') test_str = 'styled text' self.assertEqual(self.washer.wash(html_buffer=test_str), 'styled text') class HTMLTidyingTest(unittest.TestCase): """Test functions related to tidying up HTML source""" html_buffer_1 = 'test' html_buffer_2 = '

    test
    test2' html_buffer_3 = '''
    • Merge adjacent lists
      ''' # Input test 427841 from Tidy def test_tidy_html_with_utidylib(self): """htmlutils - Tidying up HTML with µTidylib """ res1 = tidy_html(self.html_buffer_1, 'utidylib') res2 = tidy_html(self.html_buffer_2, 'utidylib') res3 = tidy_html(self.html_buffer_3, 'utidylib') if CFG_TIDY_INSTALLED: self.assertEqual(res1.replace('\n', '').replace(' ', ''), 'test') self.assertEqual(res2.replace('\n', '').replace(' ', ''), '
      test
      test2
      ') self.assertEqual(res3.replace('\n', '').replace(' ', ''), '
        • Next
        • Back
        • NewStuff
      • Mergeadjacentlists
      • One
      • Two
      • Three
    ') else: self.assertEqual(res1, res1) self.assertEqual(res2, res2) self.assertEqual(res3, res3) def test_tidy_html_with_beautifulsoup(self): """htmlutils - Tidying up HTML with BeautifulSoup""" res1 = tidy_html(self.html_buffer_1, 'beautifulsoup') res2 = tidy_html(self.html_buffer_2, 'beautifulsoup') res3 = tidy_html(self.html_buffer_3, 'beautifulsoup') if CFG_TIDY_INSTALLED: self.assertEqual(res1.replace('\n', '').replace(' ', ''), 'test') self.assertEqual(res2.replace('\n', '').replace(' ', ''), '
    test
    test2
    ') self.assertEqual(res3.replace('\n', '').replace(' ', ''), '
      • Next
      • Back
      • NewStuff
    • Mergeadjacentlists
      • One
      • Two
      • Three
    ') else: self.assertEqual(res1, res1) self.assertEqual(res2, res2) self.assertEqual(res3, res3) def test_tidy_html_with_unknown_lib(self): """htmlutils - Tidying up HTML with non existing library""" res = tidy_html(self.html_buffer_1, 'foo') self.assertEqual(res.replace('\n', '').replace(' ', ''), self.html_buffer_1.replace('\n', '').replace(' ', '')) class HTMLMarkupRemovalTest(unittest.TestCase): """Test functions related to removing HTML markup.""" def test_remove_html_markup_empty(self): """htmlutils - remove HTML markup, empty replacement""" test_input = 'This is test.' test_expected = 'This is test.' self.assertEqual(remove_html_markup(test_input, ''), test_expected) def test_remove_html_markup_replacement(self): """htmlutils - remove HTML markup, some replacement""" test_input = 'This is test.' test_expected = 'This is XtestX.' self.assertEqual(remove_html_markup(test_input, 'X'), test_expected) class HTMLCreation(unittest.TestCase): """Test functions related to creation of HTML markup.""" def test_create_html_select(self): """htmlutils - create HTML \n \n \n') + self.assertEqual(create_html_select(["foo", "bar"], selected="bar", name="baz"), + '') TEST_SUITE = make_test_suite(XSSEscapingTest, CharactersEscapingTest, HTMLWashingTest, HTMLMarkupRemovalTest, HTMLTidyingTest, HTMLCreation) if __name__ == "__main__": run_test_suite(TEST_SUITE) - - diff --git a/modules/miscutil/lib/inveniocfg.py b/modules/miscutil/lib/inveniocfg.py index 3e56bb34e..df73a7373 100644 --- a/modules/miscutil/lib/inveniocfg.py +++ b/modules/miscutil/lib/inveniocfg.py @@ -1,1321 +1,1326 @@ # -*- coding: utf-8 -*- ## ## This file is part of Invenio. ## Copyright (C) 2008, 2009, 2010, 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ Invenio configuration and administration CLI tool. 
Usage: inveniocfg [options] General options: -h, --help print this help -V, --version print version number Options to finish your installation: --create-apache-conf create Apache configuration files --create-tables create DB tables for Invenio --load-webstat-conf load the WebStat configuration --drop-tables drop DB tables of Invenio --check-openoffice check for a correctly set up OpenOffice temporary directory Options to set up and test a demo site: --create-demo-site create demo site --load-demo-records load demo records --remove-demo-records remove demo records, keeping demo site --drop-demo-site drop demo site configurations too --run-unit-tests run unit test suite (needs demo site) --run-regression-tests run regression test suite (needs demo site) --run-web-tests run web tests in a browser (needs demo site, Firefox, Selenium IDE) Options to update config files in situ: --update-all perform all the update options --update-config-py update config.py file from invenio.conf file --update-dbquery-py update dbquery.py with DB credentials from invenio.conf --update-dbexec update dbexec with DB credentials from invenio.conf --update-bibconvert-tpl update bibconvert templates with CFG_SITE_URL from invenio.conf --update-web-tests update web test cases with CFG_SITE_URL from invenio.conf Options to update DB tables: --reset-all perform all the reset options --reset-sitename reset tables to take account of new CFG_SITE_NAME* --reset-siteadminemail reset tables to take account of new CFG_SITE_ADMIN_EMAIL --reset-fieldnames reset tables to take account of new I18N names from PO files --reset-recstruct-cache reset record structure cache according to CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE Options to help the work: --list print names and values of all options from conf files --get get value of a given option from conf files --conf-dir path to directory where invenio*.conf files are [optional] --detect-system-details print system details such as Apache/Python/MySQL versions """ __revision__ = "$Id$" from ConfigParser import ConfigParser import os import re import shutil import socket import sys def print_usage(): """Print help.""" print __doc__ def print_version(): """Print version information.""" print __revision__ def convert_conf_option(option_name, option_value): """ Convert conf option into Python config.py line, converting values to ints or strings as appropriate.
""" ## 1) convert option name to uppercase: option_name = option_name.upper() ## 2) convert option value to int or string: if option_name in ['CFG_BIBUPLOAD_REFERENCE_TAG', 'CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG', 'CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG', 'CFG_BIBUPLOAD_EXTERNAL_OAIID_PROVENANCE_TAG', 'CFG_BIBUPLOAD_STRONG_TAGS', 'CFG_BIBFORMAT_HIDDEN_TAGS',]: # some options are supposed be string even when they look like # numeric option_value = '"' + option_value + '"' else: try: option_value = int(option_value) except ValueError: option_value = '"' + option_value + '"' ## 3a) special cases: chars regexps if option_name in ['CFG_BIBINDEX_CHARS_ALPHANUMERIC_SEPARATORS', 'CFG_BIBINDEX_CHARS_PUNCTUATION']: option_value = 'r"[' + option_value[1:-1] + ']"' ## 3abis) special cases: real regexps if option_name in ['CFG_BIBINDEX_PERFORM_OCR_ON_DOCNAMES']: option_value = 'r"' + option_value[1:-1] + '"' ## 3b) special cases: True, False, None if option_value in ['"True"', '"False"', '"None"']: option_value = option_value[1:-1] ## 3c) special cases: dicts if option_name in ['CFG_WEBSEARCH_FIELDS_CONVERT', 'CFG_BATCHUPLOADER_WEB_ROBOT_RIGHTS', 'CFG_SITE_EMERGENCY_EMAIL_ADDRESSES', 'CFG_BIBMATCH_FUZZY_WORDLIMITS', 'CFG_BIBMATCH_QUERY_TEMPLATES', 'CFG_WEBSEARCH_SYNONYM_KBRS', 'CFG_BIBINDEX_SYNONYM_KBRS', 'CFG_WEBCOMMENT_EMAIL_REPLIES_TO', 'CFG_WEBCOMMENT_RESTRICTION_DATAFIELD', 'CFG_WEBCOMMENT_ROUND_DATAFIELD', 'CFG_BIBUPLOAD_FFT_ALLOWED_EXTERNAL_URLS', 'CFG_BIBSCHED_NODE_TASKS', - 'CFG_BIBEDIT_EXTEND_RECORD_WITH_COLLECTION_TEMPLATE']: + 'CFG_BIBEDIT_EXTEND_RECORD_WITH_COLLECTION_TEMPLATE', + 'CFG_OAI_METADATA_FORMATS']: option_value = option_value[1:-1] ## 3cbis) very special cases: dicts with backward compatible string if option_name in ['CFG_BIBINDEX_SPLASH_PAGES']: if option_value.startswith('"{') and option_value.endswith('}"'): option_value = option_value[1:-1] else: option_value = """{%s: ".*"}""" % option_value ## 3d) special cases: comma-separated lists if option_name in ['CFG_SITE_LANGS', 'CFG_WEBSUBMIT_ADDITIONAL_KNOWN_FILE_EXTENSIONS', 'CFG_WEBSEARCH_USE_MATHJAX_FOR_FORMATS', 'CFG_BIBUPLOAD_STRONG_TAGS', 'CFG_BIBFORMAT_HIDDEN_TAGS', 'CFG_BIBSCHED_GC_TASKS_TO_REMOVE', 'CFG_BIBSCHED_GC_TASKS_TO_ARCHIVE', 'CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS', 'CFG_BIBUPLOAD_CONTROLLED_PROVENANCE_TAGS', 'CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES', 'CFG_WEBSTYLE_HTTP_STATUS_ALERT_LIST', 'CFG_WEBSEARCH_RSS_I18N_COLLECTIONS', 'CFG_BATCHUPLOADER_FILENAME_MATCHING_POLICY', 'CFG_BATCHUPLOADER_WEB_ROBOT_AGENT', 'CFG_BIBAUTHORID_EXTERNAL_CLAIMED_RECORDS_KEY', - 'CFG_PLOTEXTRACTOR_DISALLOWED_TEX']: + 'CFG_PLOTEXTRACTOR_DISALLOWED_TEX', + 'CFG_OAI_FRIENDS']: out = "[" for elem in option_value[1:-1].split(","): if elem: if option_name in ['CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES']: # 3d1) integer values out += "%i, " % int(elem) else: # 3d2) string values out += "'%s', " % elem out += "]" option_value = out ## 3e) special cases: multiline if option_name == 'CFG_OAI_IDENTIFY_DESCRIPTION': # make triple quotes option_value = '""' + option_value + '""' ## 3f) ignore some options: if option_name.startswith('CFG_SITE_NAME_INTL'): # treated elsewhere return ## 3g) special cases: float if option_name in ['CFG_BIBDOCFILE_MD5_CHECK_PROBABILITY', 'CFG_BIBMATCH_LOCAL_SLEEPTIME', 'CFG_BIBMATCH_REMOTE_SLEEPTIME', 'CFG_BIBAUTHORID_PERSONID_MIN_P_FROM_BCTKD_RA', 'CFG_BIBAUTHORID_PERSONID_MIN_P_FROM_NEW_RA', 'CFG_BIBAUTHORID_PERSONID_MAX_COMP_LIST_MIN_TRSH', 'CFG_BIBAUTHORID_PERSONID_MAX_COMP_LIST_MIN_TRSH_P_N', 
'CFG_PLOTEXTRACTOR_DOWNLOAD_TIMEOUT']: option_value = float(option_value[1:-1]) ## 4) finally, return output line: return '%s = %s' % (option_name, option_value) def cli_cmd_update_config_py(conf): """ Update new config.py from conf options, keeping previous config.py in a backup copy. """ print ">>> Going to update config.py..." ## location where config.py is: configpyfile = conf.get("Invenio", "CFG_PYLIBDIR") + \ os.sep + 'invenio' + os.sep + 'config.py' ## backup current config.py file: if os.path.exists(configpyfile): shutil.copy(configpyfile, configpyfile + '.OLD') ## here we go: fdesc = open(configpyfile, 'w') ## generate preamble: fdesc.write("# -*- coding: utf-8 -*-\n") fdesc.write("# DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED\n") fdesc.write("# FROM INVENIO.CONF BY EXECUTING:\n") fdesc.write("# " + " ".join(sys.argv) + "\n") ## special treatment for CFG_SITE_NAME_INTL options: fdesc.write("CFG_SITE_NAME_INTL = {}\n") for lang in conf.get("Invenio", "CFG_SITE_LANGS").split(","): fdesc.write("CFG_SITE_NAME_INTL['%s'] = \"%s\"\n" % (lang, conf.get("Invenio", "CFG_SITE_NAME_INTL_" + lang))) ## special treatment for CFG_SITE_SECURE_URL that may be empty, in ## which case it should be put equal to CFG_SITE_URL: if not conf.get("Invenio", "CFG_SITE_SECURE_URL"): conf.set("Invenio", "CFG_SITE_SECURE_URL", conf.get("Invenio", "CFG_SITE_URL")) ## process all the options normally: sections = conf.sections() sections.sort() for section in sections: options = conf.options(section) options.sort() for option in options: if not option.startswith('CFG_DATABASE_'): # put all options except for db credentials into config.py line_out = convert_conf_option(option, conf.get(section, option)) if line_out: fdesc.write(line_out + "\n") ## FIXME: special treatment for experimental variables ## CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES and CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE ## (not offering them in invenio.conf since they will be refactored) fdesc.write("CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE = 0\n") fdesc.write("CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES = [0, 1,]\n") ## generate postamble: fdesc.write("") fdesc.write("# END OF GENERATED FILE") ## we are done: fdesc.close() print "You may want to restart Apache now." print ">>> config.py updated successfully." def cli_cmd_update_dbquery_py(conf): """ Update lib/dbquery.py file with DB parameters read from conf file. Note: this edits dbquery.py in situ, taking a backup first. Use only when you know what you are doing. """ print ">>> Going to update dbquery.py..." ## location where dbquery.py is: dbquerypyfile = conf.get("Invenio", "CFG_PYLIBDIR") + \ os.sep + 'invenio' + os.sep + 'dbquery.py' ## backup current dbquery.py file: if os.path.exists(dbquerypyfile): shutil.copy(dbquerypyfile, dbquerypyfile + '.OLD') ## replace db parameters: out = '' for line in open(dbquerypyfile, 'r').readlines(): match = re.search(r'^CFG_DATABASE_(HOST|PORT|NAME|USER|PASS)(\s*=\s*)\'.*\'$', line) if match: dbparam = 'CFG_DATABASE_' + match.group(1) out += "%s%s'%s'\n" % (dbparam, match.group(2), conf.get('Invenio', dbparam)) else: out += line fdesc = open(dbquerypyfile, 'w') fdesc.write(out) fdesc.close() print "You may want to restart Apache now." print ">>> dbquery.py updated successfully." def cli_cmd_update_dbexec(conf): """ Update bin/dbexec file with DB parameters read from conf file. Note: this edits dbexec in situ, taking a backup first. Use only when you know what you are doing. """ print ">>> Going to update dbexec..." 
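# (Illustrative, not part of the original code.) The loop below rewrites
# credential lines matched by
#     r"^CFG_DATABASE_(HOST|PORT|NAME|USER|PASS)(\s*=\s*)'.*'$"
# replacing the quoted value with the one read from the conf file, e.g.
# turning
#     CFG_DATABASE_HOST = 'localhost'
# into (hypothetical value)
#     CFG_DATABASE_HOST = 'db.example.org'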
## location where dbexec is: dbexecfile = conf.get("Invenio", "CFG_BINDIR") + \ os.sep + 'dbexec' ## backup current dbexec file: if os.path.exists(dbexecfile): shutil.copy(dbexecfile, dbexecfile + '.OLD') ## replace db parameters: out = '' for line in open(dbexecfile, 'r').readlines(): match = re.search(r'^CFG_DATABASE_(HOST|PORT|NAME|USER|PASS)(\s*=\s*)\'.*\'$', line) if match: dbparam = 'CFG_DATABASE_' + match.group(1) out += "%s%s'%s'\n" % (dbparam, match.group(2), conf.get("Invenio", dbparam)) else: out += line fdesc = open(dbexecfile, 'w') fdesc.write(out) fdesc.close() print ">>> dbexec updated successfully." def cli_cmd_update_bibconvert_tpl(conf): """ Update bibconvert/config/*.tpl files looking for 856 http://.../CFG_SITE_RECORD lines, replacing URL with CFG_SITE_URL taken from conf file. Note: this edits tpl files in situ, taking a backup first. Use only when you know what you are doing. """ print ">>> Going to update bibconvert templates..." ## location where bibconvert/config/*.tpl are: tpldir = conf.get("Invenio", 'CFG_ETCDIR') + \ os.sep + 'bibconvert' + os.sep + 'config' ## find all *.tpl files: for tplfilename in os.listdir(tpldir): if tplfilename.endswith(".tpl"): ## change tpl file: tplfile = tpldir + os.sep + tplfilename shutil.copy(tplfile, tplfile + '.OLD') out = '' for line in open(tplfile, 'r').readlines(): match = re.search(r'^(.*)http://.*?/%s/(.*)$' % conf.get("Invenio", 'CFG_SITE_RECORD'), line) if match: out += "%s%s/%s/%s\n" % (match.group(1), conf.get("Invenio", 'CFG_SITE_URL'), conf.get("Invenio", 'CFG_SITE_RECORD'), match.group(2)) else: out += line fdesc = open(tplfile, 'w') fdesc.write(out) fdesc.close() print ">>> bibconvert templates updated successfully." def cli_cmd_update_web_tests(conf): """ Update web test cases lib/webtest/test_*.html, replacing URL strings with CFG_SITE_URL and /opt/invenio prefixes with CFG_PREFIX taken from conf file. Note: this edits test files in situ, taking a backup first. Use only when you know what you are doing. """ print ">>> Going to update web tests..." ## location where test_*.html files are: testdir = conf.get("Invenio", 'CFG_PREFIX') + os.sep + \ 'lib' + os.sep + 'webtest' + os.sep + 'invenio' ## find all test_*.html files: for testfilename in os.listdir(testdir): if testfilename.startswith("test_") and \ testfilename.endswith(".html"): ## change test file: testfile = testdir + os.sep + testfilename shutil.copy(testfile, testfile + '.OLD') out = '' for line in open(testfile, 'r').readlines(): match = re.search(r'^(.*)http://.+?([<"].*)$', line) if match: out += "%s%s%s\n" % (match.group(1), conf.get("Invenio", 'CFG_SITE_URL'), match.group(2)) else: match = re.search(r'^(.*)/opt/invenio(.*)$', line) if match: out += "%s%s%s\n" % (match.group(1), conf.get("Invenio", 'CFG_PREFIX'), match.group(2)) else: out += line fdesc = open(testfile, 'w') fdesc.write(out) fdesc.close() print ">>> web tests updated successfully." def cli_cmd_reset_sitename(conf): """ Reset collection-related tables with new CFG_SITE_NAME and CFG_SITE_NAME_INTL* read from conf files. """ print ">>> Going to reset CFG_SITE_NAME and CFG_SITE_NAME_INTL..."
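# (Illustrative, not part of the original code.) The try/except blocks below
# implement a simple upsert; for CFG_SITE_NAME the effect is:
#     INSERT INTO collection (id, name, dbquery, reclist) VALUES (1, <sitename>, NULL, NULL)
#     -- or, if row id=1 already exists (IntegrityError):
#     UPDATE collection SET name=<sitename> WHERE id=1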
from invenio.dbquery import run_sql, IntegrityError # reset CFG_SITE_NAME: sitename = conf.get("Invenio", "CFG_SITE_NAME") try: run_sql("""INSERT INTO collection (id, name, dbquery, reclist) VALUES (1,%s,NULL,NULL)""", (sitename,)) except IntegrityError: run_sql("""UPDATE collection SET name=%s WHERE id=1""", (sitename,)) # reset CFG_SITE_NAME_INTL: for lang in conf.get("Invenio", "CFG_SITE_LANGS").split(","): sitename_lang = conf.get("Invenio", "CFG_SITE_NAME_INTL_" + lang) try: run_sql("""INSERT INTO collectionname (id_collection, ln, type, value) VALUES (%s,%s,%s,%s)""", (1, lang, 'ln', sitename_lang)) except IntegrityError: run_sql("""UPDATE collectionname SET value=%s WHERE ln=%s AND id_collection=1 AND type='ln'""", (sitename_lang, lang)) print "You may want to restart Apache now." print ">>> CFG_SITE_NAME and CFG_SITE_NAME_INTL* reset successfully." def cli_cmd_reset_recstruct_cache(conf): """If CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE is changed, this function will adapt the database to either store or not store the recstruct format.""" from invenio.intbitset import intbitset from invenio.dbquery import run_sql, serialize_via_marshal from invenio.search_engine import get_record from invenio.bibsched import server_pid, pidfile enable_recstruct_cache = conf.get("Invenio", "CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE") enable_recstruct_cache = enable_recstruct_cache in ('True', '1') pid = server_pid(ping_the_process=False) if pid: print >> sys.stderr, "ERROR: bibsched seems to run with pid %d, according to %s." % (pid, pidfile) print >> sys.stderr, " Please stop bibsched before running this procedure." sys.exit(1) if enable_recstruct_cache: print ">>> Searching records which need recstruct cache resetting; this may take a while..." all_recids = intbitset(run_sql("SELECT id FROM bibrec")) good_recids = intbitset(run_sql("SELECT bibrec.id FROM bibrec JOIN bibfmt ON bibrec.id = bibfmt.id_bibrec WHERE format='recstruct' AND modification_date < last_updated")) recids = all_recids - good_recids print ">>> Generating recstruct cache..." tot = len(recids) count = 0 for recid in recids: value = serialize_via_marshal(get_record(recid)) run_sql("DELETE FROM bibfmt WHERE id_bibrec=%s AND format='recstruct'", (recid, )) run_sql("INSERT INTO bibfmt(id_bibrec, format, last_updated, value) VALUES(%s, 'recstruct', NOW(), %s)", (recid, value)) count += 1 if count % 1000 == 0: print " ... done records %s/%s" % (count, tot) if count % 1000 != 0: print " ... done records %s/%s" % (count, tot) print ">>> recstruct cache generated successfully." else: print ">>> Cleaning recstruct cache..." run_sql("DELETE FROM bibfmt WHERE format='recstruct'") def cli_cmd_reset_siteadminemail(conf): """ Reset user-related tables with new CFG_SITE_ADMIN_EMAIL read from conf files. """ print ">>> Going to reset CFG_SITE_ADMIN_EMAIL..." from invenio.dbquery import run_sql siteadminemail = conf.get("Invenio", "CFG_SITE_ADMIN_EMAIL") run_sql("DELETE FROM user WHERE id=1") run_sql("""INSERT INTO user (id, email, password, note, nickname) VALUES (1, %s, AES_ENCRYPT(email, ''), 1, 'admin')""", (siteadminemail,)) print "You may want to restart Apache now." print ">>> CFG_SITE_ADMIN_EMAIL reset successfully." def cli_cmd_reset_fieldnames(conf): """ Reset I18N field names such as author, title, etc and other I18N ranking method names such as word similarity. Their translations are taken from the PO files. """ print ">>> Going to reset I18N field names..." 
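# (Illustrative, not part of the original code.) For every language and every
# known field, the code below upserts one row of the fieldname table, e.g.
# (hypothetical values) id_field=2, ln='fr', type='ln', value='titre', so
# that the displayed I18N field names follow the PO translations.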
from invenio.messages import gettext_set_language, language_list_long from invenio.dbquery import run_sql, IntegrityError ## get field id and name list: field_id_name_list = run_sql("SELECT id, name FROM field") ## get rankmethod id and name list: rankmethod_id_name_list = run_sql("SELECT id, name FROM rnkMETHOD") ## update names for every language: for lang, dummy in language_list_long(): _ = gettext_set_language(lang) ## this list is put here in order for PO system to pick names ## suitable for translation field_name_names = {"any field": _("any field"), "title": _("title"), "author": _("author"), "abstract": _("abstract"), "keyword": _("keyword"), "report number": _("report number"), "subject": _("subject"), "reference": _("reference"), "fulltext": _("fulltext"), "collection": _("collection"), "division": _("division"), "year": _("year"), "journal": _("journal"), "experiment": _("experiment"), "record ID": _("record ID")} ## update I18N names for every language: for (field_id, field_name) in field_id_name_list: if field_name_names.has_key(field_name): try: run_sql("""INSERT INTO fieldname (id_field,ln,type,value) VALUES (%s,%s,%s,%s)""", (field_id, lang, 'ln', field_name_names[field_name])) except IntegrityError: run_sql("""UPDATE fieldname SET value=%s WHERE id_field=%s AND ln=%s AND type=%s""", (field_name_names[field_name], field_id, lang, 'ln',)) ## ditto for rank methods: rankmethod_name_names = {"wrd": _("word similarity"), "demo_jif": _("journal impact factor"), "citation": _("times cited"), "citerank_citation_t": _("time-decay cite count"), "citerank_pagerank_c": _("all-time-best cite rank"), "citerank_pagerank_t": _("time-decay cite rank"),} for (rankmethod_id, rankmethod_name) in rankmethod_id_name_list: if rankmethod_name_names.has_key(rankmethod_name): try: run_sql("""INSERT INTO rnkMETHODNAME (id_rnkMETHOD,ln,type,value) VALUES (%s,%s,%s,%s)""", (rankmethod_id, lang, 'ln', rankmethod_name_names[rankmethod_name])) except IntegrityError: run_sql("""UPDATE rnkMETHODNAME SET value=%s WHERE id_rnkMETHOD=%s AND ln=%s AND type=%s""", (rankmethod_name_names[rankmethod_name], rankmethod_id, lang, 'ln',)) print ">>> I18N field names reset successfully." def cli_check_openoffice(conf): """ If OpenOffice.org integration is enabled, checks whether the system is properly configured. """ from invenio.bibtask import check_running_process_user from invenio.websubmit_file_converter import can_unoconv, get_file_converter_logger logger = get_file_converter_logger() for handler in logger.handlers: logger.removeHandler(handler) check_running_process_user() print ">>> Checking if Libre/OpenOffice.org is correctly integrated...", sys.stdout.flush() if can_unoconv(True): print "ok" else: sys.exit(1) def test_db_connection(): """ Test DB connection, and if fails, advise user how to set it up. Useful to be called during table creation. """ print "Testing DB connection...", from invenio.textutils import wrap_text_in_a_box from invenio.dbquery import run_sql, Error ## first, test connection to the DB server: try: run_sql("SHOW TABLES") except Error, err: from invenio.dbquery import CFG_DATABASE_HOST, CFG_DATABASE_PORT, \ CFG_DATABASE_NAME, CFG_DATABASE_USER, CFG_DATABASE_PASS print wrap_text_in_a_box("""\ DATABASE CONNECTIVITY ERROR %(errno)d: %(errmsg)s.\n Perhaps you need to set up database and connection rights? 

def cli_cmd_create_tables(conf):
    """Create and fill Invenio DB tables.  Useful for the installation process."""
    print ">>> Going to create and fill tables..."
    from invenio.config import CFG_PREFIX
    test_db_connection()
    for cmd in ["%s/bin/dbexec < %s/lib/sql/invenio/tabcreate.sql" % (CFG_PREFIX, CFG_PREFIX),
                "%s/bin/dbexec < %s/lib/sql/invenio/tabfill.sql" % (CFG_PREFIX, CFG_PREFIX)]:
        if os.system(cmd):
            print "ERROR: failed execution of", cmd
            sys.exit(1)
    cli_cmd_reset_sitename(conf)
    cli_cmd_reset_siteadminemail(conf)
    cli_cmd_reset_fieldnames(conf)
    for cmd in ["%s/bin/webaccessadmin -u admin -c -a" % CFG_PREFIX]:
        if os.system(cmd):
            print "ERROR: failed execution of", cmd
            sys.exit(1)
    print ">>> Tables created and filled successfully."

def cli_cmd_load_webstat_conf(conf):
    print ">>> Going to load WebStat config..."
    from invenio.config import CFG_PREFIX
    cmd = "%s/bin/webstatadmin --load-config" % CFG_PREFIX
    if os.system(cmd):
        print "ERROR: failed execution of", cmd
        sys.exit(1)
    print ">>> WebStat config loaded successfully."
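
## cli_cmd_create_tables() above simply pipes the SQL definitions through
## the dbexec helper.  The equivalent manual invocation, assuming the
## common CFG_PREFIX of /opt/invenio (illustrative path only):
##
##   $ /opt/invenio/bin/dbexec < /opt/invenio/lib/sql/invenio/tabcreate.sql
##   $ /opt/invenio/bin/dbexec < /opt/invenio/lib/sql/invenio/tabfill.sql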

def cli_cmd_drop_tables(conf):
    """Drop Invenio DB tables.  Useful for the uninstallation process."""
    print ">>> Going to drop tables..."
    from invenio.config import CFG_PREFIX
    from invenio.textutils import wrap_text_in_a_box, wait_for_user
    wait_for_user(wrap_text_in_a_box("""WARNING: You are going to destroy your database tables!"""))
    cmd = "%s/bin/dbexec < %s/lib/sql/invenio/tabdrop.sql" % (CFG_PREFIX, CFG_PREFIX)
    if os.system(cmd):
        print "ERROR: failed execution of", cmd
        sys.exit(1)
    print ">>> Tables dropped successfully."

def cli_cmd_create_demo_site(conf):
    """Create demo site.  Useful for testing purposes."""
    print ">>> Going to create demo site..."
    from invenio.config import CFG_PREFIX
    from invenio.dbquery import run_sql
    run_sql("TRUNCATE schTASK")
    run_sql("TRUNCATE session")
    run_sql("DELETE FROM user WHERE email=''")
    for cmd in ["%s/bin/dbexec < %s/lib/sql/invenio/democfgdata.sql" % \
                (CFG_PREFIX, CFG_PREFIX),]:
        if os.system(cmd):
            print "ERROR: failed execution of", cmd
            sys.exit(1)
    cli_cmd_reset_fieldnames(conf) # needed for I18N demo ranking method names
    for cmd in ["%s/bin/webaccessadmin -u admin -c -r -D" % CFG_PREFIX,
                "%s/bin/webcoll -u admin" % CFG_PREFIX,
                "%s/bin/webcoll 1" % CFG_PREFIX,]:
        if os.system(cmd):
            print "ERROR: failed execution of", cmd
            sys.exit(1)
    print ">>> Demo site created successfully."

def cli_cmd_load_demo_records(conf):
    """Load demo records.  Useful for testing purposes."""
    from invenio.config import CFG_PREFIX
    from invenio.dbquery import run_sql
    print ">>> Going to load demo records..."
    run_sql("TRUNCATE schTASK")
    for cmd in ["%s/bin/bibupload -u admin -i %s/var/tmp/demobibdata.xml" % (CFG_PREFIX, CFG_PREFIX),
                "%s/bin/bibupload 1" % CFG_PREFIX,
                "%s/bin/bibdocfile --textify --with-ocr --recid 97" % CFG_PREFIX,
                "%s/bin/bibdocfile --textify --all" % CFG_PREFIX,
                "%s/bin/bibindex -u admin" % CFG_PREFIX,
                "%s/bin/bibindex 2" % CFG_PREFIX,
                "%s/bin/bibreformat -u admin -o HB" % CFG_PREFIX,
                "%s/bin/bibreformat 3" % CFG_PREFIX,
                "%s/bin/webcoll -u admin" % CFG_PREFIX,
                "%s/bin/webcoll 4" % CFG_PREFIX,
                "%s/bin/bibrank -u admin" % CFG_PREFIX,
-                "%s/bin/bibrank 5" % CFG_PREFIX,]:
+                "%s/bin/bibrank 5" % CFG_PREFIX,
+                "%s/bin/oairepositoryupdater -u admin" % CFG_PREFIX,
+                "%s/bin/oairepositoryupdater 6" % CFG_PREFIX,
+                "%s/bin/bibupload 7" % CFG_PREFIX,]:
        if os.system(cmd):
            print "ERROR: failed execution of", cmd
            sys.exit(1)
    print ">>> Demo records loaded successfully."

def cli_cmd_remove_demo_records(conf):
    """Remove demo records.  Useful when you are finished testing."""
    print ">>> Going to remove demo records..."
    from invenio.config import CFG_PREFIX
    from invenio.dbquery import run_sql
    from invenio.textutils import wrap_text_in_a_box, wait_for_user
    wait_for_user(wrap_text_in_a_box("""WARNING: You are going to destroy your records and documents!"""))
    if os.path.exists(CFG_PREFIX + os.sep + 'var' + os.sep + 'data'):
        shutil.rmtree(CFG_PREFIX + os.sep + 'var' + os.sep + 'data')
    run_sql("TRUNCATE schTASK")
    for cmd in ["%s/bin/dbexec < %s/lib/sql/invenio/tabbibclean.sql" % (CFG_PREFIX, CFG_PREFIX),
                "%s/bin/webcoll -u admin" % CFG_PREFIX,
                "%s/bin/webcoll 1" % CFG_PREFIX,]:
        if os.system(cmd):
            print "ERROR: failed execution of", cmd
            sys.exit(1)
    print ">>> Demo records removed successfully."

def cli_cmd_drop_demo_site(conf):
    """Drop demo site completely.  Useful when you are finished testing."""
    print ">>> Going to drop demo site..."
    from invenio.textutils import wrap_text_in_a_box, wait_for_user
    wait_for_user(wrap_text_in_a_box("""WARNING: You are going to destroy your site and documents!"""))
    cli_cmd_drop_tables(conf)
    cli_cmd_create_tables(conf)
    cli_cmd_remove_demo_records(conf)
    print ">>> Demo site dropped successfully."

def cli_cmd_run_unit_tests(conf):
    """Run unit tests, usually on the working demo site."""
    from invenio.testutils import build_and_run_unit_test_suite
    build_and_run_unit_test_suite()

def cli_cmd_run_regression_tests(conf):
    """Run regression tests, usually on the working demo site."""
    from invenio.testutils import build_and_run_regression_test_suite
    build_and_run_regression_test_suite()

def cli_cmd_run_web_tests(conf):
    """Run web tests in a browser.  Requires Firefox with Selenium."""
    from invenio.testutils import build_and_run_web_test_suite
    build_and_run_web_test_suite()

def _detect_ip_address():
    """Detect IP address of this computer.  Useful for creating Apache
    vhost conf snippet on RHEL-like machines.

    @return: IP address, or '*' if it cannot be detected
    @rtype: string

    @note: creates a socket for real in order to detect the real IP
        address, not the loopback one.
    """
    try:
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        s.connect(('invenio-software.org', 0))
        return s.getsockname()[0]
    except:
        return '*'
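
## Note on the trick above: connect() on a SOCK_DGRAM (UDP) socket does
## not send any packet; it merely makes the kernel choose a route, so
## getsockname() then reports the address of the outgoing network
## interface instead of 127.0.0.1.  Standalone sketch (the destination
## host and port are arbitrary, since no traffic is actually generated):
#
#   s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
#   s.connect(('invenio-software.org', 80))
#   print s.getsockname()[0]    # e.g. '192.0.2.15', never the loopback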

def cli_cmd_create_apache_conf(conf):
    """
    Create Apache conf files for this site, keeping previous
    files in a backup copy.
    """
    print ">>> Going to create Apache conf files..."
    from invenio.textutils import wrap_text_in_a_box
    from invenio.access_control_config import CFG_EXTERNAL_AUTH_USING_SSO
    apache_conf_dir = conf.get("Invenio", 'CFG_ETCDIR') + \
                      os.sep + 'apache'

    ## Preparation of XSendFile directive
    xsendfile_directive_needed = int(conf.get("Invenio", 'CFG_BIBDOCFILE_USE_XSENDFILE')) != 0
    if xsendfile_directive_needed:
        xsendfile_directive = "XSendFile On\n"
    else:
        xsendfile_directive = "#XSendFile On\n"
    for path in (conf.get('Invenio', 'CFG_WEBSUBMIT_FILEDIR'), # BibDocFile
                 conf.get('Invenio', 'CFG_WEBDIR'),
                 conf.get('Invenio', 'CFG_WEBSUBMIT_STORAGEDIR'), # WebSubmit
                 conf.get('Invenio', 'CFG_TMPDIR'),
                 os.path.join(conf.get('Invenio', 'CFG_PREFIX'), 'var', 'tmp', 'attachfile'),
                 os.path.join(conf.get('Invenio', 'CFG_PREFIX'), 'var', 'data', 'comments'),
                 os.path.join(conf.get('Invenio', 'CFG_PREFIX'), 'var', 'data', 'baskets', 'comments'),
                 '/tmp'): # BibExport
        if xsendfile_directive_needed:
            xsendfile_directive += '        XSendFilePath %s\n' % path
        else:
            xsendfile_directive += '        #XSendFilePath %s\n' % path
    xsendfile_directive = xsendfile_directive.strip()

    ## Preparation of deflate directive
    deflate_directive_needed = int(conf.get("Invenio", 'CFG_WEBSTYLE_HTTP_USE_COMPRESSION')) != 0
    if deflate_directive_needed:
        deflate_directive = r"""
        ## Configuration snippet taken from:
        ## <http://httpd.apache.org/docs/2.2/mod/mod_deflate.html>
        SetOutputFilter DEFLATE

        # Netscape 4.x has some problems...
        BrowserMatch ^Mozilla/4 gzip-only-text/html

        # Netscape 4.06-4.08 have some more problems
        BrowserMatch ^Mozilla/4\.0[678] no-gzip

        # MSIE masquerades as Netscape, but it is fine
        # BrowserMatch \bMSIE !no-gzip !gzip-only-text/html

        # NOTE: Due to a bug in mod_setenvif up to Apache 2.0.48
        # the above regex won't work.  You can use the following
        # workaround to get the desired effect:
        BrowserMatch \bMSI[E] !no-gzip !gzip-only-text/html

        # Don't compress images
        SetEnvIfNoCase Request_URI \
            \.(?:gif|jpe?g|png)$ no-gzip dont-vary

        # Make sure proxies don't deliver the wrong content
        Header append Vary User-Agent env=!dont-vary
"""
    else:
        deflate_directive = ""

    if CFG_EXTERNAL_AUTH_USING_SSO:
        shibboleth_directive = r"""
        SSLRequireSSL            # The modules only work using HTTPS
        AuthType shibboleth
        ShibRequireSession On
        ShibRequireAll On
        ShibExportAssertion Off
        require valid-user
"""
    else:
        shibboleth_directive = ""

    ## Apache vhost conf file is distro specific, so analyze needs:
    # Gentoo (and generic defaults):
    listen_directive_needed = True
    ssl_pem_directive_needed = False
    ssl_pem_path = '/etc/apache2/ssl/apache.pem'
    ssl_crt_path = '/etc/apache2/ssl/server.crt'
    ssl_key_path = '/etc/apache2/ssl/server.key'
    vhost_ip_address_needed = False
    wsgi_socket_directive_needed = False
    # Debian:
    if os.path.exists(os.path.sep + 'etc' + os.path.sep + 'debian_version'):
        listen_directive_needed = False
        ssl_pem_directive_needed = True
    # RHEL/SLC:
    if os.path.exists(os.path.sep + 'etc' + os.path.sep + 'redhat-release'):
        listen_directive_needed = False
        ssl_crt_path = '/etc/pki/tls/certs/localhost.crt'
        ssl_key_path = '/etc/pki/tls/private/localhost.key'
        vhost_ip_address_needed = True
        wsgi_socket_directive_needed = True
    # maybe we are using non-standard ports?
    vhost_site_url = conf.get('Invenio', 'CFG_SITE_URL').replace("http://", "")
    if vhost_site_url.startswith("https://"):
        ## The installation is configured to require HTTPS for any connection
        vhost_site_url = vhost_site_url.replace("https://", "")
    vhost_site_url_port = '80'
    vhost_site_secure_url = conf.get('Invenio', 'CFG_SITE_SECURE_URL').replace("https://", "")
    vhost_site_secure_url_port = '443'
    if ':' in vhost_site_url:
        vhost_site_url, vhost_site_url_port = vhost_site_url.split(':', 1)
    if ':' in vhost_site_secure_url:
        vhost_site_secure_url, vhost_site_secure_url_port = vhost_site_secure_url.split(':', 1)
    if vhost_site_url_port != '80' or vhost_site_secure_url_port != '443':
        listen_directive_needed = True

    ## OK, let's create Apache vhost files:
    if not os.path.exists(apache_conf_dir):
        os.mkdir(apache_conf_dir)
    apache_vhost_file = apache_conf_dir + os.sep + \
                        'invenio-apache-vhost.conf'
    apache_vhost_ssl_file = apache_conf_dir + os.sep + \
                            'invenio-apache-vhost-ssl.conf'
    apache_vhost_body = """\
AddDefaultCharset UTF-8
ServerSignature Off
ServerTokens Prod
NameVirtualHost %(vhost_ip_address)s:%(vhost_site_url_port)s
%(listen_directive)s
%(wsgi_socket_directive)s
WSGIRestrictStdout Off
<Files *.pyc>
   deny from all
</Files>
<Files *~>
   deny from all
</Files>
<VirtualHost %(vhost_ip_address)s:%(vhost_site_url_port)s>
        ServerName %(servername)s
        ServerAlias %(serveralias)s
        ServerAdmin %(serveradmin)s
        DocumentRoot %(webdir)s
        <Directory %(webdir)s>
           Options FollowSymLinks MultiViews
           AllowOverride None
           Order allow,deny
           Allow from all
        </Directory>
        ErrorLog %(logdir)s/apache.err
        LogLevel warn
        CustomLog %(logdir)s/apache.log combined
        DirectoryIndex index.en.html index.html
        Alias /img/ %(webdir)s/img/
        Alias /js/ %(webdir)s/js/
        Alias /flash/ %(webdir)s/flash/
        Alias /css/ %(webdir)s/css/
        Alias /export/ %(webdir)s/export/
        Alias /MathJax/ %(webdir)s/MathJax/
        Alias /jsCalendar/ %(webdir)s/jsCalendar/
        Alias /ckeditor/ %(webdir)s/ckeditor/
        Alias /mediaelement/ %(webdir)s/mediaelement/
        AliasMatch /sitemap-(.*) %(webdir)s/sitemap-$1
        Alias /robots.txt %(webdir)s/robots.txt
        Alias /favicon.ico %(webdir)s/favicon.ico
        WSGIDaemonProcess invenio processes=5 threads=1 display-name=%%{GROUP} inactivity-timeout=3600 maximum-requests=10000
        WSGIImportScript %(wsgidir)s/invenio.wsgi process-group=invenio application-group=%%{GLOBAL}
        WSGIScriptAlias / %(wsgidir)s/invenio.wsgi
        WSGIPassAuthorization On
        %(xsendfile_directive)s
        <Directory %(wsgidir)s>
           WSGIProcessGroup invenio
           WSGIApplicationGroup %%{GLOBAL}
           Options FollowSymLinks MultiViews
           AllowOverride None
           Order allow,deny
           Allow from all
        </Directory>
        %(deflate_directive)s
</VirtualHost>
""" % {'vhost_site_url_port': vhost_site_url_port,
       'servername': vhost_site_url,
       'serveralias': vhost_site_url.split('.')[0],
       'serveradmin': conf.get('Invenio', 'CFG_SITE_ADMIN_EMAIL'),
       'webdir': conf.get('Invenio', 'CFG_WEBDIR'),
       'logdir': conf.get('Invenio', 'CFG_LOGDIR'),
       'libdir' : conf.get('Invenio', 'CFG_PYLIBDIR'),
       'wsgidir': os.path.join(conf.get('Invenio', 'CFG_PREFIX'), 'var', 'www-wsgi'),
       'vhost_ip_address': vhost_ip_address_needed and _detect_ip_address() or '*',
       'listen_directive': listen_directive_needed and 'Listen ' + vhost_site_url_port or \
                           '#Listen ' + vhost_site_url_port,
       'wsgi_socket_directive': (wsgi_socket_directive_needed and \
                                 'WSGISocketPrefix ' or '#WSGISocketPrefix ') + \
                                conf.get('Invenio', 'CFG_PREFIX') + os.sep + 'var' + os.sep + 'run',
       'xsendfile_directive' : xsendfile_directive,
       'deflate_directive': deflate_directive,
      }
    apache_vhost_ssl_body = """\
ServerSignature Off
ServerTokens Prod
%(listen_directive)s
NameVirtualHost %(vhost_ip_address)s:%(vhost_site_secure_url_port)s
%(ssl_pem_directive)s
%(ssl_crt_directive)s
%(ssl_key_directive)s
WSGIRestrictStdout Off
<Files *.pyc>
   deny from all
</Files>
<Files *~>
   deny from all
</Files>
<VirtualHost %(vhost_ip_address)s:%(vhost_site_secure_url_port)s>
        ServerName %(servername)s
        ServerAlias %(serveralias)s
        ServerAdmin %(serveradmin)s
        SSLEngine on
        DocumentRoot %(webdir)s
        <Directory %(webdir)s>
           Options FollowSymLinks MultiViews
           AllowOverride None
           Order allow,deny
           Allow from all
        </Directory>
        ErrorLog %(logdir)s/apache-ssl.err
        LogLevel warn
        CustomLog %(logdir)s/apache-ssl.log combined
        DirectoryIndex index.en.html index.html
        Alias /img/ %(webdir)s/img/
        Alias /js/ %(webdir)s/js/
        Alias /flash/ %(webdir)s/flash/
        Alias /css/ %(webdir)s/css/
        Alias /export/ %(webdir)s/export/
        Alias /MathJax/ %(webdir)s/MathJax/
        Alias /jsCalendar/ %(webdir)s/jsCalendar/
        Alias /ckeditor/ %(webdir)s/ckeditor/
        Alias /mediaelement/ %(webdir)s/mediaelement/
        AliasMatch /sitemap-(.*) %(webdir)s/sitemap-$1
        Alias /robots.txt %(webdir)s/robots.txt
        Alias /favicon.ico %(webdir)s/favicon.ico
        RedirectMatch /sslredirect/(.*) http://$1
        WSGIScriptAlias / %(wsgidir)s/invenio.wsgi
        WSGIPassAuthorization On
        %(xsendfile_directive)s
        <Directory %(wsgidir)s>
           WSGIProcessGroup invenio
           WSGIApplicationGroup %%{GLOBAL}
           Options FollowSymLinks MultiViews
           AllowOverride None
           Order allow,deny
           Allow from all
        </Directory>
        %(deflate_directive)s
        %(shibboleth_directive)s
</VirtualHost>
""" % {'vhost_site_secure_url_port': vhost_site_secure_url_port,
       'servername': vhost_site_secure_url,
       'serveralias': vhost_site_secure_url.split('.')[0],
       'serveradmin': conf.get('Invenio', 'CFG_SITE_ADMIN_EMAIL'),
       'webdir': conf.get('Invenio', 'CFG_WEBDIR'),
       'logdir': conf.get('Invenio', 'CFG_LOGDIR'),
       'libdir' : conf.get('Invenio', 'CFG_PYLIBDIR'),
       'wsgidir' : os.path.join(conf.get('Invenio', 'CFG_PREFIX'), 'var', 'www-wsgi'),
       'vhost_ip_address': vhost_ip_address_needed and _detect_ip_address() or '*',
       'listen_directive' : listen_directive_needed and 'Listen ' + vhost_site_secure_url_port or \
                            '#Listen ' + vhost_site_secure_url_port,
       'ssl_pem_directive': ssl_pem_directive_needed and \
                            'SSLCertificateFile %s' % ssl_pem_path or \
                            '#SSLCertificateFile %s' % ssl_pem_path,
       'ssl_crt_directive': ssl_pem_directive_needed and \
                            '#SSLCertificateFile %s' % ssl_crt_path or \
                            'SSLCertificateFile %s' % ssl_crt_path,
       'ssl_key_directive': ssl_pem_directive_needed and \
                            '#SSLCertificateKeyFile %s' % ssl_key_path or \
                            'SSLCertificateKeyFile %s' % ssl_key_path,
       'xsendfile_directive' : xsendfile_directive,
       'deflate_directive': deflate_directive,
       'shibboleth_directive': shibboleth_directive,
      }

    # write HTTP vhost snippet:
    if os.path.exists(apache_vhost_file):
        shutil.copy(apache_vhost_file,
                    apache_vhost_file + '.OLD')
    fdesc = open(apache_vhost_file, 'w')
    fdesc.write(apache_vhost_body)
    fdesc.close()
    print
    print "Created file", apache_vhost_file

    # write HTTPS vhost snippet:
    vhost_ssl_created = False
    if conf.get('Invenio', 'CFG_SITE_SECURE_URL').startswith("https://"):
        if os.path.exists(apache_vhost_ssl_file):
            shutil.copy(apache_vhost_ssl_file,
                        apache_vhost_ssl_file + '.OLD')
        fdesc = open(apache_vhost_ssl_file, 'w')
        fdesc.write(apache_vhost_ssl_body)
        fdesc.close()
        vhost_ssl_created = True
        print "Created file", apache_vhost_ssl_file

    print wrap_text_in_a_box("""\
Apache virtual host configuration file(s) for your Invenio site
was(were) created.  Please check created file(s) and activate virtual
host(s).  For example, you can put the following include statements in
your httpd.conf:\n

Include %s

%s

Please see the INSTALL file for more details.
    """ % (apache_vhost_file,
           (vhost_ssl_created and 'Include ' or '#Include ') + apache_vhost_ssl_file))
    print ">>> Apache conf files created."

def cli_cmd_get(conf, varname):
    """
    Return value of VARNAME read from CONF files.  Useful for
    third-party programs to access values of conf options such as
    CFG_PREFIX.  Return None if VARNAME is not found.
    """
    # do not pay attention to upper/lower case:
    varname = varname.lower()
    # do not pay attention to section names yet:
    all_options = {}
    for section in conf.sections():
        for option in conf.options(section):
            all_options[option] = conf.get(section, option)
    return all_options.get(varname, None)

def cli_cmd_list(conf):
    """
    Print a list of all conf options and values from CONF.
    """
    sections = conf.sections()
    sections.sort()
    for section in sections:
        options = conf.options(section)
        options.sort()
        for option in options:
            print option.upper(), '=', conf.get(section, option)
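
## --get and --list are meant for wrapping Invenio in shell scripts.
## Note that cli_cmd_get() flattens all conf sections into a single
## case-insensitive namespace before the lookup.  Illustrative shell
## usage (the printed value depends on the local installation):
##
##   $ inveniocfg --get CFG_PREFIX
##   /opt/invenio
##   $ inveniocfg --list | grep '^CFG_SITE_LANG'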
""" import MySQLdb print ">>> Going to detect system details..." print "* Hostname: " + socket.gethostname() print "* Invenio version: " + conf.get("Invenio", "CFG_VERSION") print "* Python version: " + sys.version.replace("\n", " ") print "* Apache version: " + ";\n ".join(detect_apache_version()) print "* MySQLdb version: " + MySQLdb.__version__ try: from invenio.dbquery import run_sql print "* MySQL version:" for key, val in run_sql("SHOW VARIABLES LIKE 'version%'") + \ run_sql("SHOW VARIABLES LIKE 'charact%'") + \ run_sql("SHOW VARIABLES LIKE 'collat%'"): if False: print " - %s: %s" % (key, val) elif key in ['version', 'character_set_client', 'character_set_connection', 'character_set_database', 'character_set_results', 'character_set_server', 'character_set_system', 'collation_connection', 'collation_database', 'collation_server']: print " - %s: %s" % (key, val) except ImportError: print "* ERROR: cannot import dbquery" print ">>> System details detected successfully." def main(): """Main entry point.""" conf = ConfigParser() if '--help' in sys.argv or \ '-h' in sys.argv: print_usage() elif '--version' in sys.argv or \ '-V' in sys.argv: print_version() else: confdir = None if '--conf-dir' in sys.argv: try: confdir = sys.argv[sys.argv.index('--conf-dir') + 1] except IndexError: pass # missing --conf-dir argument value if not os.path.exists(confdir): print "ERROR: bad or missing --conf-dir option value." sys.exit(1) else: ## try to detect path to conf dir (relative to this bin dir): confdir = re.sub(r'/bin$', '/etc', sys.path[0]) ## read conf files: for conffile in [confdir + os.sep + 'invenio.conf', confdir + os.sep + 'invenio-autotools.conf', confdir + os.sep + 'invenio-local.conf',]: if os.path.exists(conffile): conf.read(conffile) else: if not conffile.endswith("invenio-local.conf"): # invenio-local.conf is optional, otherwise stop print "ERROR: Badly guessed conf file location", conffile print "(Please use --conf-dir option.)" sys.exit(1) ## decide what to do: done = False for opt_idx in range(0, len(sys.argv)): opt = sys.argv[opt_idx] if opt == '--conf-dir': # already treated before, so skip silently: pass elif opt == '--get': try: varname = sys.argv[opt_idx + 1] except IndexError: print "ERROR: bad or missing --get option value." sys.exit(1) if varname.startswith('-'): print "ERROR: bad or missing --get option value." 

def main():
    """Main entry point."""
    conf = ConfigParser()
    if '--help' in sys.argv or \
       '-h' in sys.argv:
        print_usage()
    elif '--version' in sys.argv or \
         '-V' in sys.argv:
        print_version()
    else:
        confdir = None
        if '--conf-dir' in sys.argv:
            try:
                confdir = sys.argv[sys.argv.index('--conf-dir') + 1]
            except IndexError:
                pass # missing --conf-dir argument value
            if not os.path.exists(confdir):
                print "ERROR: bad or missing --conf-dir option value."
                sys.exit(1)
        else:
            ## try to detect path to conf dir (relative to this bin dir):
            confdir = re.sub(r'/bin$', '/etc', sys.path[0])
        ## read conf files:
        for conffile in [confdir + os.sep + 'invenio.conf',
                         confdir + os.sep + 'invenio-autotools.conf',
                         confdir + os.sep + 'invenio-local.conf',]:
            if os.path.exists(conffile):
                conf.read(conffile)
            else:
                if not conffile.endswith("invenio-local.conf"):
                    # invenio-local.conf is optional, otherwise stop:
                    print "ERROR: Badly guessed conf file location", conffile
                    print "(Please use --conf-dir option.)"
                    sys.exit(1)
        ## decide what to do:
        done = False
        for opt_idx in range(0, len(sys.argv)):
            opt = sys.argv[opt_idx]
            if opt == '--conf-dir':
                # already treated before, so skip silently:
                pass
            elif opt == '--get':
                try:
                    varname = sys.argv[opt_idx + 1]
                except IndexError:
                    print "ERROR: bad or missing --get option value."
                    sys.exit(1)
                if varname.startswith('-'):
                    print "ERROR: bad or missing --get option value."
                    sys.exit(1)
                varvalue = cli_cmd_get(conf, varname)
                if varvalue is not None:
                    print varvalue
                else:
                    sys.exit(1)
                done = True
            elif opt == '--list':
                cli_cmd_list(conf)
                done = True
            elif opt == '--detect-system-details':
                cli_cmd_detect_system_details(conf)
                done = True
            elif opt == '--create-tables':
                cli_cmd_create_tables(conf)
                done = True
            elif opt == '--load-webstat-conf':
                cli_cmd_load_webstat_conf(conf)
                done = True
            elif opt == '--drop-tables':
                cli_cmd_drop_tables(conf)
                done = True
            elif opt == '--check-openoffice':
                cli_check_openoffice(conf)
                done = True
            elif opt == '--create-demo-site':
                cli_cmd_create_demo_site(conf)
                done = True
            elif opt == '--load-demo-records':
                cli_cmd_load_demo_records(conf)
                done = True
            elif opt == '--remove-demo-records':
                cli_cmd_remove_demo_records(conf)
                done = True
            elif opt == '--drop-demo-site':
                cli_cmd_drop_demo_site(conf)
                done = True
            elif opt == '--run-unit-tests':
                cli_cmd_run_unit_tests(conf)
                done = True
            elif opt == '--run-regression-tests':
                cli_cmd_run_regression_tests(conf)
                done = True
            elif opt == '--run-web-tests':
                cli_cmd_run_web_tests(conf)
                done = True
            elif opt == '--update-all':
                cli_cmd_update_config_py(conf)
                cli_cmd_update_dbquery_py(conf)
                cli_cmd_update_dbexec(conf)
                cli_cmd_update_bibconvert_tpl(conf)
                cli_cmd_update_web_tests(conf)
                done = True
            elif opt == '--update-config-py':
                cli_cmd_update_config_py(conf)
                done = True
            elif opt == '--update-dbquery-py':
                cli_cmd_update_dbquery_py(conf)
                done = True
            elif opt == '--update-dbexec':
                cli_cmd_update_dbexec(conf)
                done = True
            elif opt == '--update-bibconvert-tpl':
                cli_cmd_update_bibconvert_tpl(conf)
                done = True
            elif opt == '--update-web-tests':
                cli_cmd_update_web_tests(conf)
                done = True
            elif opt == '--reset-all':
                cli_cmd_reset_sitename(conf)
                cli_cmd_reset_siteadminemail(conf)
                cli_cmd_reset_fieldnames(conf)
                cli_cmd_reset_recstruct_cache(conf)
                done = True
            elif opt == '--reset-sitename':
                cli_cmd_reset_sitename(conf)
                done = True
            elif opt == '--reset-siteadminemail':
                cli_cmd_reset_siteadminemail(conf)
                done = True
            elif opt == '--reset-fieldnames':
                cli_cmd_reset_fieldnames(conf)
                done = True
            elif opt == '--reset-recstruct-cache':
                cli_cmd_reset_recstruct_cache(conf)
                done = True
            elif opt == '--create-apache-conf':
                cli_cmd_create_apache_conf(conf)
                done = True
            elif opt.startswith("-") and opt != '--yes-i-know':
                print "ERROR: unknown option", opt
                sys.exit(1)
        if not done:
            print """ERROR: Please specify a command.  Please see '--help'."""
            sys.exit(1)

if __name__ == '__main__':
    main()
diff --git a/modules/miscutil/sql/tabcreate.sql b/modules/miscutil/sql/tabcreate.sql
index eb01eadc9..16319220c 100644
--- a/modules/miscutil/sql/tabcreate.sql
+++ b/modules/miscutil/sql/tabcreate.sql
@@ -1,4009 +1,4009 @@
-- This file is part of Invenio.
-- Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 CERN.
--
-- Invenio is free software; you can redistribute it and/or
-- modify it under the terms of the GNU General Public License as
-- published by the Free Software Foundation; either version 2 of the
-- License, or (at your option) any later version.
--
-- Invenio is distributed in the hope that it will be useful, but
-- WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-- General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with Invenio; if not, write to the Free Software Foundation, Inc.,
-- 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
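-- The tables below implement Invenio's MARC storage model: each bibNNx
-- table holds the distinct (tag, value) pairs for MARC tags NN0 to NN9,
-- and each bibrec_bibNNx table links records (id_bibrec) to those
-- values (id_bibxxx), ordered by field_number.  Illustrative query
-- only, kept commented out because at this point of the file the
-- tables do not exist yet; fetching the MARC 245 (title) values of
-- record 97 would read:
--
--   SELECT b.value
--     FROM bib24x AS b
--     JOIN bibrec_bib24x AS r ON r.id_bibxxx = b.id
--    WHERE r.id_bibrec = 97 AND b.tag LIKE '245%';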
-- tables for bibliographic records:

CREATE TABLE IF NOT EXISTS bibrec (
  id mediumint(8) unsigned NOT NULL auto_increment,
  creation_date datetime NOT NULL default '0000-00-00',
  modification_date datetime NOT NULL default '0000-00-00',
  PRIMARY KEY (id),
  KEY creation_date (creation_date),
  KEY modification_date (modification_date)
) ENGINE=MyISAM;

CREATE TABLE IF NOT EXISTS bib00x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib01x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib02x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib03x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib04x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib05x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib06x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib07x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib08x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib09x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib10x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib11x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib12x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib13x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib14x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib15x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib16x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib17x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib18x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib19x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib20x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib21x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib22x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib23x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib24x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib25x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib26x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib27x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib28x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib29x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib30x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib31x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib32x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib33x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib34x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib35x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib36x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib37x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib38x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib39x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib40x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib41x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib42x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib43x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib44x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib45x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib46x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib47x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib48x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib49x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib50x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib51x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib52x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib53x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib54x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib55x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib56x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib57x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib58x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib59x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib60x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib61x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib62x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib63x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib64x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib65x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib66x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib67x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib68x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib69x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib70x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib71x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib72x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib73x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib74x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib75x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib76x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib77x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib78x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib79x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib80x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib81x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib82x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib83x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib84x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib85x (
  id mediumint(8) unsigned NOT NULL auto_increment,
  tag varchar(6) NOT NULL default '',
  value text NOT NULL,
  PRIMARY KEY (id),
  KEY kt (tag),
  KEY kv (value(100)) -- URLs need usually a larger index for speedy lookups
) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib86x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib87x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib88x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib89x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib90x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib91x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib92x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib93x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib94x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib95x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib96x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib97x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib98x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bib99x ( id mediumint(8) unsigned NOT NULL auto_increment, tag varchar(6) NOT NULL default '', value text NOT NULL, PRIMARY KEY (id), KEY kt (tag), KEY kv (value(35)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib00x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib01x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib02x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib03x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib04x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib05x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib06x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib07x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib08x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib09x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib10x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib11x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib12x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib13x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib14x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib15x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib16x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib17x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib18x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib19x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib20x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib21x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib22x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib23x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib24x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib25x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib26x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib27x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib28x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib29x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib30x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib31x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib32x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib33x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib34x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib35x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib36x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib37x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib38x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib39x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib40x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib41x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib42x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib43x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib44x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib45x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib46x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib47x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib48x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib49x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib50x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib51x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib52x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib53x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib54x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib55x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib56x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib57x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib58x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib59x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib60x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib61x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib62x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib63x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib64x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bib65x ( id_bibrec mediumint(8) unsigned NOT NULL
default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib66x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib67x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib68x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib69x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib70x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib71x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib72x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib73x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib74x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib75x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib76x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib77x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib78x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL 
default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib79x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib80x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib81x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib82x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib83x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib84x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib85x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib86x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib87x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib88x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib89x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib90x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib91x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default 
NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib92x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib93x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib94x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib95x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib96x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib97x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib98x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS bibrec_bib99x ( id_bibrec mediumint(8) unsigned NOT NULL default '0', id_bibxxx mediumint(8) unsigned NOT NULL default '0', field_number smallint(5) unsigned default NULL, KEY id_bibxxx (id_bibxxx), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM; -- tables for bibliographic records formatted: CREATE TABLE IF NOT EXISTS bibfmt ( id mediumint(8) unsigned NOT NULL auto_increment, id_bibrec int(8) unsigned NOT NULL default '0', format varchar(10) NOT NULL default '', last_updated datetime NOT NULL default '0000-00-00', value longblob, PRIMARY KEY (id), KEY id_bibrec (id_bibrec), KEY format (format) ) ENGINE=MyISAM; -- tables for index files: CREATE TABLE IF NOT EXISTS idxINDEX ( id mediumint(9) unsigned NOT NULL, name varchar(50) NOT NULL default '', description varchar(255) NOT NULL default '', last_updated datetime NOT NULL default '0000-00-00 00:00:00', stemming_language varchar(10) NOT NULL default '', PRIMARY KEY (id), UNIQUE KEY name (name) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS idxINDEXNAME ( id_idxINDEX mediumint(9) unsigned NOT NULL, ln char(5) NOT NULL default '', type char(3) NOT NULL default 'sn', value varchar(255) NOT NULL, PRIMARY KEY (id_idxINDEX,ln,type) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS idxINDEX_field ( id_idxINDEX mediumint(9) unsigned NOT NULL, id_field mediumint(9) unsigned NOT NULL, regexp_punctuation varchar(255) NOT NULL default "[\.\,\:\;\?\!\"]", regexp_alphanumeric_separators varchar(255) NOT NULL default "[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~]", PRIMARY KEY (id_idxINDEX,id_field) ) 
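-- Illustrative sketch (not part of the schema): each bibrec_bibNNx link
-- table above joins records to MARC values stored in the companion bibNNx
-- table defined earlier in this file (assumed here to carry id, tag and
-- value columns). E.g., reading the 245 field values of record 42:
--   SELECT b.value
--     FROM bib24x AS b
--     JOIN bibrec_bib24x AS bb ON bb.id_bibxxx = b.id
--    WHERE bb.id_bibrec = 42 AND b.tag LIKE '245%'
--    ORDER BY bb.field_number;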
-- this comment line here is just to fix the SQL display mode in Emacs '
CREATE TABLE IF NOT EXISTS idxWORD01F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(50) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD01R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD02F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(50) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD02R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD03F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(50) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD03R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD04F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(50) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD04R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD05F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(50) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD05R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD06F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(50) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD06R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD07F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(50) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD07R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD08F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(50) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD08R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD09F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(50) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD09R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD10F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(50) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD10R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD11F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(50) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD11R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD12F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(50) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD12R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD13F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(50) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD13R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD14F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(50) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD14R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD15F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(50) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD15R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD16F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(50) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD16R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD17F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(50) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxWORD17R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
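-- Illustrative sketch: each F/R pair above forms a forward and a reverse
-- index for one logical index number. A term lookup reads the forward
-- table and deserializes the hitlist blob application-side:
--   SELECT hitlist FROM idxWORD01F WHERE term = 'ellis';
-- The reverse table keeps each record's termlist in 'CURRENT', 'FUTURE'
-- and 'TEMPORARY' states, which lets the indexer diff old and new
-- termlists during reindexing (an interpretation of the enum, not stated
-- in this file).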
CREATE TABLE IF NOT EXISTS idxPAIR01F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(100) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR01R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR02F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(100) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR02R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR03F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(100) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR03R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR04F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(100) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR04R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR05F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(100) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR05R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR06F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(100) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR06R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR07F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(100) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR07R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR08F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(100) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR08R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR09F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(100) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR09R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR10F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(100) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR10R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR11F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(100) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR11R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR12F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(100) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR12R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR13F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(100) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR13R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR14F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(100) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR14R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR15F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(100) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR15R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR16F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(100) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR16R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR17F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(100) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPAIR17R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
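-- Illustrative sketch (assumption, not stated in this file): the pair
-- tables index two adjacent words per term, hence term varchar(100)
-- rather than the word tables' varchar(50), e.g.:
--   SELECT hitlist FROM idxPAIR01F WHERE term = 'higgs boson';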
CREATE TABLE IF NOT EXISTS idxPHRASE01F ( id mediumint(9) unsigned NOT NULL auto_increment, term text default NULL, hitlist longblob, PRIMARY KEY (id), KEY term (term(50)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE01R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE02F ( id mediumint(9) unsigned NOT NULL auto_increment, term text default NULL, hitlist longblob, PRIMARY KEY (id), KEY term (term(50)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE02R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE03F ( id mediumint(9) unsigned NOT NULL auto_increment, term text default NULL, hitlist longblob, PRIMARY KEY (id), KEY term (term(50)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE03R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE04F ( id mediumint(9) unsigned NOT NULL auto_increment, term text default NULL, hitlist longblob, PRIMARY KEY (id), KEY term (term(50)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE04R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE05F ( id mediumint(9) unsigned NOT NULL auto_increment, term text default NULL, hitlist longblob, PRIMARY KEY (id), KEY term (term(50)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE05R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE06F ( id mediumint(9) unsigned NOT NULL auto_increment, term text default NULL, hitlist longblob, PRIMARY KEY (id), KEY term (term(50)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE06R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE07F ( id mediumint(9) unsigned NOT NULL auto_increment, term text default NULL, hitlist longblob, PRIMARY KEY (id), KEY term (term(50)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE07R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE08F ( id mediumint(9) unsigned NOT NULL auto_increment, term text default NULL, hitlist longblob, PRIMARY KEY (id), KEY term (term(50)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE08R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE09F ( id mediumint(9) unsigned NOT NULL auto_increment, term text default NULL, hitlist longblob, PRIMARY KEY (id), KEY term (term(50)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE09R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE10F ( id mediumint(9) unsigned NOT NULL auto_increment, term text default NULL, hitlist longblob, PRIMARY KEY (id), KEY term (term(50)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE10R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE11F ( id mediumint(9) unsigned NOT NULL auto_increment, term text default NULL, hitlist longblob, PRIMARY KEY (id), KEY term (term(50)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE11R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE12F ( id mediumint(9) unsigned NOT NULL auto_increment, term text default NULL, hitlist longblob, PRIMARY KEY (id), KEY term (term(50)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE12R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE13F ( id mediumint(9) unsigned NOT NULL auto_increment, term text default NULL, hitlist longblob, PRIMARY KEY (id), KEY term (term(50)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE13R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE14F ( id mediumint(9) unsigned NOT NULL auto_increment, term text default NULL, hitlist longblob, PRIMARY KEY (id), KEY term (term(50)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE14R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE15F ( id mediumint(9) unsigned NOT NULL auto_increment, term text default NULL, hitlist longblob, PRIMARY KEY (id), KEY term (term(50)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE15R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE16F ( id mediumint(9) unsigned NOT NULL auto_increment, term text default NULL, hitlist longblob, PRIMARY KEY (id), KEY term (term(50)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE16R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE17F ( id mediumint(9) unsigned NOT NULL auto_increment, term text default NULL, hitlist longblob, PRIMARY KEY (id), KEY term (term(50)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS idxPHRASE17R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
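-- Illustrative sketch: phrase terms are full text values, so the F
-- tables above use a non-unique 50-character prefix KEY rather than a
-- UNIQUE KEY; a lookup is still an exact-term match, e.g.:
--   SELECT hitlist FROM idxPHRASE01F WHERE term = 'standard model';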
-- tables for ranking:
CREATE TABLE IF NOT EXISTS rnkMETHOD ( id mediumint(9) unsigned NOT NULL auto_increment, name varchar(20) NOT NULL default '', last_updated datetime NOT NULL default '0000-00-00 00:00:00', PRIMARY KEY (id), UNIQUE KEY name (name) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS rnkMETHODNAME ( id_rnkMETHOD mediumint(9) unsigned NOT NULL, ln char(5) NOT NULL default '', type char(3) NOT NULL default 'sn', value varchar(255) NOT NULL, PRIMARY KEY (id_rnkMETHOD,ln,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS rnkMETHODDATA ( id_rnkMETHOD mediumint(9) unsigned NOT NULL, relevance_data longblob, PRIMARY KEY (id_rnkMETHOD) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS collection_rnkMETHOD ( id_collection mediumint(9) unsigned NOT NULL, id_rnkMETHOD mediumint(9) unsigned NOT NULL, score tinyint(4) unsigned NOT NULL default '0', PRIMARY KEY (id_collection,id_rnkMETHOD) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS rnkWORD01F ( id mediumint(9) unsigned NOT NULL auto_increment, term varchar(50) default NULL, hitlist longblob, PRIMARY KEY (id), UNIQUE KEY term (term) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS rnkWORD01R ( id_bibrec mediumint(9) unsigned NOT NULL, termlist longblob, type enum('CURRENT','FUTURE','TEMPORARY') NOT NULL default 'CURRENT', PRIMARY KEY (id_bibrec,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS rnkAUTHORDATA ( aterm varchar(50) default NULL, hitlist longblob, UNIQUE KEY aterm (aterm) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS rnkPAGEVIEWS ( id_bibrec mediumint(8) unsigned default NULL, id_user int(15) unsigned default '0', client_host int(10) unsigned default NULL, view_time datetime default '0000-00-00 00:00:00', KEY view_time (view_time), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS rnkDOWNLOADS ( id_bibrec mediumint(8) unsigned default NULL, download_time datetime default '0000-00-00 00:00:00', client_host int(10) unsigned default NULL, id_user int(15) unsigned default NULL, id_bibdoc mediumint(9) unsigned default NULL, file_version smallint(2) unsigned default NULL, file_format varchar(10) NULL default NULL, KEY download_time (download_time), KEY id_bibrec (id_bibrec) ) ENGINE=MyISAM;
-- a table for citations: record-cites-record
CREATE TABLE IF NOT EXISTS rnkCITATIONDATA ( id mediumint(8) unsigned NOT NULL auto_increment, object_name varchar(255) NOT NULL, object_value longblob, last_updated datetime NOT NULL default '0000-00-00', PRIMARY KEY id (id), UNIQUE KEY object_name (object_name) ) ENGINE=MyISAM;
-- a table for missing citations. This should be scanned by a program
-- occasionally to check if some publication has been cited more than
-- 50 times (or such), and alert cataloguers to create a record for that
-- external citation
--
-- id_bibrec is the id of the record; extcitepubinfo is publication info
-- that generally looks like hep-th/0112088
CREATE TABLE IF NOT EXISTS rnkCITATIONDATAEXT ( id_bibrec int(8) unsigned, extcitepubinfo varchar(255) NOT NULL, PRIMARY KEY (id_bibrec, extcitepubinfo), KEY extcitepubinfo (extcitepubinfo) ) ENGINE=MyISAM;
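-- Illustrative sketch of the periodic scan described above (the 50-cite
-- threshold follows the comment; the actual program is not part of this
-- file):
--   SELECT extcitepubinfo, COUNT(*) AS cites
--     FROM rnkCITATIONDATAEXT
--    GROUP BY extcitepubinfo
--    HAVING COUNT(*) > 50
--    ORDER BY cites DESC;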
-- tables for collections and collection tree:
CREATE TABLE IF NOT EXISTS collection ( id mediumint(9) unsigned NOT NULL auto_increment, name varchar(255) NOT NULL, dbquery text, nbrecs int(10) unsigned default '0', reclist longblob, PRIMARY KEY (id), UNIQUE KEY name (name), KEY dbquery (dbquery(50)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS collectionname ( id_collection mediumint(9) unsigned NOT NULL, ln char(5) NOT NULL default '', type char(3) NOT NULL default 'sn', value varchar(255) NOT NULL, PRIMARY KEY (id_collection,ln,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS collection_collection ( id_dad mediumint(9) unsigned NOT NULL, id_son mediumint(9) unsigned NOT NULL, type char(1) NOT NULL default 'r', score tinyint(4) unsigned NOT NULL default '0', PRIMARY KEY (id_dad,id_son) ) ENGINE=MyISAM;
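-- Illustrative sketch: the collection tree is walked by following
-- dad->son links, e.g. the regular ('r') children of collection 1, in
-- score order:
--   SELECT c.name
--     FROM collection AS c
--     JOIN collection_collection AS cc ON cc.id_son = c.id
--    WHERE cc.id_dad = 1 AND cc.type = 'r'
--    ORDER BY cc.score DESC;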
-- tables for OAI sets:
CREATE TABLE IF NOT EXISTS oaiREPOSITORY (
  id mediumint(9) unsigned NOT NULL auto_increment,
  setName varchar(255) NOT NULL default '',
- setSpec varchar(255) NOT NULL default '',
+ setSpec varchar(255) NOT NULL default 'GLOBAL_SET',
  setCollection varchar(255) NOT NULL default '',
  setDescription text NOT NULL default '',
  setDefinition text NOT NULL default '',
  setRecList longblob,
  p1 text NOT NULL default '',
  f1 text NOT NULL default '',
  m1 text NOT NULL default '',
  p2 text NOT NULL default '',
  f2 text NOT NULL default '',
  m2 text NOT NULL default '',
  p3 text NOT NULL default '',
  f3 text NOT NULL default '',
  m3 text NOT NULL default '',
  PRIMARY KEY (id)
) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS oaiHARVEST ( id mediumint(9) unsigned NOT NULL auto_increment, baseurl varchar(255) NOT NULL default '', metadataprefix varchar(255) NOT NULL default 'oai_dc', arguments text, comment text, bibconvertcfgfile varchar(255), name varchar(255) NOT NULL, lastrun datetime, frequency mediumint(12) NOT NULL default '0', postprocess varchar(20) NOT NULL default 'h', bibfilterprogram varchar(255) NOT NULL default '', setspecs text NOT NULL default '', PRIMARY KEY (id) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS oaiHARVESTLOG (
  id_oaiHARVEST mediumint(9) unsigned NOT NULL REFERENCES oaiHARVEST, -- source we harvest from
  id_bibrec mediumint(8) unsigned NOT NULL default '0', -- internal record id (filled by bibupload)
  bibupload_task_id int NOT NULL default 0, -- bibupload task number
  oai_id varchar(40) NOT NULL default "", -- OAI record identifier we harvested
  date_harvested datetime NOT NULL default '0000-00-00', -- when we harvested
  date_inserted datetime NOT NULL default '0000-00-00', -- when it was inserted
  inserted_to_db char(1) NOT NULL default 'P', -- where it was inserted (P=prod, H=holding-pen, etc)
  PRIMARY KEY (bibupload_task_id, oai_id, date_harvested)
) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibHOLDINGPEN (
  changeset_id INT NOT NULL AUTO_INCREMENT, -- the identifier of the changeset stored in the holding pen
  changeset_date datetime NOT NULL DEFAULT '0000-00-00 00:00:00', -- when the changeset was inserted
  changeset_xml TEXT NOT NULL DEFAULT '',
  oai_id varchar(40) NOT NULL DEFAULT '', -- OAI identifier of concerned record
  id_bibrec mediumint(8) unsigned NOT NULL default '0', -- record ID of concerned record (filled by bibupload)
  PRIMARY KEY (changeset_id),
  KEY changeset_date (changeset_date),
  KEY id_bibrec (id_bibrec)
) ENGINE=MyISAM;
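-- Illustrative sketch: since bibupload fills id_bibrec after insertion,
-- the harvesting history of one OAI record can be inspected with (the
-- oai_id value is hypothetical):
--   SELECT date_harvested, date_inserted, inserted_to_db, id_bibrec
--     FROM oaiHARVESTLOG
--    WHERE oai_id = 'oai:arXiv.org:hep-th/0112088'
--    ORDER BY date_harvested;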
-- tables for portal elements:
CREATE TABLE IF NOT EXISTS collection_portalbox ( id_collection mediumint(9) unsigned NOT NULL, id_portalbox mediumint(9) unsigned NOT NULL, ln char(5) NOT NULL default '', position char(3) NOT NULL default 'top', score tinyint(4) unsigned NOT NULL default '0', PRIMARY KEY (id_collection,id_portalbox,ln) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS portalbox ( id mediumint(9) unsigned NOT NULL auto_increment, title text NOT NULL, body text NOT NULL, UNIQUE KEY id (id) ) ENGINE=MyISAM;
-- tables for search examples:
CREATE TABLE IF NOT EXISTS collection_example ( id_collection mediumint(9) unsigned NOT NULL, id_example mediumint(9) unsigned NOT NULL, score tinyint(4) unsigned NOT NULL default '0', PRIMARY KEY (id_collection,id_example) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS example ( id mediumint(9) unsigned NOT NULL auto_increment, type text NOT NULL default '', body text NOT NULL, PRIMARY KEY (id) ) ENGINE=MyISAM;
-- tables for collection formats:
CREATE TABLE IF NOT EXISTS collection_format ( id_collection mediumint(9) unsigned NOT NULL, id_format mediumint(9) unsigned NOT NULL, score tinyint(4) unsigned NOT NULL default '0', PRIMARY KEY (id_collection,id_format) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS format ( id mediumint(9) unsigned NOT NULL auto_increment, name varchar(255) NOT NULL, code varchar(6) NOT NULL, description varchar(255) default '', content_type varchar(255) default '', visibility tinyint NOT NULL default '1', PRIMARY KEY (id), UNIQUE KEY code (code) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS formatname ( id_format mediumint(9) unsigned NOT NULL, ln char(5) NOT NULL default '', type char(3) NOT NULL default 'sn', value varchar(255) NOT NULL, PRIMARY KEY (id_format,ln,type) ) ENGINE=MyISAM;
-- tables for collection detailed page options
CREATE TABLE IF NOT EXISTS collectiondetailedrecordpagetabs ( id_collection mediumint(9) unsigned NOT NULL, tabs varchar(255) NOT NULL default '', PRIMARY KEY (id_collection) ) ENGINE=MyISAM;
-- tables for search options and MARC tags:
CREATE TABLE IF NOT EXISTS collection_field_fieldvalue ( id_collection mediumint(9) unsigned NOT NULL, id_field mediumint(9) unsigned NOT NULL, id_fieldvalue mediumint(9) unsigned, type char(3) NOT NULL default 'src', score tinyint(4) unsigned NOT NULL default '0', score_fieldvalue tinyint(4) unsigned NOT NULL default '0', KEY id_collection (id_collection), KEY id_field (id_field), KEY id_fieldvalue (id_fieldvalue) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS field ( id mediumint(9) unsigned NOT NULL auto_increment, name varchar(255) NOT NULL, code varchar(255) NOT NULL, PRIMARY KEY (id), UNIQUE KEY code (code) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS fieldname ( id_field mediumint(9) unsigned NOT NULL, ln char(5) NOT NULL default '', type char(3) NOT NULL default 'sn', value varchar(255) NOT NULL, PRIMARY KEY (id_field,ln,type) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS fieldvalue ( id mediumint(9) unsigned NOT NULL auto_increment, name varchar(255) NOT NULL, value text NOT NULL, PRIMARY KEY (id) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS field_tag ( id_field mediumint(9) unsigned NOT NULL, id_tag mediumint(9) unsigned NOT NULL, score tinyint(4) unsigned NOT NULL default '0', PRIMARY KEY (id_field,id_tag) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS tag ( id mediumint(9) unsigned NOT NULL auto_increment, name varchar(255) NOT NULL, value char(6) NOT NULL, PRIMARY KEY (id) ) ENGINE=MyISAM;
-- tables for file management
CREATE TABLE IF NOT EXISTS bibdoc ( id mediumint(9) unsigned NOT NULL auto_increment, status text NOT NULL default '', docname varchar(250) COLLATE utf8_bin NOT NULL default 'file', creation_date datetime NOT NULL default '0000-00-00', modification_date datetime NOT NULL default '0000-00-00', text_extraction_date datetime NOT NULL default '0000-00-00', more_info mediumblob NULL default NULL, PRIMARY KEY (id), KEY docname (docname), KEY creation_date (creation_date), KEY modification_date (modification_date) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibrec_bibdoc ( id_bibrec mediumint(9) unsigned NOT NULL default '0', id_bibdoc mediumint(9) unsigned NOT NULL default '0', type varchar(255), KEY (id_bibrec), KEY (id_bibdoc) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bibdoc_bibdoc ( id_bibdoc1 mediumint(9) unsigned NOT NULL, id_bibdoc2 mediumint(9) unsigned NOT NULL, type varchar(255), KEY (id_bibdoc1), KEY (id_bibdoc2) ) ENGINE=MyISAM;
-- tables for publication requests:
CREATE TABLE IF NOT EXISTS publreq ( id int(11) NOT NULL auto_increment, host varchar(255) NOT NULL default '', date varchar(255) NOT NULL default '', name varchar(255) NOT NULL default '', email varchar(255) NOT NULL default '', address text NOT NULL, publication text NOT NULL, PRIMARY KEY (id) ) ENGINE=MyISAM;
-- tables for sessions and users:
CREATE TABLE IF NOT EXISTS session ( session_key varchar(32) NOT NULL default '', session_expiry int(11) unsigned NOT NULL default '0', session_object longblob, uid int(15) unsigned NOT NULL, UNIQUE KEY session_key (session_key), KEY uid (uid) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS user ( id int(15) unsigned NOT NULL auto_increment, email varchar(255) NOT NULL default '', password blob NOT NULL, note varchar(255) default NULL, settings blob default NULL, nickname varchar(255) NOT NULL default '', last_login datetime NOT NULL default '0000-00-00 00:00:00', PRIMARY KEY id (id), KEY email (email), KEY nickname (nickname) ) ENGINE=MyISAM;
-- tables for usergroups
CREATE TABLE IF NOT EXISTS usergroup ( id int(15) unsigned NOT NULL auto_increment, name varchar(255) NOT NULL default '', description text default '', join_policy char(2) NOT NULL default '', login_method varchar(255) NOT NULL default 'INTERNAL', PRIMARY KEY (id), UNIQUE KEY login_method_name (login_method(70), name), KEY name (name) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS user_usergroup ( id_user int(15) unsigned NOT NULL default '0', id_usergroup int(15) unsigned NOT NULL default '0', user_status char(1) NOT NULL default '', user_status_date datetime NOT NULL default '0000-00-00 00:00:00', KEY id_user (id_user), KEY id_usergroup (id_usergroup) ) ENGINE=MyISAM;
-- tables for access control engine
CREATE TABLE IF NOT EXISTS accROLE ( id int(15) unsigned NOT NULL auto_increment, name varchar(32), description varchar(255), firerole_def_ser blob NULL, firerole_def_src text NULL, PRIMARY KEY (id), UNIQUE KEY name (name) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS user_accROLE ( id_user int(15) unsigned NOT NULL, id_accROLE int(15) unsigned NOT NULL, expiration datetime NOT NULL default '9999-12-31 23:59:59', PRIMARY KEY (id_user, id_accROLE) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS accMAILCOOKIE ( id int(15) unsigned NOT NULL auto_increment, data blob NOT NULL, expiration datetime NOT NULL default '9999-12-31 23:59:59', kind varchar(32) NOT NULL, onetime boolean NOT NULL default 0, status char(1) NOT NULL default 'W', PRIMARY KEY (id), KEY expiration (expiration) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS accACTION ( id int(15) unsigned NOT NULL auto_increment, name varchar(32), description varchar(255), allowedkeywords varchar(255), optional ENUM ('yes', 'no') NOT NULL default 'no', PRIMARY KEY (id), UNIQUE KEY name (name) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS accARGUMENT ( id int(15) unsigned NOT NULL auto_increment, keyword varchar(32), value varchar(255), PRIMARY KEY (id), KEY KEYVAL (keyword, value) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS accROLE_accACTION_accARGUMENT ( id_accROLE int(15), id_accACTION int(15), id_accARGUMENT int(15), argumentlistid mediumint(8), KEY id_accROLE (id_accROLE), KEY id_accACTION (id_accACTION), KEY id_accARGUMENT (id_accARGUMENT) ) ENGINE=MyISAM;
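-- Illustrative sketch: "may user U perform action A?" reduces to a join
-- across the access control tables above (the action name is
-- hypothetical, chosen for illustration):
--   SELECT COUNT(*)
--     FROM user_accROLE AS ur
--     JOIN accROLE_accACTION_accARGUMENT AS raa ON raa.id_accROLE = ur.id_accROLE
--     JOIN accACTION AS act ON act.id = raa.id_accACTION
--    WHERE ur.id_user = 1
--      AND act.name = 'cfgwebsearch'
--      AND ur.expiration >= NOW();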
-- tables for personal/collaborative features (baskets, alerts, searches, messages, usergroups):
CREATE TABLE IF NOT EXISTS user_query ( id_user int(15) unsigned NOT NULL default '0', id_query int(15) unsigned NOT NULL default '0', hostname varchar(50) default 'unknown host', date datetime default NULL, KEY id_user (id_user,id_query) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS query ( id int(15) unsigned NOT NULL auto_increment, type char(1) NOT NULL default 'r', urlargs text NOT NULL, PRIMARY KEY (id), KEY urlargs (urlargs(100)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS user_query_basket ( id_user int(15) unsigned NOT NULL default '0', id_query int(15) unsigned NOT NULL default '0', id_basket int(15) unsigned NOT NULL default '0', frequency varchar(5) NOT NULL default '', date_creation date default NULL, date_lastrun date default '0000-00-00', alert_name varchar(30) NOT NULL default '', notification char(1) NOT NULL default 'y', PRIMARY KEY (id_user,id_query,frequency,id_basket), KEY alert_name (alert_name) ) ENGINE=MyISAM;
-- baskets
CREATE TABLE IF NOT EXISTS bskBASKET ( id int(15) unsigned NOT NULL auto_increment, id_owner int(15) unsigned NOT NULL default '0', name varchar(50) NOT NULL default '', date_modification datetime NOT NULL default '0000-00-00 00:00:00', nb_views int(15) NOT NULL default '0', PRIMARY KEY (id), KEY id_owner (id_owner), KEY name (name) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bskREC ( id_bibrec_or_bskEXTREC int(16) NOT NULL default '0', id_bskBASKET int(15) unsigned NOT NULL default '0', id_user_who_added_item int(15) NOT NULL default '0', score int(15) NOT NULL default '0', date_added datetime NOT NULL default '0000-00-00 00:00:00', PRIMARY KEY (id_bibrec_or_bskEXTREC,id_bskBASKET), KEY id_bibrec_or_bskEXTREC (id_bibrec_or_bskEXTREC), KEY id_bskBASKET (id_bskBASKET), KEY score (score), KEY date_added (date_added) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bskEXTREC ( id int(15) unsigned NOT NULL auto_increment, external_id int(15) NOT NULL default '0', collection_id int(15) unsigned NOT NULL default '0', original_url text, creation_date datetime NOT NULL default '0000-00-00 00:00:00', modification_date datetime NOT NULL default '0000-00-00 00:00:00', PRIMARY KEY (id) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bskEXTFMT ( id int(15) unsigned NOT NULL auto_increment, id_bskEXTREC int(15) unsigned NOT NULL default '0', format varchar(10) NOT NULL default '', last_updated datetime NOT NULL default '0000-00-00 00:00:00', value longblob, PRIMARY KEY (id), KEY id_bskEXTREC (id_bskEXTREC), KEY format (format) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS user_bskBASKET ( id_user int(15) unsigned NOT NULL default '0', id_bskBASKET int(15) unsigned NOT NULL default '0', topic varchar(50) NOT NULL default '', PRIMARY KEY (id_user,id_bskBASKET), KEY id_user (id_user), KEY id_bskBASKET (id_bskBASKET) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS usergroup_bskBASKET ( id_usergroup int(15) unsigned NOT NULL default '0', id_bskBASKET int(15) unsigned NOT NULL default '0', topic varchar(50) NOT NULL default '', date_shared datetime NOT NULL default '0000-00-00 00:00:00', share_level char(2) NOT NULL default '', PRIMARY KEY (id_usergroup,id_bskBASKET), KEY id_usergroup (id_usergroup), KEY id_bskBASKET (id_bskBASKET) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS bskRECORDCOMMENT ( id int(15) unsigned NOT NULL auto_increment, id_bibrec_or_bskEXTREC int(16) NOT NULL default '0', id_bskBASKET int(15) unsigned NOT NULL default '0', id_user int(15) unsigned NOT NULL default '0', title varchar(255) NOT NULL default '', body text NOT NULL, date_creation datetime NOT NULL default '0000-00-00 00:00:00', priority int(15) NOT NULL default '0', in_reply_to_id_bskRECORDCOMMENT int(15) unsigned NOT NULL default '0', reply_order_cached_data blob NULL default NULL, PRIMARY KEY (id), KEY id_bskBASKET (id_bskBASKET), KEY id_bibrec_or_bskEXTREC (id_bibrec_or_bskEXTREC), KEY date_creation (date_creation), KEY in_reply_to_id_bskRECORDCOMMENT (in_reply_to_id_bskRECORDCOMMENT), INDEX (reply_order_cached_data(40)) ) ENGINE=MyISAM;
-- tables for messaging system
CREATE TABLE IF NOT EXISTS msgMESSAGE ( id int(15) unsigned NOT NULL auto_increment, id_user_from int(15) unsigned NOT NULL default '0', sent_to_user_nicks text NOT NULL default '', sent_to_group_names text NOT NULL default '', subject text NOT NULL default '', body text default NULL, sent_date datetime NOT NULL default '0000-00-00 00:00:00', received_date datetime NULL default '0000-00-00 00:00:00', PRIMARY KEY id (id), KEY id_user_from (id_user_from) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS user_msgMESSAGE ( id_user_to int(15) unsigned NOT NULL default '0', id_msgMESSAGE int(15) unsigned NOT NULL default '0', status char(1) NOT NULL default 'N', PRIMARY KEY id (id_user_to, id_msgMESSAGE), KEY id_user_to (id_user_to), KEY id_msgMESSAGE (id_msgMESSAGE) ) ENGINE=MyISAM;
-- tables for WebComment
CREATE TABLE IF NOT EXISTS cmtRECORDCOMMENT ( id int(15) unsigned NOT NULL auto_increment, id_bibrec int(15) unsigned NOT NULL default '0', id_user int(15) unsigned NOT NULL default '0', title varchar(255) NOT NULL default '', body text NOT NULL default '', date_creation datetime NOT NULL default '0000-00-00 00:00:00', star_score tinyint(5) unsigned NOT NULL default '0', nb_votes_yes int(10) NOT NULL default '0', nb_votes_total int(10) unsigned NOT NULL default '0', nb_abuse_reports int(10) NOT NULL default '0', status char(2) NOT NULL default 'ok', round_name varchar(255) NOT NULL default '', restriction varchar(50) NOT NULL default '', in_reply_to_id_cmtRECORDCOMMENT int(15) unsigned NOT NULL default '0', reply_order_cached_data blob NULL default NULL, PRIMARY KEY (id), KEY id_bibrec (id_bibrec), KEY id_user (id_user), KEY status (status), KEY in_reply_to_id_cmtRECORDCOMMENT (in_reply_to_id_cmtRECORDCOMMENT), INDEX (reply_order_cached_data(40)) ) ENGINE=MyISAM;
CREATE TABLE IF NOT EXISTS cmtACTIONHISTORY ( id_cmtRECORDCOMMENT int(15) unsigned NULL, id_bibrec int(15) unsigned NULL, id_user int(15) unsigned NULL default NULL, client_host int(10) unsigned default NULL, action_time datetime NOT NULL default '0000-00-00 00:00:00', action_code char(1) NOT NULL, KEY id_cmtRECORDCOMMENT (id_cmtRECORDCOMMENT), KEY client_host (client_host), KEY id_user (id_user), KEY action_code (action_code) ) ENGINE=MyISAM;
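-- Illustrative sketch: comment threading hangs on
-- in_reply_to_id_cmtRECORDCOMMENT (0 marks a top-level comment), with
-- reply_order_cached_data as a display-order cache. Top-level comments
-- of record 42, newest first:
--   SELECT id, title, date_creation
--     FROM cmtRECORDCOMMENT
--    WHERE id_bibrec = 42 AND status = 'ok'
--      AND in_reply_to_id_cmtRECORDCOMMENT = 0
--    ORDER BY date_creation DESC;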
cmtSUBSCRIPTION ( id_bibrec mediumint(8) unsigned NOT NULL, id_user int(15) unsigned NOT NULL, creation_time datetime NOT NULL default '0000-00-00 00:00:00', KEY id_user (id_bibrec, id_user) ) ENGINE=MyISAM; -- tables for BibKnowledge: CREATE TABLE IF NOT EXISTS knwKB ( id mediumint(8) unsigned NOT NULL auto_increment, name varchar(255) default '', description text default '', kbtype char default NULL, PRIMARY KEY (id), UNIQUE KEY name (name) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS knwKBRVAL ( id mediumint(8) unsigned NOT NULL auto_increment, m_key varchar(255) NOT NULL default '', m_value text NOT NULL default '', id_knwKB mediumint(8) NOT NULL default '0', PRIMARY KEY (id), KEY id_knwKB (id_knwKB), KEY m_key (m_key(30)), KEY m_value (m_value(30)) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS knwKBDDEF ( id_knwKB mediumint(8) unsigned NOT NULL, id_collection mediumint(9), output_tag text default '', search_expression text default '', PRIMARY KEY (id_knwKB) ) ENGINE=MyISAM; -- tables for WebSubmit: CREATE TABLE IF NOT EXISTS sbmACTION ( lactname text, sactname char(3) NOT NULL default '', dir text, cd date default NULL, md date default NULL, actionbutton text, statustext text, PRIMARY KEY (sactname) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS sbmALLFUNCDESCR ( function varchar(40) NOT NULL default '', description tinytext, PRIMARY KEY (function) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS sbmAPPROVAL ( doctype varchar(10) NOT NULL default '', categ varchar(50) NOT NULL default '', rn varchar(50) NOT NULL default '', status varchar(10) NOT NULL default '', dFirstReq datetime NOT NULL default '0000-00-00 00:00:00', dLastReq datetime NOT NULL default '0000-00-00 00:00:00', dAction datetime NOT NULL default '0000-00-00 00:00:00', access varchar(20) NOT NULL default '0', note text NOT NULL default '', PRIMARY KEY (rn) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS sbmCPLXAPPROVAL ( doctype varchar(10) NOT NULL default '', categ varchar(50) NOT NULL default '', rn varchar(50) NOT NULL default '', type varchar(10) NOT NULL, status varchar(10) NOT NULL, id_group int(15) unsigned NOT NULL default '0', id_bskBASKET int(15) unsigned NOT NULL default '0', id_EdBoardGroup int(15) unsigned NOT NULL default '0', dFirstReq datetime NOT NULL default '0000-00-00 00:00:00', dLastReq datetime NOT NULL default '0000-00-00 00:00:00', dEdBoardSel datetime NOT NULL default '0000-00-00 00:00:00', dRefereeSel datetime NOT NULL default '0000-00-00 00:00:00', dRefereeRecom datetime NOT NULL default '0000-00-00 00:00:00', dEdBoardRecom datetime NOT NULL default '0000-00-00 00:00:00', dPubComRecom datetime NOT NULL default '0000-00-00 00:00:00', dProjectLeaderAction datetime NOT NULL default '0000-00-00 00:00:00', PRIMARY KEY (rn, type) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS sbmCOLLECTION ( id int(11) NOT NULL auto_increment, name varchar(100) NOT NULL default '', PRIMARY KEY (id) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS sbmCOLLECTION_sbmCOLLECTION ( id_father int(11) NOT NULL default '0', id_son int(11) NOT NULL default '0', catalogue_order int(11) NOT NULL default '0' ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS sbmCOLLECTION_sbmDOCTYPE ( id_father int(11) NOT NULL default '0', id_son char(10) NOT NULL default '0', catalogue_order int(11) NOT NULL default '0' ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS sbmCATEGORIES ( doctype varchar(10) NOT NULL default '', sname varchar(75) NOT NULL default '', lname varchar(75) NOT NULL default '', score tinyint unsigned NOT NULL default 0, PRIMARY KEY (doctype, sname), KEY 
doctype (doctype), KEY sname (sname) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS sbmCHECKS ( chname varchar(15) NOT NULL default '', chdesc text, cd date default NULL, md date default NULL, chefi1 text, chefi2 text, PRIMARY KEY (chname) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS sbmDOCTYPE ( ldocname text, sdocname varchar(10) default NULL, cd date default NULL, md date default NULL, description text ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS sbmFIELD ( subname varchar(13) default NULL, pagenb int(11) default NULL, fieldnb int(11) default NULL, fidesc varchar(15) default NULL, fitext text, level char(1) default NULL, sdesc text, checkn text, cd date default NULL, md date default NULL, fiefi1 text, fiefi2 text ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS sbmFIELDDESC ( name varchar(15) NOT NULL default '', alephcode varchar(50) default NULL, marccode varchar(50) NOT NULL default '', type char(1) default NULL, size int(11) default NULL, rows int(11) default NULL, cols int(11) default NULL, maxlength int(11) default NULL, val text, fidesc text, cd date default NULL, md date default NULL, modifytext text, fddfi2 text, cookie int(11) default '0', PRIMARY KEY (name) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS sbmFORMATEXTENSION ( FILE_FORMAT text NOT NULL, FILE_EXTENSION text NOT NULL ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS sbmFUNCTIONS ( action varchar(10) NOT NULL default '', doctype varchar(10) NOT NULL default '', function varchar(40) NOT NULL default '', score int(11) NOT NULL default '0', step tinyint(4) NOT NULL default '1' ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS sbmFUNDESC ( function varchar(40) NOT NULL default '', param varchar(40) default NULL ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS sbmGFILERESULT ( FORMAT text NOT NULL, RESULT text NOT NULL ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS sbmIMPLEMENT ( docname varchar(10) default NULL, actname char(3) default NULL, displayed char(1) default NULL, subname varchar(13) default NULL, nbpg int(11) default NULL, cd date default NULL, md date default NULL, buttonorder int(11) default NULL, statustext text, level char(1) NOT NULL default '', score int(11) NOT NULL default '0', stpage int(11) NOT NULL default '0', endtxt varchar(100) NOT NULL default '' ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS sbmPARAMETERS ( doctype varchar(10) NOT NULL default '', name varchar(40) NOT NULL default '', value text NOT NULL default '', PRIMARY KEY (doctype,name) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS sbmPUBLICATION ( doctype varchar(10) NOT NULL default '', categ varchar(50) NOT NULL default '', rn varchar(50) NOT NULL default '', status varchar(10) NOT NULL default '', dFirstReq datetime NOT NULL default '0000-00-00 00:00:00', dLastReq datetime NOT NULL default '0000-00-00 00:00:00', dAction datetime NOT NULL default '0000-00-00 00:00:00', accessref varchar(20) NOT NULL default '', accessedi varchar(20) NOT NULL default '', access varchar(20) NOT NULL default '', referees varchar(50) NOT NULL default '', authoremail varchar(50) NOT NULL default '', dRefSelection datetime NOT NULL default '0000-00-00 00:00:00', dRefRec datetime NOT NULL default '0000-00-00 00:00:00', dEdiRec datetime NOT NULL default '0000-00-00 00:00:00', accessspo varchar(20) NOT NULL default '', journal varchar(100) default NULL, PRIMARY KEY (doctype,categ,rn) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS sbmPUBLICATIONCOMM ( id int(11) NOT NULL auto_increment, id_parent int(11) default '0', rn varchar(100) NOT NULL default '', firstname varchar(100) default NULL, secondname 
varchar(100) default NULL, email varchar(100) default NULL, date varchar(40) NOT NULL default '', synopsis varchar(255) NOT NULL default '', commentfulltext text, PRIMARY KEY (id) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS sbmPUBLICATIONDATA ( doctype varchar(10) NOT NULL default '', editoboard varchar(250) NOT NULL default '', base varchar(10) NOT NULL default '', logicalbase varchar(10) NOT NULL default '', spokesperson varchar(50) NOT NULL default '', PRIMARY KEY (doctype) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS sbmREFEREES ( doctype varchar(10) NOT NULL default '', categ varchar(10) NOT NULL default '', name varchar(50) NOT NULL default '', address varchar(50) NOT NULL default '', rid int(11) NOT NULL auto_increment, PRIMARY KEY (rid) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS sbmSUBMISSIONS ( email varchar(50) NOT NULL default '', doctype varchar(10) NOT NULL default '', action varchar(10) NOT NULL default '', status varchar(10) NOT NULL default '', id varchar(30) NOT NULL default '', reference varchar(40) NOT NULL default '', cd datetime NOT NULL default '0000-00-00 00:00:00', md datetime NOT NULL default '0000-00-00 00:00:00' ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS sbmCOOKIES ( id int(15) unsigned NOT NULL auto_increment, name varchar(100) NOT NULL, value text, uid int(15) NOT NULL, PRIMARY KEY (id) ) ENGINE=MyISAM; -- Scheduler tables CREATE TABLE IF NOT EXISTS schTASK ( id int(15) unsigned NOT NULL auto_increment, proc varchar(255) NOT NULL, host varchar(255) NOT NULL default '', user varchar(50) NOT NULL, runtime datetime NOT NULL, sleeptime varchar(20), arguments mediumblob, status varchar(50), progress varchar(255), priority tinyint(4) NOT NULL default 0, PRIMARY KEY (id), KEY status (status), KEY runtime (runtime), KEY priority (priority) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS hstTASK ( id int(15) unsigned NOT NULL, proc varchar(20) NOT NULL, host varchar(255) NOT NULL default '', user varchar(50) NOT NULL, runtime datetime NOT NULL, sleeptime varchar(20), arguments mediumblob, status varchar(50), progress varchar(255), priority tinyint(4) NOT NULL default 0, PRIMARY KEY (id), KEY status (status), KEY runtime (runtime), KEY priority (priority) ) ENGINE=MyISAM; -- Batch Upload History CREATE TABLE IF NOT EXISTS hstBATCHUPLOAD ( id int(15) unsigned NOT NULL auto_increment, user varchar(50) NOT NULL, submitdate datetime NOT NULL, filename varchar(255) NOT NULL, execdate datetime NOT NULL, id_schTASK int(15) unsigned NOT NULL, batch_mode varchar(15) NOT NULL, PRIMARY KEY (id), KEY user (user) ) ENGINE=MyISAM; -- External collections CREATE TABLE IF NOT EXISTS collection_externalcollection ( id_collection mediumint(9) unsigned NOT NULL default '0', id_externalcollection mediumint(9) unsigned NOT NULL default '0', type tinyint(4) unsigned NOT NULL default '0', PRIMARY KEY (id_collection, id_externalcollection) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS externalcollection ( id mediumint(9) unsigned NOT NULL auto_increment, name varchar(255) NOT NULL default '', PRIMARY KEY (id), UNIQUE KEY name (name) ) ENGINE=MyISAM; -- WebStat tables: CREATE TABLE IF NOT EXISTS staEVENT ( id varchar(255) NOT NULL, number smallint(2) unsigned ZEROFILL NOT NULL auto_increment, name varchar(255), creation_time TIMESTAMP DEFAULT NOW(), cols varchar(255), PRIMARY KEY (id), UNIQUE KEY number (number) ) ENGINE=MyISAM; -- BibClassify tables: CREATE TABLE IF NOT EXISTS clsMETHOD ( id mediumint(9) unsigned NOT NULL, name varchar(50) NOT NULL default '', location varchar(255) NOT NULL default 
'', description varchar(255) NOT NULL default '', last_updated datetime NOT NULL default '0000-00-00 00:00:00', PRIMARY KEY (id), UNIQUE KEY name (name) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS collection_clsMETHOD ( id_collection mediumint(9) unsigned NOT NULL, id_clsMETHOD mediumint(9) unsigned NOT NULL, PRIMARY KEY (id_collection, id_clsMETHOD) ) ENGINE=MyISAM; -- WebJournal tables: CREATE TABLE IF NOT EXISTS jrnJOURNAL ( id mediumint(9) unsigned NOT NULL auto_increment, name varchar(50) NOT NULL default '', PRIMARY KEY (id), UNIQUE KEY name (name) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS jrnISSUE ( id_jrnJOURNAL mediumint(9) unsigned NOT NULL, issue_number varchar(50) NOT NULL default '', issue_display varchar(50) NOT NULL default '', date_released datetime NOT NULL default '0000-00-00 00:00:00', date_announced datetime NOT NULL default '0000-00-00 00:00:00', PRIMARY KEY (id_jrnJOURNAL,issue_number) ) ENGINE=MyISAM; -- tables recording history of record's metadata and fulltext documents: CREATE TABLE IF NOT EXISTS hstRECORD ( id_bibrec mediumint(8) unsigned NOT NULL, marcxml blob NOT NULL, job_id mediumint(15) unsigned NOT NULL, job_name varchar(255) NOT NULL, job_person varchar(255) NOT NULL, job_date datetime NOT NULL, job_details blob NOT NULL, KEY (id_bibrec), KEY (job_id), KEY (job_name), KEY (job_person), KEY (job_date) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS hstDOCUMENT ( id_bibdoc mediumint(9) unsigned NOT NULL, docname varchar(250) NOT NULL, docformat varchar(50) NOT NULL, docversion tinyint(4) unsigned NOT NULL, docsize bigint(15) unsigned NOT NULL, docchecksum char(32) NOT NULL, doctimestamp datetime NOT NULL, action varchar(50) NOT NULL, job_id mediumint(15) unsigned NULL default NULL, job_name varchar(255) NULL default NULL, job_person varchar(255) NULL default NULL, job_date datetime NULL default NULL, job_details blob NULL default NULL, KEY (action), KEY (id_bibdoc), KEY (docname), KEY (docformat), KEY (doctimestamp), KEY (job_id), KEY (job_name), KEY (job_person), KEY (job_date) ) ENGINE=MyISAM; -- BibCirculation tables: CREATE TABLE IF NOT EXISTS crcBORROWER ( id int(15) unsigned NOT NULL auto_increment, name varchar(255) NOT NULL default '', email varchar(255) NOT NULL default '', phone varchar(60) default NULL, address varchar(60) default NULL, mailbox varchar(30) default NULL, borrower_since datetime NOT NULL default '0000-00-00 00:00:00', borrower_until datetime NOT NULL default '0000-00-00 00:00:00', notes text, PRIMARY KEY (id) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS crcILLREQUEST ( id int(15) unsigned NOT NULL auto_increment, id_crcBORROWER int(15) unsigned NOT NULL default '0', barcode varchar(30) NOT NULL default '', period_of_interest_from datetime NOT NULL default '0000-00-00 00:00:00', period_of_interest_to datetime NOT NULL default '0000-00-00 00:00:00', id_crcLIBRARY int(15) unsigned NOT NULL default '0', request_date datetime NOT NULL default '0000-00-00 00:00:00', expected_date datetime NOT NULL default '0000-00-00 00:00:00', arrival_date datetime NOT NULL default '0000-00-00 00:00:00', due_date datetime NOT NULL default '0000-00-00 00:00:00', return_date datetime NOT NULL default '0000-00-00 00:00:00', status varchar(20) NOT NULL default '', cost varchar(30) NOT NULL default '', item_info text, request_type text, borrower_comments text, only_this_edition varchar(10) NOT NULL default '', library_notes text, PRIMARY KEY (id), KEY id_crcborrower (id_crcBORROWER), KEY id_crclibrary (id_crcLIBRARY) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS 
crcITEM ( barcode varchar(30) NOT NULL default '', id_bibrec int(15) unsigned NOT NULL default '0', id_crcLIBRARY int(15) unsigned NOT NULL default '0', collection varchar(60) default NULL, location varchar(60) default NULL, description varchar(60) default NULL, loan_period varchar(30) NOT NULL default '', status varchar(20) NOT NULL default '', creation_date datetime NOT NULL default '0000-00-00 00:00:00', modification_date datetime NOT NULL default '0000-00-00 00:00:00', number_of_requests int(3) unsigned NOT NULL default '0', PRIMARY KEY (barcode), KEY id_bibrec (id_bibrec), KEY id_crclibrary (id_crcLIBRARY) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS crcLIBRARY ( id int(15) unsigned NOT NULL auto_increment, name varchar(80) NOT NULL default '', address varchar(255) NOT NULL default '', email varchar(255) NOT NULL default '', phone varchar(30) NOT NULL default '', type varchar(30) default NULL, notes text, PRIMARY KEY (id) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS crcLOAN ( id int(15) unsigned NOT NULL auto_increment, id_crcBORROWER int(15) unsigned NOT NULL default '0', id_bibrec int(15) unsigned NOT NULL default '0', barcode varchar(30) NOT NULL default '', loaned_on datetime NOT NULL default '0000-00-00 00:00:00', returned_on date NOT NULL default '0000-00-00', due_date datetime NOT NULL default '0000-00-00 00:00:00', number_of_renewals int(3) unsigned NOT NULL default '0', overdue_letter_number int(3) unsigned NOT NULL default '0', overdue_letter_date datetime NOT NULL default '0000-00-00 00:00:00', status varchar(20) NOT NULL default '', type varchar(20) NOT NULL default '', notes text, PRIMARY KEY (id), KEY id_crcborrower (id_crcBORROWER), KEY id_bibrec (id_bibrec), KEY barcode (barcode) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS crcLOANREQUEST ( id int(15) unsigned NOT NULL auto_increment, id_crcBORROWER int(15) unsigned NOT NULL default '0', id_bibrec int(15) unsigned NOT NULL default '0', barcode varchar(30) NOT NULL default '', period_of_interest_from datetime NOT NULL default '0000-00-00 00:00:00', period_of_interest_to datetime NOT NULL default '0000-00-00 00:00:00', status varchar(20) NOT NULL default '', notes text, request_date datetime NOT NULL default '0000-00-00 00:00:00', PRIMARY KEY (id), KEY id_crcborrower (id_crcBORROWER), KEY id_bibrec (id_bibrec), KEY barcode (barcode) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS crcPURCHASE ( id int(15) unsigned NOT NULL auto_increment, id_bibrec int(15) unsigned NOT NULL default '0', id_crcVENDOR int(15) unsigned NOT NULL default '0', ordered_date datetime NOT NULL default '0000-00-00 00:00:00', expected_date datetime NOT NULL default '0000-00-00 00:00:00', price varchar(20) NOT NULL default '0', status varchar(20) NOT NULL default '', notes text, PRIMARY KEY (id), KEY id_bibrec (id_bibrec), KEY id_crcVENDOR (id_crcVENDOR) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS crcVENDOR ( id int(15) unsigned NOT NULL auto_increment, name varchar(80) NOT NULL default '', address varchar(255) NOT NULL default '', email varchar(255) NOT NULL default '', phone varchar(30) NOT NULL default '', notes text, PRIMARY KEY (id) ) ENGINE=MyISAM; -- BibExport tables: CREATE TABLE IF NOT EXISTS expJOB ( id int(15) unsigned NOT NULL auto_increment, jobname varchar(50) NOT NULL default '', jobfreq mediumint(12) NOT NULL default '0', output_format mediumint(12) NOT NULL default '0', deleted mediumint(12) NOT NULL default '0', lastrun datetime NOT NULL default '0000-00-00 00:00:00', output_directory text, PRIMARY KEY (id), UNIQUE KEY jobname (jobname) ) 
ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS expQUERY ( id int(15) unsigned NOT NULL auto_increment, name varchar(255) NOT NULL, search_criteria text NOT NULL, output_fields text NOT NULL, notes text, deleted mediumint(12) NOT NULL default '0', PRIMARY KEY (id) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS expJOB_expQUERY ( id_expJOB int(15) NOT NULL, id_expQUERY int(15) NOT NULL, PRIMARY KEY (id_expJOB,id_expQUERY), KEY id_expJOB (id_expJOB), KEY id_expQUERY (id_expQUERY) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS expQUERYRESULT ( id int(15) unsigned NOT NULL auto_increment, id_expQUERY int(15) NOT NULL, result text NOT NULL, status mediumint(12) NOT NULL default '0', status_message text NOT NULL, PRIMARY KEY (id) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS expJOBRESULT ( id int(15) unsigned NOT NULL auto_increment, id_expJOB int(15) NOT NULL, execution_time datetime NOT NULL default '0000-00-00 00:00:00', status mediumint(12) NOT NULL default '0', status_message text NOT NULL, PRIMARY KEY (id) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS expJOBRESULT_expQUERYRESULT ( id_expJOBRESULT int(15) NOT NULL, id_expQUERYRESULT int(15) NOT NULL, PRIMARY KEY (id_expJOBRESULT, id_expQUERYRESULT), KEY id_expJOBRESULT (id_expJOBRESULT), KEY id_expQUERYRESULT (id_expQUERYRESULT) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS user_expJOB ( id_user int(15) NOT NULL, id_expJOB int(15) NOT NULL, PRIMARY KEY (id_user, id_expJOB), KEY id_user (id_user), KEY id_expJOB (id_expJOB) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS swrREMOTESERVER ( id int(15) unsigned NOT NULL auto_increment, name varchar(50) unique NOT NULL, host varchar(50) NOT NULL, username varchar(50) NOT NULL, password varchar(50) NOT NULL, email varchar(50) NOT NULL, realm varchar(50) NOT NULL, url_base_record varchar(50) NOT NULL, url_servicedocument varchar(80) NOT NULL, xml_servicedocument longblob, last_update int(15) unsigned NOT NULL, PRIMARY KEY (id) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS swrCLIENTDATA ( id int(15) unsigned NOT NULL auto_increment, id_swrREMOTESERVER int(15) NOT NULL, id_record int(15) NOT NULL, report_no varchar(50) NOT NULL, id_remote varchar(50) NOT NULL, id_user int(15) NOT NULL, user_name varchar(100) NOT NULL, user_email varchar(100) NOT NULL, xml_media_deposit longblob NOT NULL, xml_metadata_submit longblob NOT NULL, submission_date datetime NOT NULL default '0000-00-00 00:00:00', publication_date datetime NOT NULL default '0000-00-00 00:00:00', removal_date datetime NOT NULL default '0000-00-00 00:00:00', link_medias varchar(150) NOT NULL, link_metadata varchar(150) NOT NULL, link_status varchar(150) NOT NULL, status varchar(150) NOT NULL default 'submitted', last_update datetime NOT NULL, PRIMARY KEY (id) ) ENGINE=MyISAM; -- tables for exception management -- This table is used to log exceptions -- to discover the full details of an exception either check the email -- that are sent to CFG_SITE_ADMIN_EMAIL or look into invenio.err CREATE TABLE IF NOT EXISTS hstEXCEPTION ( id int(15) unsigned NOT NULL auto_increment, name varchar(50) NOT NULL, -- name of the exception filename varchar(255) NULL, -- file where the exception was raised line int(9) NULL, -- line at which the exception was raised last_seen datetime NOT NULL default '0000-00-00 00:00:00', -- last time this exception has been seen last_notified datetime NOT NULL default '0000-00-00 00:00:00', -- last time this exception has been notified counter int(15) NOT NULL default 0, -- internal counter to decide when to notify this exception total int(15) NOT NULL 
default 0, -- total number of times this exception has been seen PRIMARY KEY (id), KEY (last_seen), KEY (last_notified), KEY (total), UNIQUE KEY (name(50), filename(255), line) ) ENGINE=MyISAM; -- tables for BibAuthorID module: CREATE TABLE IF NOT EXISTS `aidPERSONID` ( `id` bigint(15) NOT NULL AUTO_INCREMENT, `personid` bigint(15) NOT NULL, `tag` varchar(50) NOT NULL, `data` varchar(250) NOT NULL, `flag` int NOT NULL DEFAULT '0', `lcul` int NOT NULL DEFAULT '0', PRIMARY KEY (`id`), INDEX `personid-b` (`personid`), INDEX `tag-b` (`tag`), INDEX `data-b` (`data`), INDEX `flag-b` (`flag`), INDEX `tdf-b` (`tag`,`data`,`flag`), INDEX `ptf-b` (`personid`,`tag`,`flag`) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS `aidUSERINPUTLOG` ( `id` bigint(15) NOT NULL AUTO_INCREMENT, `transactionid` bigint(15) NOT NULL, `timestamp` datetime NOT NULL, `userinfo` varchar(255) NOT NULL, `personid` bigint(15) NOT NULL, `action` varchar(50) NOT NULL, `tag` varchar(50) NOT NULL, `value` varchar(200) NOT NULL, `comment` text, PRIMARY KEY (`id`), INDEX `transactionid-b` (`transactionid`), INDEX `timestamp-b` (`timestamp`), INDEX `userinfo-b` (`userinfo`), INDEX `personid-b` (`personid`), INDEX `action-b` (`action`), INDEX `tag-b` (`tag`), INDEX `value-b` (`value`) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS `aidAUTHORNAMES` ( `id` bigint(15) NOT NULL auto_increment, `Name` varchar(255) NOT NULL, `bibrefs` varchar(200) NOT NULL, `db_name` varchar(255), PRIMARY KEY (`id`), INDEX `Name-b` (`Name`), INDEX `db_Name-b` (`db_name`), INDEX `bibrefs-b` (`bibrefs`) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS `aidAUTHORNAMESBIBREFS` ( `id` bigint(15) NOT NULL auto_increment, `Name_id` bigint(15) NOT NULL, `bibref` varchar(200) NOT NULL, PRIMARY KEY (`id`), INDEX `Name_id-b` (`Name_id`), INDEX `bibref-b` (`bibref`) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS `aidDOCLIST` ( `id` bigint(15) NOT NULL auto_increment, `bibrecID` bigint(15) NOT NULL, `processed_author` bigint(15) default NULL, PRIMARY KEY (`id`), INDEX `bibrecID-b` (`bibrecID`), INDEX `processed_author-b` (`processed_author`) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS `aidREALAUTHORS` ( `id` bigint(15) NOT NULL auto_increment, `realauthorID` bigint(15) NOT NULL, `virtualauthorID` bigint(15) NOT NULL, `p` float NOT NULL, PRIMARY KEY (`id`), INDEX `realauthorID-b` (`realauthorID`), INDEX `virtualauthorID-b` (`virtualauthorID`) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS `aidREALAUTHORDATA` ( `id` bigint(15) NOT NULL auto_increment, `realauthorID` bigint(15) NOT NULL, `tag` varchar(50) NOT NULL, `value` varchar(255) NOT NULL, `va_count` int(8) NOT NULL default '0', `va_names_p` double NOT NULL default '0' COMMENT 'Summed VA-Names probability', `va_p` double NOT NULL default '0' COMMENT 'Summed VA probabilities', PRIMARY KEY (`id`), INDEX `realauthorID-b` (`realauthorID`,`tag`), INDEX `value-b` (`value`), INDEX `tag-b` (`tag`) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS `aidVIRTUALAUTHORS` ( `id` bigint(15) NOT NULL auto_increment, `virtualauthorID` bigint(15) NOT NULL, `authornamesID` bigint(15) NOT NULL, `p` float NOT NULL, `clusterID` bigint(15) NOT NULL default '0', PRIMARY KEY (`id`), INDEX `authornamesID-b` (`authornamesID`), INDEX `clusterID-b` (`clusterID`), INDEX `virtualauthorID-b` (`virtualauthorID`) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS `aidVIRTUALAUTHORSDATA` ( `id` bigint(15) NOT NULL auto_increment, `virtualauthorID` bigint(15) NOT NULL, `tag` varchar(255) NOT NULL, `value` varchar(255) NOT NULL, PRIMARY KEY (`id`), INDEX `virtualauthorID-b` 
(`virtualauthorID`), INDEX `tag-b` (`tag`), INDEX `value-b` (`value`) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS `aidVIRTUALAUTHORSCLUSTERS` ( `id` int(15) NOT NULL auto_increment, `cluster_name` varchar(60) NOT NULL, PRIMARY KEY (`id`) ) ENGINE=MyISAM; CREATE TABLE IF NOT EXISTS `aidCACHE` ( `id` int(15) NOT NULL auto_increment, `object_name` varchar(120) NOT NULL, `object_key` varchar(120) NOT NULL, `object_value` text, `last_updated` datetime NOT NULL, PRIMARY KEY (`id`), INDEX `name-b` (`object_name`), INDEX `key-b` (`object_key`), INDEX `last_updated-b` (`last_updated`) ) ENGINE=MyISAM; -- refextract tables: CREATE TABLE IF NOT EXISTS `xtrJOB` ( `id` tinyint(4) NOT NULL AUTO_INCREMENT, `name` varchar(30) NOT NULL, `last_updated` datetime NOT NULL, PRIMARY KEY (`id`) ) ENGINE=MyISAM; -- end of file diff --git a/modules/websearch/doc/hacking/search-engine-api.webdoc b/modules/websearch/doc/hacking/search-engine-api.webdoc index a44c7217c..49153d237 100644 --- a/modules/websearch/doc/hacking/search-engine-api.webdoc +++ b/modules/websearch/doc/hacking/search-engine-api.webdoc @@ -1,367 +1,367 @@ ## -*- mode: html; coding: utf-8; -*- ## This file is part of Invenio. ## Copyright (C) 2007, 2008, 2009, 2010, 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
     Invenio Search Engine can be called from within your Python programs
     via a high-level, mid-level, or low-level API.
     
     1. High-level API
     
        Description:
     
           The high-level access to the search engine is provided by
           exactly the same function as called from the web interface when
           users submit their queries.  This should guarantee exactly the
           same behaviour, and means that you can pass to the high-level
           API all the arguments as you see them in the URL.
     
           There are two things to note: (i) the function does not check
           for a possibly restricted status of the collection, so
           restricted collections will be searched without asking for a
           password; (ii) the output format argument (``of'') should be set
           to ``id'' (which is the default value), meaning that the
           function returns the list of recIDs found.
     
        Signature:
     
            def perform_request_search(req=None, cc=CFG_SITE_NAME, c=None, p="", f="", rg=10, sf="", so="d", sp="", rm="", of="id", ot="", aas=0,
                                       p1="", f1="", m1="", op1="", p2="", f2="", m2="", op2="", p3="", f3="", m3="", sc=0, jrec=0,
                                       recid=-1, recidb=-1, sysno="", id=-1, idb=-1, sysnb="", action="",
                                       d1y=0, d1m=0, d1d=0, d2y=0, d2m=0, d2d=0, dt="", verbose=0, ap=0, ln=CFG_SITE_LANG, ec=None):
               """Perform search or browse request, without checking for
                  authentication.  Return list of recIDs found, if of=id.
                  Otherwise create web page.
     
                  The arguments are as follows:
     
                    req - mod_python Request class instance.
     
                     cc - current collection (e.g. "ATLAS").  The collection the
                          user started to search/browse from.
     
                       c - collection list (e.g. ["Theses", "Books"]).  The
                           collections the user may have selected/deselected
                           when starting to search from 'cc'.
     
                     ec - external collection list (e.g. ['CiteSeer', 'Google']). The
                          external collections may have been selected/deselected by the
                          user.
     
                      p - pattern to search for (e.g. "ellis and muon or kaon").
     
                      f - field to search within (e.g. "author").
     
                      rg - records in groups of (e.g. "10").  Defines how many
                           hits per collection are displayed on the search
                           results page.
     
                     sf - sort field (e.g. "title").
     
                     so - sort order ("a"=ascending, "d"=descending).
     
                      sp - sort pattern (e.g. "CERN-") -- in case a sort field
                           contains multiple values, this argument tells which
                           one to prefer.
     
                     rm - ranking method (e.g. "jif").  Defines whether results
                          should be ranked by some known ranking method.
     
                      of - output format (e.g. "hb").  Usually a leading "h" means
                           HTML output ("hb" for HTML brief, "hd" for HTML
                           detailed), "x" means XML output, "t" means plain text
                           output, and "id" means no output at all, just returning
                           the list of recIDs found.  (Suitable for high-level API.)
     
                      ot - output only these MARC tags (e.g. "100,700,909C0b").
                           Useful if only some fields are to be shown in the
                           output, e.g. for a library to control certain fields.
     
                    aas - advanced search ("0" means no, "1" means yes).  Whether
                          search was called from within the advanced search
                          interface.
     
                     p1 - first pattern to search for in the advanced search
                          interface.  Much like 'p'.
     
                     f1 - first field to search within in the advanced search
                          interface.  Much like 'f'.
     
                     m1 - first matching type in the advanced search interface.
                          ("a" all of the words, "o" any of the words, "e" exact
                          phrase, "p" partial phrase, "r" regular expression).
     
                    op1 - first operator, to join the first and the second unit
                          in the advanced search interface.  ("a" add, "o" or,
                          "n" not).
     
                     p2 - second pattern to search for in the advanced search
                          interface.  Much like 'p'.
     
                     f2 - second field to search within in the advanced search
                          interface.  Much like 'f'.
     
                     m2 - second matching type in the advanced search interface.
                          ("a" all of the words, "o" any of the words, "e" exact
                          phrase, "p" partial phrase, "r" regular expression).
     
                    op2 - second operator, to join the second and the third unit
                          in the advanced search interface.  ("a" add, "o" or,
                          "n" not).
     
                     p3 - third pattern to search for in the advanced search
                          interface.  Much like 'p'.
     
                     f3 - third field to search within in the advanced search
                          interface.  Much like 'f'.
     
                     m3 - third matching type in the advanced search interface.
                          ("a" all of the words, "o" any of the words, "e" exact
                          phrase, "p" partial phrase, "r" regular expression).
     
                      sc - split by collection ("0" no, "1" yes).  Governs whether
                           we want to present the results in a single huge list,
                           or split by collection.
     
                   jrec - jump to record (e.g. "234").  Used for navigation
                          inside the search results.
     
                  recid - display record ID (e.g. "20000").  Do not
                          search/browse but go straight away to the Detailed
                          record page for the given recID.
     
                 recidb - display record ID bis (e.g. "20010").  If greater than
                          'recid', then display records from recid to recidb.
                          Useful for example for dumping records from the
                          database for reformatting.
     
                   sysno - display old system SYS number (e.g. "").  If you
                           migrated to Invenio from another system and stored
                           your old SYS call numbers, you can use them instead
                           of recid if you wish.
     
                     id - the same as recid, in case recid is not set.  For
                          backwards compatibility.
     
                    idb - the same as recid, in case recidb is not set.  For
                          backwards compatibility.
     
                  sysnb - the same as sysno, in case sysno is not set.  For
                          backwards compatibility.
     
                 action - action to do.  "SEARCH" for searching, "Browse" for
                          browsing.  Default is to search.
     
                      d1 - first datetime in full YYYY-mm-dd HH:MM:SS format
                          (e.g. "1998-08-23 12:34:56"). Useful for search limits
                          on creation/modification date (see 'dt' argument
                          below).  Note that 'd1' takes precedence over d1y, d1m,
                          d1d if these are defined.
     
                    d1y - first date's year (e.g. "1998").  Useful for search
                          limits on creation/modification date.
     
                    d1m - first date's month (e.g. "08").  Useful for search
                          limits on creation/modification date.
     
                    d1d - first date's day (e.g. "23").  Useful for search
                          limits on creation/modification date.
     
                      d2 - second datetime in full YYYY-mm-dd HH:MM:SS format
                          (e.g. "1998-09-02 12:34:56"). Useful for search limits
                          on creation/modification date (see 'dt' argument
                          below).  Note that 'd2' takes precedence over d2y, d2m,
                          d2d if these are defined.
     
                    d2y - second date's year (e.g. "1998").  Useful for search
                          limits on creation/modification date.
     
                    d2m - second date's month (e.g. "09").  Useful for search
                          limits on creation/modification date.
     
                    d2d - second date's day (e.g. "02").  Useful for search
                          limits on creation/modification date.
     
                     dt - first and second date's type (e.g. "c").  Specifies
                          whether to search in creation dates ("c") or in
                          modification dates ("m").  When dt is not set and d1*
                          and d2* are set, the default is "c".
     
                verbose - verbose level (0=min, 9=max).  Useful to print some
                          internal information on the searching process in case
                          something goes wrong.
     
                      ap - alternative patterns (0=no, 1=yes).  In case no exact
                           match is found, the search engine can try alternative
                           patterns, e.g. replacing non-alphanumeric characters
                           by spaces and launching a boolean query.  'ap' defines
                           whether this is wanted.
     
                     ln - language of the search interface (e.g. "en").  Useful
                          for internationalization.
     
                     ec - list of external search engines to search as well
                          (e.g. "SPIRES HEP").
               """
     
        Examples: (retrieving record IDs)
     
           >>> # import the function:
           >>> from invenio.search_engine import perform_request_search
           >>> # get all hits in a collection:
           >>> perform_request_search(cc="ATLAS Communications")
           >>> # search for the word `of' in Theses and Books:
           >>> perform_request_search(p="of", c=["Theses","Books"])
           >>> # search for `muon or kaon' within title:
           >>> perform_request_search(p="muon or kaon", f="title")
            >>> # phrase search (note the quotes):
           >>> perform_request_search(p='"Ellis, J"', f="author")
           >>> # regexp search for a system number
           >>> perform_request_search(p1="^CERN.*2003-001$", f1="reportnumber", m1="r")
           >>> # moi inside Standards gives no hits...
           >>> perform_request_search(p="moi", cc="Standards")
           >>> # but it does if we use alternative patterns:
           >>> perform_request_search(p="moi", cc="Standards", ap=1)
     
        Example: (retrieving MARCXML)
     
           >>> import cStringIO
           >>> tmp = cStringIO.StringIO()
           >>> perform_request_search(req=tmp, p='ellis', of='xm')
           >>> out = tmp.getvalue()
           >>> tmp.close()
           >>> # `out' now contains MARCXML of 12 records found
     
        Example: (retrieving Text MARC, certain tags only)
     
           >>> import cStringIO
           >>> tmp = cStringIO.StringIO()
           >>> perform_request_search(req=tmp, p='higgs', of='tm', ot=['100', '700'])
           >>> out = tmp.getvalue()
           >>> tmp.close()
           >>> print out
           000000085 100__ $$aGirardello, L$$uINFN$$uUniversita di Milano-Bicocca
           000000085 700__ $$aPorrati, Massimo
           000000085 700__ $$aZaffaroni, A
           000000001 100__ $$aPhotolab
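
      Note that in the two examples above the output is captured by passing
      a file-like object as the 'req' argument.  Presumably any object with
      a write() method would do; a sketch writing the same Text MARC output
      straight to a file (not verified against every output format):

      >>> out_file = open('/tmp/higgs.marc', 'w')
      >>> perform_request_search(req=out_file, p='higgs', of='tm', ot=['100', '700'])
      >>> out_file.close()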
     
     2. Mid-level API
     
        Description:
     
           The mid-level API is provided by a search_pattern() function
           that only searches for the given pattern in the given field
           according to the given matching type.  This function does not
           know anything about collections.  The function does not wash its
           arguments; it expects them to be `clean' already.  The pattern
           is split into `basic search units' for which a boolean query is
    -      launched.  The function returns an instance of the HitSet class.
    +      launched.  The function returns an instance of the intbitset class.
           Note that if you want to obtain the list of recIDs (as with the
           high-level API), you can invoke the ``tolist()'' method on a
           hitset.
     
        Signature:
     
           def search_pattern(req=None, p=None, f=None, m=None, ap=0, of="id", verbose=0):
               """Search for complex pattern 'p' within field 'f' according to
                  matching type 'm'.  Return hitset of recIDs.
     
                   The function uses a multi-stage searching algorithm in case
                   no exact match is found.  See the Search Internals document
                   for a detailed description.
     
                   The 'ap' argument governs whether alternative patterns are to
                   be used in case there is no direct hit for (p,f,m).  For
                   example, whether to replace non-alphanumeric characters by
                   spaces if it would give some hits.  See the Search Internals
                   document for a detailed description.  (ap=0 forbids the
                   alternative pattern usage, ap=1 permits it.)
     
                   The 'of' argument governs whether or not to print some
                   information to the user in case no match is found.  (Usually
                   it prints the information for HTML formats; otherwise it is
                   silent.)
     
                  The 'verbose' argument controls the level of debugging information
                  to be printed (0=least, 9=most).
     
                  All the parameters are assumed to have been previously washed.
     
                  This function is suitable as a mid-level API.
               """
     
        Examples:
     
           >>> # import the function:
           >>> from invenio.search_engine import search_pattern
           >>> # search for muon or kaon in any field:
           >>> search_pattern(p="muon or kaon").tolist()
           >>> # the following finds nothing by default...
           >>> search_pattern(p="cern-moi").tolist()
           >>> # ...but it does find something if we allow alternative patterns:
           >>> search_pattern(p="cern-moi", ap=1).tolist()
           >>> # wildcard search for a report number:
           >>> search_pattern(p="CERN-LHC-PROJECT-REPORT-40*", f="reportnumber").tolist()
           >>> # regexp search for a report number with possible trailing subjects:
           >>> search_pattern(p="^CERN-LHC-PROJECT-REPORT-40(-|$)", f="reportnumber", m="r").tolist()
     
     3. Low-level API
     
        Description:
     
            The low-level API is provided by the search_unit() function,
            which assumes its arguments to be basic search units already.
            Therefore it does not know anything about boolean queries, etc.
    -      The function returns an instance of the HitSet class.  Note that
    +      The function returns an instance of the intbitset class.  Note that
           if you want to obtain the list of recIDs (as with the high-level
           API), you can invoke the ``tolist()'' method on a hitset.
     
        Signature:
     
           def search_unit(p, f=None, m=None):
               """Search for basic search unit defined by pattern 'p' and field
                  'f' and matching type 'm'.  Return hitset of recIDs.
     
                  All the parameters are assumed to have been previously washed.
                  'p' is assumed to be already a ``basic search unit'' so that it
                   is searched as such and is not broken up in any way.  Only
                   wildcard and span queries are detected inside 'p'.
     
                  This function is suitable as a low-level API.
               """
     
        Examples:
     
           >>> # import the function:
           >>> from invenio.search_engine import search_unit
           >>> # search moi in any field:
           >>> search_unit(p="moi").tolist()
           >>> # this one will not match:
           >>> search_unit(p="muon or kaon").tolist()
           >>> # regexp search for a report number with possible trailing subjects:
           >>> search_unit(p="^CERN-PS-99-037(-|$)", f="reportnumber", m="r").tolist()
     
     More entry points may be created, but these three levels of access to
     the search engine should cover all your needs.
     
    diff --git a/modules/websearch/lib/search_engine.py b/modules/websearch/lib/search_engine.py index a34d905c6..b9e2bf740 100644 --- a/modules/websearch/lib/search_engine.py +++ b/modules/websearch/lib/search_engine.py @@ -1,5492 +1,5502 @@ # -*- coding: utf-8 -*- ## This file is part of Invenio. ## Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. # pylint: disable=C0301 """Invenio Search Engine in mod_python.""" __lastupdated__ = """$Date$""" __revision__ = "$Id$" ## import general modules: import cgi import cStringIO import copy import string import os import re import time import urllib import urlparse import zlib import sys if sys.hexversion < 0x2040000: # pylint: disable=W0622 from sets import Set as set # pylint: enable=W0622 ## import Invenio stuff: from invenio.config import \ CFG_CERN_SITE, \ CFG_INSPIRE_SITE, \ CFG_OAI_ID_FIELD, \ CFG_WEBCOMMENT_ALLOW_REVIEWS, \ CFG_WEBSEARCH_CALL_BIBFORMAT, \ CFG_WEBSEARCH_CREATE_SIMILARLY_NAMED_AUTHORS_LINK_BOX, \ CFG_WEBSEARCH_FIELDS_CONVERT, \ CFG_WEBSEARCH_NB_RECORDS_TO_SORT, \ CFG_WEBSEARCH_SEARCH_CACHE_SIZE, \ CFG_WEBSEARCH_USE_MATHJAX_FOR_FORMATS, \ CFG_WEBSEARCH_USE_ALEPH_SYSNOS, \ CFG_WEBSEARCH_DEF_RECORDS_IN_GROUPS, \ CFG_WEBSEARCH_FULLTEXT_SNIPPETS, \ CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE, \ CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG, \ CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS, \ CFG_WEBSEARCH_WILDCARD_LIMIT, \ CFG_WEBSEARCH_SYNONYM_KBRS, \ CFG_SITE_LANG, \ CFG_SITE_NAME, \ CFG_LOGDIR, \ CFG_BIBFORMAT_HIDDEN_TAGS, \ CFG_SITE_URL, \ CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS, \ CFG_SOLR_URL, \ CFG_SITE_RECORD, \ CFG_WEBSEARCH_PREV_NEXT_HIT_LIMIT, \ CFG_WEBSEARCH_VIEWRESTRCOLL_POLICY from invenio.search_engine_config import InvenioWebSearchUnknownCollectionError, InvenioWebSearchWildcardLimitError from invenio.search_engine_utils import get_fieldvalues from invenio.bibrecord import create_record from invenio.bibrank_record_sorter import get_bibrank_methods, rank_records, is_method_valid from invenio.bibrank_downloads_similarity import register_page_view_event, calculate_reading_similarity_list from invenio.bibindex_engine_stemmer import stem from invenio.bibindex_engine_tokenizer import wash_author_name, author_name_requires_phrase_search from invenio.bibformat import format_record, format_records, get_output_format_content_type, create_excel from invenio.bibformat_config import CFG_BIBFORMAT_USE_OLD_BIBFORMAT from invenio.bibrank_downloads_grapher import create_download_history_graph_and_box from invenio.bibknowledge import get_kbr_values from invenio.data_cacher import DataCacher from invenio.websearch_external_collections import print_external_results_overview, perform_external_collection_search from invenio.access_control_admin import acc_get_action_id from invenio.access_control_config import VIEWRESTRCOLL, \ 
CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS from invenio.websearchadminlib import get_detailed_page_tabs, get_detailed_page_tabs_counts -from invenio.intbitset import intbitset as HitSet +from invenio.intbitset import intbitset from invenio.dbquery import DatabaseError, deserialize_via_marshal, InvenioDbQueryWildcardLimitError from invenio.access_control_engine import acc_authorize_action from invenio.errorlib import register_exception from invenio.textutils import encode_for_xml, wash_for_utf8 from invenio.htmlutils import get_mathjax_header from invenio.htmlutils import nmtoken_from_string import invenio.template webstyle_templates = invenio.template.load('webstyle') webcomment_templates = invenio.template.load('webcomment') from invenio.bibrank_citation_searcher import calculate_cited_by_list, \ calculate_co_cited_with_list, get_records_with_num_cites, get_self_cited_by, \ get_refersto_hitset, get_citedby_hitset from invenio.bibrank_citation_grapher import create_citation_history_graph_and_box from invenio.dbquery import run_sql, run_sql_with_limit, \ get_table_update_time, Error from invenio.webuser import getUid, collect_user_info, session_param_set from invenio.webpage import pageheaderonly, pagefooteronly, create_error_box from invenio.messages import gettext_set_language from invenio.search_engine_query_parser import SearchQueryParenthesisedParser, \ SpiresToInvenioSyntaxConverter from invenio import webinterface_handler_config as apache from invenio.solrutils import solr_get_bitset try: import invenio.template websearch_templates = invenio.template.load('websearch') except: pass from invenio.websearch_external_collections import calculate_hosted_collections_results, do_calculate_hosted_collections_results from invenio.websearch_external_collections_config import CFG_HOSTED_COLLECTION_TIMEOUT_ANTE_SEARCH from invenio.websearch_external_collections_config import CFG_HOSTED_COLLECTION_TIMEOUT_POST_SEARCH from invenio.websearch_external_collections_config import CFG_EXTERNAL_COLLECTION_MAXRESULTS VIEWRESTRCOLL_ID = acc_get_action_id(VIEWRESTRCOLL) ## global vars: cfg_nb_browse_seen_records = 100 # limit of the number of records to check when browsing certain collection cfg_nicely_ordered_collection_list = 0 # do we propose collection list nicely ordered or alphabetical? 
## precompile some often-used regexp for speed reasons: re_word = re.compile('[\s]') re_quotes = re.compile('[\'\"]') re_doublequote = re.compile('\"') re_equal = re.compile('\=') re_logical_and = re.compile('\sand\s', re.I) re_logical_or = re.compile('\sor\s', re.I) re_logical_not = re.compile('\snot\s', re.I) re_operators = re.compile(r'\s([\+\-\|])\s') re_pattern_wildcards_after_spaces = re.compile(r'(\s)[\*\%]+') re_pattern_single_quotes = re.compile("'(.*?)'") re_pattern_double_quotes = re.compile("\"(.*?)\"") re_pattern_regexp_quotes = re.compile("\/(.*?)\/") re_pattern_spaces_after_colon = re.compile(r'(:\s+)') re_pattern_short_words = re.compile(r'([\s\"]\w{1,3})[\*\%]+') re_pattern_space = re.compile("__SPACE__") re_pattern_today = re.compile("\$TODAY\$") re_pattern_parens = re.compile(r'\([^\)]+\s+[^\)]+\)') re_unicode_lowercase_a = re.compile(unicode(r"(?u)[áàäâãå]", "utf-8")) re_unicode_lowercase_ae = re.compile(unicode(r"(?u)[æ]", "utf-8")) re_unicode_lowercase_e = re.compile(unicode(r"(?u)[éèëê]", "utf-8")) re_unicode_lowercase_i = re.compile(unicode(r"(?u)[íìïî]", "utf-8")) re_unicode_lowercase_o = re.compile(unicode(r"(?u)[óòöôõø]", "utf-8")) re_unicode_lowercase_u = re.compile(unicode(r"(?u)[úùüû]", "utf-8")) re_unicode_lowercase_y = re.compile(unicode(r"(?u)[ýÿ]", "utf-8")) re_unicode_lowercase_c = re.compile(unicode(r"(?u)[çć]", "utf-8")) re_unicode_lowercase_n = re.compile(unicode(r"(?u)[ñ]", "utf-8")) re_unicode_uppercase_a = re.compile(unicode(r"(?u)[ÁÀÄÂÃÅ]", "utf-8")) re_unicode_uppercase_ae = re.compile(unicode(r"(?u)[Æ]", "utf-8")) re_unicode_uppercase_e = re.compile(unicode(r"(?u)[ÉÈËÊ]", "utf-8")) re_unicode_uppercase_i = re.compile(unicode(r"(?u)[ÍÌÏÎ]", "utf-8")) re_unicode_uppercase_o = re.compile(unicode(r"(?u)[ÓÒÖÔÕØ]", "utf-8")) re_unicode_uppercase_u = re.compile(unicode(r"(?u)[ÚÙÜÛ]", "utf-8")) re_unicode_uppercase_y = re.compile(unicode(r"(?u)[Ý]", "utf-8")) re_unicode_uppercase_c = re.compile(unicode(r"(?u)[ÇĆ]", "utf-8")) re_unicode_uppercase_n = re.compile(unicode(r"(?u)[Ñ]", "utf-8")) re_latex_lowercase_a = re.compile("\\\\[\"H'`~^vu=k]\{?a\}?") re_latex_lowercase_ae = re.compile("\\\\ae\\{\\}?") re_latex_lowercase_e = re.compile("\\\\[\"H'`~^vu=k]\\{?e\\}?") re_latex_lowercase_i = re.compile("\\\\[\"H'`~^vu=k]\\{?i\\}?") re_latex_lowercase_o = re.compile("\\\\[\"H'`~^vu=k]\\{?o\\}?") re_latex_lowercase_u = re.compile("\\\\[\"H'`~^vu=k]\\{?u\\}?") re_latex_lowercase_y = re.compile("\\\\[\"']\\{?y\\}?") re_latex_lowercase_c = re.compile("\\\\['uc]\\{?c\\}?") re_latex_lowercase_n = re.compile("\\\\[c'~^vu]\\{?n\\}?") re_latex_uppercase_a = re.compile("\\\\[\"H'`~^vu=k]\\{?A\\}?") re_latex_uppercase_ae = re.compile("\\\\AE\\{?\\}?") re_latex_uppercase_e = re.compile("\\\\[\"H'`~^vu=k]\\{?E\\}?") re_latex_uppercase_i = re.compile("\\\\[\"H'`~^vu=k]\\{?I\\}?") re_latex_uppercase_o = re.compile("\\\\[\"H'`~^vu=k]\\{?O\\}?") re_latex_uppercase_u = re.compile("\\\\[\"H'`~^vu=k]\\{?U\\}?") re_latex_uppercase_y = re.compile("\\\\[\"']\\{?Y\\}?") re_latex_uppercase_c = re.compile("\\\\['uc]\\{?C\\}?") re_latex_uppercase_n = re.compile("\\\\[c'~^vu]\\{?N\\}?") class RestrictedCollectionDataCacher(DataCacher): def __init__(self): def cache_filler(): ret = [] try: res = run_sql("""SELECT DISTINCT ar.value FROM accROLE_accACTION_accARGUMENT raa JOIN accARGUMENT ar ON raa.id_accARGUMENT = ar.id WHERE ar.keyword = 'collection' AND raa.id_accACTION = %s""", (VIEWRESTRCOLL_ID,)) except Exception: # database problems, return empty cache return [] for coll in res: 
ret.append(coll[0]) return ret def timestamp_verifier(): return max(get_table_update_time('accROLE_accACTION_accARGUMENT'), get_table_update_time('accARGUMENT')) DataCacher.__init__(self, cache_filler, timestamp_verifier) def collection_restricted_p(collection, recreate_cache_if_needed=True): if recreate_cache_if_needed: restricted_collection_cache.recreate_cache_if_needed() return collection in restricted_collection_cache.cache try: restricted_collection_cache.is_ok_p except Exception: restricted_collection_cache = RestrictedCollectionDataCacher() def ziplist(*lists): """Just like zip(), but returns lists of lists instead of lists of tuples Example: zip([f1, f2, f3], [p1, p2, p3], [op1, op2, '']) => [(f1, p1, op1), (f2, p2, op2), (f3, p3, '')] ziplist([f1, f2, f3], [p1, p2, p3], [op1, op2, '']) => [[f1, p1, op1], [f2, p2, op2], [f3, p3, '']] FIXME: This is handy to have, and should live somewhere else, like miscutil.really_useful_functions or something. XXX: Starting in python 2.6, the same can be achieved (faster) by using itertools.izip_longest(); when the minimum recommended Python is bumped, we should use that instead. """ def l(*items): return list(items) return map(l, *lists) def get_permitted_restricted_collections(user_info, recreate_cache_if_needed=True): """Return a list of collection that are restricted but for which the user is authorized.""" if recreate_cache_if_needed: restricted_collection_cache.recreate_cache_if_needed() ret = [] for collection in restricted_collection_cache.cache: if acc_authorize_action(user_info, 'viewrestrcoll', collection=collection)[0] == 0: ret.append(collection) return ret def get_all_restricted_recids(): """ Return the set of all the restricted recids, i.e. the ids of those records which belong to at least one restricted collection. """ - ret = HitSet() + ret = intbitset() for collection in restricted_collection_cache.cache: ret |= get_collection_reclist(collection) return ret def get_restricted_collections_for_recid(recid, recreate_cache_if_needed=True): """ Return the list of restricted collection names to which recid belongs. """ if recreate_cache_if_needed: restricted_collection_cache.recreate_cache_if_needed() collection_reclist_cache.recreate_cache_if_needed() return [collection for collection in restricted_collection_cache.cache if recid in get_collection_reclist(collection, recreate_cache_if_needed=False)] def is_user_owner_of_record(user_info, recid): """ Check if the user is owner of the record, i.e. he is the submitter and/or belongs to a owner-like group authorized to 'see' the record. @param user_info: the user_info dictionary that describe the user. @type user_info: user_info dictionary @param recid: the record identifier. @type recid: positive integer @return: True if the user is 'owner' of the record; False otherwise @rtype: bool """ authorized_emails_or_group = [] for tag in CFG_ACC_GRANT_AUTHOR_RIGHTS_TO_EMAILS_IN_TAGS: authorized_emails_or_group.extend(get_fieldvalues(recid, tag)) for email_or_group in authorized_emails_or_group: if email_or_group in user_info['group']: return True email = email_or_group.strip().lower() if user_info['email'].strip().lower() == email: return True return False def check_user_can_view_record(user_info, recid): """ Check if the user is authorized to view the given recid. The function grants access in two cases: either user has author rights on this record, or he has view rights to the primary collection this record belongs to. @param user_info: the user_info dictionary that describe the user. 
@type user_info: user_info dictionary @param recid: the record identifier. @type recid: positive integer @return: (0, ''), when authorization is granted, (>0, 'message') when authorization is not granted @rtype: (int, string) """ policy = CFG_WEBSEARCH_VIEWRESTRCOLL_POLICY.strip().upper() if isinstance(recid, str): recid = int(recid) if record_public_p(recid): ## The record is already known to be public. return (0, '') ## At this point, either webcoll has not yet run or there are some ## restricted collections. Let's see first if the user own the record. if is_user_owner_of_record(user_info, recid): ## Perfect! It's authorized then! return (0, '') restricted_collections = get_restricted_collections_for_recid(recid, recreate_cache_if_needed=False) if restricted_collections: ## If there are restricted collections the user must be authorized to all/any of them (depending on the policy) auth_code, auth_msg = 0, '' for collection in get_restricted_collections_for_recid(recid, recreate_cache_if_needed=False): (auth_code, auth_msg) = acc_authorize_action(user_info, VIEWRESTRCOLL, collection=collection) if auth_code and policy != 'ANY': ## Ouch! the user is not authorized to this collection return (auth_code, auth_msg) elif auth_code == 0 and policy == 'ANY': ## Good! At least one collection is authorized return (0, '') ## Depending on the policy, the user will be either authorized or not return auth_code, auth_msg if is_record_in_any_collection(recid, recreate_cache_if_needed=False): ## the record is not in any restricted collection return (0, '') elif record_exists(recid) > 0: ## We are in the case where webcoll has not run. ## Let's authorize SUPERADMIN (auth_code, auth_msg) = acc_authorize_action(user_info, VIEWRESTRCOLL, collection=None) if auth_code == 0: return (0, '') else: ## Too bad. Let's print a nice message: return (1, """The record you are trying to access has just been submitted to the system and needs to be assigned to the proper collections. It is currently restricted for security reasons until the assignment will be fully completed. Please come back later to properly access this record.""") else: ## The record either does not exists or has been deleted. ## Let's handle these situations outside of this code. return (0, '') class IndexStemmingDataCacher(DataCacher): """ Provides cache for stemming information for word/phrase indexes. This class is not to be used directly; use function get_index_stemming_language() instead. """ def __init__(self): def cache_filler(): try: res = run_sql("""SELECT id, stemming_language FROM idxINDEX""") except DatabaseError: # database problems, return empty cache return {} return dict(res) def timestamp_verifier(): return get_table_update_time('idxINDEX') DataCacher.__init__(self, cache_filler, timestamp_verifier) try: index_stemming_cache.is_ok_p except Exception: index_stemming_cache = IndexStemmingDataCacher() def get_index_stemming_language(index_id, recreate_cache_if_needed=True): """Return stemming langugage for given index.""" if recreate_cache_if_needed: index_stemming_cache.recreate_cache_if_needed() return index_stemming_cache.cache[index_id] class CollectionRecListDataCacher(DataCacher): """ Provides cache for collection reclist hitsets. This class is not to be used directly; use function get_collection_reclist() instead. 
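
## Aside: every *DataCacher subclass in this file follows the same
## contract -- cache_filler() rebuilds the cache from the database, and
## timestamp_verifier() returns the relevant table's last update time so
## that recreate_cache_if_needed() can decide whether the cached copy is
## stale.  A minimal, self-contained model of that contract (illustrative
## only; the "_toy" names are not part of Invenio):
#
# class _ToyDataCacher(object):
#     def __init__(self, cache_filler, timestamp_verifier):
#         self.cache_filler = cache_filler
#         self.timestamp_verifier = timestamp_verifier
#         self.timestamp = None
#         self.cache = None
#         self.recreate_cache_if_needed()
#     def recreate_cache_if_needed(self):
#         latest = self.timestamp_verifier()
#         if self.timestamp is None or latest > self.timestamp:
#             self.cache = self.cache_filler()   # rebuild from the DB
#             self.timestamp = latest
#
# cacher = _ToyDataCacher(lambda: {'Theses': None},
#                         lambda: '2011-01-01 00:00:00')
# cacher.recreate_cache_if_needed()  # no-op until the timestamp advances
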
""" def __init__(self): def cache_filler(): ret = {} try: res = run_sql("SELECT name,reclist FROM collection") except Exception: # database problems, return empty cache return {} for name, reclist in res: ret[name] = None # this will be filled later during runtime by calling get_collection_reclist(coll) return ret def timestamp_verifier(): return get_table_update_time('collection') DataCacher.__init__(self, cache_filler, timestamp_verifier) try: if not collection_reclist_cache.is_ok_p: raise Exception except Exception: collection_reclist_cache = CollectionRecListDataCacher() def get_collection_reclist(coll, recreate_cache_if_needed=True): """Return hitset of recIDs that belong to the collection 'coll'.""" if recreate_cache_if_needed: collection_reclist_cache.recreate_cache_if_needed() if not collection_reclist_cache.cache[coll]: # not yet it the cache, so calculate it and fill the cache: - set = HitSet() + set = intbitset() query = "SELECT nbrecs,reclist FROM collection WHERE name=%s" res = run_sql(query, (coll, ), 1) if res: try: - set = HitSet(res[0][1]) + set = intbitset(res[0][1]) except: pass collection_reclist_cache.cache[coll] = set # finally, return reclist: return collection_reclist_cache.cache[coll] def get_available_output_formats(visible_only=False): """ Return the list of available output formats. When visible_only is True, returns only those output formats that have visibility flag set to 1. """ formats = [] query = "SELECT code,name FROM format" if visible_only: query += " WHERE visibility='1'" query += " ORDER BY name ASC" res = run_sql(query) if res: # propose found formats: for code, name in res: formats.append({ 'value' : code, 'text' : name }) else: formats.append({'value' : 'hb', 'text' : "HTML brief" }) return formats class SearchResultsCache(DataCacher): """ Provides temporary lazy cache for Search Results. Useful when users click on `next page'. """ def __init__(self): def cache_filler(): return {} def timestamp_verifier(): return '1970-01-01 00:00:00' # lazy cache is always okay; # its filling is governed by # CFG_WEBSEARCH_SEARCH_CACHE_SIZE DataCacher.__init__(self, cache_filler, timestamp_verifier) try: if not search_results_cache.is_ok_p: raise Exception except Exception: search_results_cache = SearchResultsCache() class CollectionI18nNameDataCacher(DataCacher): """ Provides cache for I18N collection names. This class is not to be used directly; use function get_coll_i18nname() instead. """ def __init__(self): def cache_filler(): ret = {} try: res = run_sql("SELECT c.name,cn.ln,cn.value FROM collectionname AS cn, collection AS c WHERE cn.id_collection=c.id AND cn.type='ln'") # ln=long name except Exception: # database problems return {} for c, ln, i18nname in res: if i18nname: if not ret.has_key(c): ret[c] = {} ret[c][ln] = i18nname return ret def timestamp_verifier(): return get_table_update_time('collectionname') DataCacher.__init__(self, cache_filler, timestamp_verifier) try: if not collection_i18nname_cache.is_ok_p: raise Exception except Exception: collection_i18nname_cache = CollectionI18nNameDataCacher() def get_coll_i18nname(c, ln=CFG_SITE_LANG, verify_cache_timestamp=True): """ Return nicely formatted collection name (of the name type `ln' (=long name)) for collection C in language LN. This function uses collection_i18nname_cache, but it verifies whether the cache is up-to-date first by default. This verification step is performed by checking the DB table update time. 
So, if you call this function 1000 times, it can get very slow because it will do 1000 table update time verifications, even though collection names change not that often. Hence the parameter VERIFY_CACHE_TIMESTAMP which, when set to False, will assume the cache is already up-to-date. This is useful namely in the generation of collection lists for the search results page. """ if verify_cache_timestamp: collection_i18nname_cache.recreate_cache_if_needed() out = c try: out = collection_i18nname_cache.cache[c][ln] except KeyError: pass # translation in LN does not exist return out class FieldI18nNameDataCacher(DataCacher): """ Provides cache for I18N field names. This class is not to be used directly; use function get_field_i18nname() instead. """ def __init__(self): def cache_filler(): ret = {} try: res = run_sql("SELECT f.name,fn.ln,fn.value FROM fieldname AS fn, field AS f WHERE fn.id_field=f.id AND fn.type='ln'") # ln=long name except Exception: # database problems, return empty cache return {} for f, ln, i18nname in res: if i18nname: if not ret.has_key(f): ret[f] = {} ret[f][ln] = i18nname return ret def timestamp_verifier(): return get_table_update_time('fieldname') DataCacher.__init__(self, cache_filler, timestamp_verifier) try: if not field_i18nname_cache.is_ok_p: raise Exception except Exception: field_i18nname_cache = FieldI18nNameDataCacher() def get_field_i18nname(f, ln=CFG_SITE_LANG, verify_cache_timestamp=True): """ Return nicely formatted field name (of type 'ln', 'long name') for field F in language LN. If VERIFY_CACHE_TIMESTAMP is set to True, then verify DB timestamp and field I18N name cache timestamp and refresh cache from the DB if needed. Otherwise don't bother checking DB timestamp and return the cached value. (This is useful when get_field_i18nname is called inside a loop.) """ if verify_cache_timestamp: field_i18nname_cache.recreate_cache_if_needed() out = f try: out = field_i18nname_cache.cache[f][ln] except KeyError: pass # translation in LN does not exist return out def get_alphabetically_ordered_collection_list(level=0, ln=CFG_SITE_LANG): """Returns nicely ordered (score respected) list of collections, more exactly list of tuples (collection name, printable collection name). Suitable for create_search_box().""" out = [] res = run_sql("SELECT id,name FROM collection ORDER BY name ASC") for c_id, c_name in res: # make a nice printable name (e.g. truncate c_printable for # long collection names in given language): c_printable_fullname = get_coll_i18nname(c_name, ln, False) c_printable = wash_index_term(c_printable_fullname, 30, False) if c_printable != c_printable_fullname: c_printable = c_printable + "..." if level: c_printable = " " + level * '-' + " " + c_printable out.append([c_name, c_printable]) return out def get_nicely_ordered_collection_list(collid=1, level=0, ln=CFG_SITE_LANG): """Returns nicely ordered (score respected) list of collections, more exactly list of tuples (collection name, printable collection name). Suitable for create_search_box().""" colls_nicely_ordered = [] res = run_sql("""SELECT c.name,cc.id_son FROM collection_collection AS cc, collection AS c WHERE c.id=cc.id_son AND cc.id_dad=%s ORDER BY score DESC""", (collid, )) for c, cid in res: # make a nice printable name (e.g. truncate c_printable for # long collection names in given language): c_printable_fullname = get_coll_i18nname(c, ln, False) c_printable = wash_index_term(c_printable_fullname, 30, False) if c_printable != c_printable_fullname: c_printable = c_printable + "..." 
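
## Aside: note the third argument in the get_coll_i18nname(c, ln, False)
## calls in these two collection-listing helpers -- inside per-collection
## loops the translation cache is read with verify_cache_timestamp=False,
## so the DB table update time is checked once per request rather than
## once per collection.  Illustrative calling pattern (the loop itself is
## hypothetical; the names are the real ones from this file):
#
# collection_i18nname_cache.recreate_cache_if_needed()  # check DB once
# for name in names_to_display:
#     label = get_coll_i18nname(name, ln, False)        # cache-only reads
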
if level: c_printable = " " + level * '-' + " " + c_printable colls_nicely_ordered.append([c, c_printable]) colls_nicely_ordered = colls_nicely_ordered + get_nicely_ordered_collection_list(cid, level+1, ln=ln) return colls_nicely_ordered def get_index_id_from_field(field): """ Return index id with name corresponding to FIELD, or the first index id where the logical field code named FIELD is indexed. Return zero in case there is no index defined for this field. Example: field='author', output=4. """ out = 0 if not field: field = 'global' # empty string field means 'global' index (field 'anyfield') # first look in the index table: res = run_sql("""SELECT id FROM idxINDEX WHERE name=%s""", (field,)) if res: out = res[0][0] return out # not found in the index table, now look in the field table: res = run_sql("""SELECT w.id FROM idxINDEX AS w, idxINDEX_field AS wf, field AS f WHERE f.code=%s AND wf.id_field=f.id AND w.id=wf.id_idxINDEX LIMIT 1""", (field,)) if res: out = res[0][0] return out def get_words_from_pattern(pattern): "Returns list of whitespace-separated words from pattern." words = {} for word in string.split(pattern): if not words.has_key(word): words[word] = 1 return words.keys() def create_basic_search_units(req, p, f, m=None, of='hb'): """Splits search pattern and search field into a list of independently searchable units. - A search unit consists of '(operator, pattern, field, type, hitset)' tuples where 'operator' is set union (|), set intersection (+) or set exclusion (-); 'pattern' is either a word (e.g. muon*) or a phrase (e.g. 'nuclear physics'); 'field' is either a code like 'title' or MARC tag like '100__a'; 'type' is the search type ('w' for word file search, 'a' for access file search). - Optionally, the function accepts the match type argument 'm'. If it is set (e.g. from advanced search interface), then it performs this kind of matching. If it is not set, then a guess is made. 'm' can have values: 'a'='all of the words', 'o'='any of the words', 'p'='phrase/substring', 'r'='regular expression', 'e'='exact value'. - Warnings are printed on req (when not None) in case of HTML output formats.""" opfts = [] # will hold (o,p,f,t,h) units # FIXME: quick hack for the journal index if f == 'journal': opfts.append(['+', p, f, 'w']) return opfts ## check arguments: is desired matching type set? if m: ## A - matching type is known; good! if m == 'e': # A1 - exact value: opfts.append(['+', p, f, 'a']) # '+' since we have only one unit elif m == 'p': # A2 - phrase/substring: opfts.append(['+', "%" + p + "%", f, 'a']) # '+' since we have only one unit elif m == 'r': # A3 - regular expression: opfts.append(['+', p, f, 'r']) # '+' since we have only one unit elif m == 'a' or m == 'w': # A4 - all of the words: p = strip_accents(p) # strip accents for 'w' mode, FIXME: delete when not needed for word in get_words_from_pattern(p): opfts.append(['+', word, f, 'w']) # '+' in all units elif m == 'o': # A5 - any of the words: p = strip_accents(p) # strip accents for 'w' mode, FIXME: delete when not needed for word in get_words_from_pattern(p): if len(opfts)==0: opfts.append(['+', word, f, 'w']) # '+' in the first unit else: opfts.append(['|', word, f, 'w']) # '|' in further units else: if of.startswith("h"): print_warning(req, "Matching type '%s' is not implemented yet." 
% cgi.escape(m), "Warning") opfts.append(['+', "%" + p + "%", f, 'w']) else: ## B - matching type is not known: let us try to determine it by some heuristics if f and p[0] == '"' and p[-1] == '"': ## B0 - does 'p' start and end by double quote, and is 'f' defined? => doing ACC search opfts.append(['+', p[1:-1], f, 'a']) elif f in ('author', 'firstauthor', 'exactauthor', 'exactfirstauthor') and author_name_requires_phrase_search(p): ## B1 - do we search in author, and does 'p' contain space/comma/dot/etc? ## => doing washed ACC search opfts.append(['+', p, f, 'a']) elif f and p[0] == "'" and p[-1] == "'": ## B0bis - does 'p' start and end by single quote, and is 'f' defined? => doing ACC search opfts.append(['+', '%' + p[1:-1] + '%', f, 'a']) elif f and p[0] == "/" and p[-1] == "/": ## B0ter - does 'p' start and end by a slash, and is 'f' defined? => doing regexp search opfts.append(['+', p[1:-1], f, 'r']) elif f and string.find(p, ',') >= 0: ## B1 - does 'p' contain comma, and is 'f' defined? => doing ACC search opfts.append(['+', p, f, 'a']) elif f and str(f[0:2]).isdigit(): ## B2 - does 'f' exist and starts by two digits? => doing ACC search opfts.append(['+', p, f, 'a']) else: ## B3 - doing WRD search, but maybe ACC too # search units are separated by spaces unless the space is within single or double quotes # so, let us replace temporarily any space within quotes by '__SPACE__' p = re_pattern_single_quotes.sub(lambda x: "'"+string.replace(x.group(1), ' ', '__SPACE__')+"'", p) p = re_pattern_double_quotes.sub(lambda x: "\""+string.replace(x.group(1), ' ', '__SPACE__')+"\"", p) p = re_pattern_regexp_quotes.sub(lambda x: "/"+string.replace(x.group(1), ' ', '__SPACE__')+"/", p) # and spaces after colon as well: p = re_pattern_spaces_after_colon.sub(lambda x: string.replace(x.group(1), ' ', '__SPACE__'), p) # wash argument: p = re_equal.sub(":", p) p = re_logical_and.sub(" ", p) p = re_logical_or.sub(" |", p) p = re_logical_not.sub(" -", p) p = re_operators.sub(r' \1', p) for pi in string.split(p): # iterate through separated units (or items, as "pi" stands for "p item") pi = re_pattern_space.sub(" ", pi) # replace back '__SPACE__' by ' ' # firstly, determine set operator if pi[0] == '+' or pi[0] == '-' or pi[0] == '|': oi = pi[0] pi = pi[1:] else: # okay, there is no operator, so let us decide what to do by default oi = '+' # by default we are doing set intersection... 
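
## Worked example of this operator scan (the query is hypothetical):
## after the quote/space protection above, a pattern such as
##
##     ellis +muon -kaon |pion
##
## is split on whitespace and the leading character of each item selects
## the set operator, yielding roughly these (operator, pattern) pairs:
##
##     ('+', 'ellis'), ('+', 'muon'), ('-', 'kaon'), ('|', 'pion')
##
## i.e. items without an explicit prefix default to intersection ('+').
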
            # secondly, determine search pattern and field:
            if string.find(pi, ":") > 0:
                fi, pi = string.split(pi, ":", 1)
                fi = wash_field(fi)
                # test whether fi is a real index code or a MARC-tag defined code:
                if fi in get_fieldcodes() or '00' <= fi[:2] <= '99':
                    pass
                else:
                    # it is not, so join it back:
                    fi, pi = f, fi + ":" + pi
            else:
                fi, pi = f, pi
            # wash 'fi' argument:
            fi = wash_field(fi)
            # wash 'pi' argument:
            pi = pi.strip() # strip eventual spaces
            if re_quotes.match(pi):
                # B3a - quotes are found => do ACC search (phrase search)
                if pi[0] == '"' and pi[-1] == '"':
                    pi = string.replace(pi, '"', '') # remove quote signs
                    opfts.append([oi, pi, fi, 'a'])
                elif pi[0] == "'" and pi[-1] == "'":
                    pi = string.replace(pi, "'", "") # remove quote signs
                    opfts.append([oi, "%" + pi + "%", fi, 'a'])
                else:
                    # unbalanced quotes, so fall back to WRD query:
                    opfts.append([oi, pi, fi, 'w'])
            elif pi.startswith('/') and pi.endswith('/'):
                # B3b - pi has slashes around => do regexp search
                opfts.append([oi, pi[1:-1], fi, 'r'])
            elif fi and str(fi[0]).isdigit() and str(fi[1]).isdigit():
                # B3c - fi exists and starts by two digits => do ACC search
                opfts.append([oi, pi, fi, 'a'])
            elif fi and not get_index_id_from_field(fi) and get_field_name(fi):
                # B3d - logical field fi exists but there is no WRD index for fi => try ACC search
                opfts.append([oi, pi, fi, 'a'])
            else:
                # B3e - general case => do WRD search
                pi = strip_accents(pi) # strip accents for 'w' mode, FIXME: delete when not needed
                for pii in get_words_from_pattern(pi):
                    opfts.append([oi, pii, fi, 'w'])

    ## sanity check:
    for i in range(0, len(opfts)):
        try:
            pi = opfts[i][1]
            if pi == '*':
                if of.startswith("h"):
                    print_warning(req, "Ignoring standalone wildcard word.", "Warning")
                del opfts[i]
            if pi == '' or pi == ' ':
                fi = opfts[i][2]
                if fi:
                    if of.startswith("h"):
                        print_warning(req, "Ignoring empty %s search term." % fi, "Warning")
                del opfts[i]
        except:
            pass

    ## replace old logical field names if applicable:
    if CFG_WEBSEARCH_FIELDS_CONVERT:
        opfts = [[o, p, wash_field(f), t] for o, p, f, t in opfts]

    ## return search units:
    return opfts

def page_start(req, of, cc, aas, ln, uid, title_message=None,
               description='', keywords='', recID=-1, tab='', p=''):
    "Start page according to given output format."
    _ = gettext_set_language(ln)
    if not req or isinstance(req, cStringIO.OutputType):
        return # we were called from CLI
    if not title_message:
        title_message = _("Search Results")
    content_type = get_output_format_content_type(of)
    if of.startswith('x'):
        if of == 'xr':
            # we are doing RSS output
            req.content_type = "application/rss+xml"
            req.send_http_header()
            req.write("""<?xml version="1.0" encoding="UTF-8"?>\n""")
        else:
            # we are doing XML output:
            req.content_type = "text/xml"
            req.send_http_header()
            req.write("""<?xml version="1.0" encoding="UTF-8"?>\n""")
    elif of.startswith('t') or str(of[0:3]).isdigit():
        # we are doing plain text output:
        req.content_type = "text/plain"
        req.send_http_header()
    elif of == "id":
        pass # nothing to do, we shall only return list of recIDs
    elif content_type == 'text/html':
        # we are doing HTML output:
        req.content_type = "text/html"
        req.send_http_header()
        if not description:
            description = "%s %s."
% (cc, _("Search Results")) if not keywords: keywords = "%s, WebSearch, %s" % (get_coll_i18nname(CFG_SITE_NAME, ln, False), get_coll_i18nname(cc, ln, False)) ## generate RSS URL: argd = {} if req.args: argd = cgi.parse_qs(req.args) rssurl = websearch_templates.build_rss_url(argd) ## add MathJax if displaying single records (FIXME: find ## eventual better place to this code) if of.lower() in CFG_WEBSEARCH_USE_MATHJAX_FOR_FORMATS: metaheaderadd = get_mathjax_header(req.is_https()) else: metaheaderadd = '' ## generate navtrail: navtrail = create_navtrail_links(cc, aas, ln) if navtrail != '': navtrail += ' > ' if (tab != '' or ((of != '' or of.lower() != 'hd') and of != 'hb')) and \ recID != -1: # If we are not in information tab in HD format, customize # the nav. trail to have a link back to main record. (Due # to the way perform_request_search() works, hb # (lowercase) is equal to hd) navtrail += ' %s' % \ (CFG_SITE_URL, CFG_SITE_RECORD, recID, title_message) if (of != '' or of.lower() != 'hd') and of != 'hb': # Export format_name = of query = "SELECT name FROM format WHERE code=%s" res = run_sql(query, (of,)) if res: format_name = res[0][0] navtrail += ' > ' + format_name else: # Discussion, citations, etc. tabs tab_label = get_detailed_page_tabs(cc, ln=ln)[tab]['label'] navtrail += ' > ' + _(tab_label) else: navtrail += title_message if p: # we are serving search/browse results pages, so insert pattern: navtrail += ": " + cgi.escape(p) title_message = cgi.escape(p) + " - " + title_message body_css_classes = [] if cc: # we know the collection, lets allow page styles based on cc #collection names may not satisfy rules for css classes which #are something like: -?[_a-zA-Z]+[_a-zA-Z0-9-]* #however it isn't clear what we should do about cases with #numbers, so we leave them to fail. Everything else becomes "_" css = nmtoken_from_string(cc).replace('.','_').replace('-','_').replace(':','_') body_css_classes.append(css) ## finally, print page header: req.write(pageheaderonly(req=req, title=title_message, navtrail=navtrail, description=description, keywords=keywords, metaheaderadd=metaheaderadd, uid=uid, language=ln, navmenuid='search', navtrail_append_title_p=0, rssurl=rssurl, body_css_classes=body_css_classes)) req.write(websearch_templates.tmpl_search_pagestart(ln=ln)) #else: # req.send_http_header() def page_end(req, of="hb", ln=CFG_SITE_LANG): "End page according to given output format: e.g. close XML tags, add HTML footer, etc." if of == "id": return [] # empty recID list if not req: return # we were called from CLI if of.startswith('h'): req.write(websearch_templates.tmpl_search_pageend(ln = ln)) # pagebody end req.write(pagefooteronly(lastupdated=__lastupdated__, language=ln, req=req)) return def create_page_title_search_pattern_info(p, p1, p2, p3): """Create the search pattern bit for the page web page HTML header. Basically combine p and (p1,p2,p3) together so that the page header may be filled whether we are in the Simple Search or Advanced Search interface contexts.""" out = "" if p: out = p else: out = p1 if p2: out += ' ' + p2 if p3: out += ' ' + p3 return out def create_inputdate_box(name="d1", selected_year=0, selected_month=0, selected_day=0, ln=CFG_SITE_LANG): "Produces 'From Date', 'Until Date' kind of selection box. Suitable for search options." 
_ = gettext_set_language(ln) box = "" # day box += """<select name="%sd">""" % name box += """<option value="">%s""" % _("any day") for day in range(1, 32): box += """<option value="%02d"%s>%02d""" % (day, is_selected(day, selected_day), day) box += """</select>""" # month box += """<select name="%sm">""" % name box += """<option value="">%s""" % _("any month") for mm, month in [(1, _("January")), (2, _("February")), (3, _("March")), (4, _("April")), \ (5, _("May")), (6, _("June")), (7, _("July")), (8, _("August")), \ (9, _("September")), (10, _("October")), (11, _("November")), (12, _("December"))]: box += """<option value="%02d"%s>%s""" % (mm, is_selected(mm, selected_month), month) box += """</select>""" # year box += """<select name="%sy">""" % name box += """<option value="">%s""" % _("any year") this_year = int(time.strftime("%Y", time.localtime())) for year in range(this_year-20, this_year+1): box += """<option value="%d"%s>%d""" % (year, is_selected(year, selected_year), year) box += """</select>""" return box def create_search_box(cc, colls, p, f, rg, sf, so, sp, rm, of, ot, aas, ln, p1, f1, m1, op1, p2, f2, m2, op2, p3, f3, m3, sc, pl, d1y, d1m, d1d, d2y, d2m, d2d, dt, jrec, ec, action=""): """Create search box for 'search again in the results page' functionality.""" # load the right message language _ = gettext_set_language(ln) # some computations cc_intl = get_coll_i18nname(cc, ln, False) cc_colID = get_colID(cc) colls_nicely_ordered = [] if cfg_nicely_ordered_collection_list: colls_nicely_ordered = get_nicely_ordered_collection_list(ln=ln) else: colls_nicely_ordered = get_alphabetically_ordered_collection_list(ln=ln) colls_nice = [] for (cx, cx_printable) in colls_nicely_ordered: if not cx.startswith("Unnamed collection"): colls_nice.append({ 'value' : cx, 'text' : cx_printable }) coll_selects = [] if colls and colls[0] != CFG_SITE_NAME: # some collections are defined, so print these first, and only then print 'add another collection' heading: for c in colls: if c: temp = [] temp.append({ 'value' : CFG_SITE_NAME, 'text' : '*** %s ***' % _("any public collection") }) # this field is used to remove the current collection from the ones to be searched. temp.append({ 'value' : '', 'text' : '*** %s ***' % _("remove this collection") }) for val in colls_nice: # print collection: if not cx.startswith("Unnamed collection"): temp.append({ 'value' : val['value'], 'text' : val['text'], 'selected' : (c == re.sub("^[\s\-]*","", val['value'])) }) coll_selects.append(temp) coll_selects.append([{ 'value' : '', 'text' : '*** %s ***' % _("add another collection") }] + colls_nice) else: # we searched in CFG_SITE_NAME, so print 'any public collection' heading coll_selects.append([{ 'value' : CFG_SITE_NAME, 'text' : '*** %s ***' % _("any public collection") }] + colls_nice) ## ranking methods ranks = [{ 'value' : '', 'text' : "- %s %s -" % (_("OR").lower (), _("rank by")), }] for (code, name) in get_bibrank_methods(cc_colID, ln): # propose found rank methods: ranks.append({ 'value' : code, 'text' : name, }) formats = get_available_output_formats(visible_only=True) # show collections in the search box? 
(not if there is only one # collection defined, and not if we are in light search) show_colls = True show_title = True if len(collection_reclist_cache.cache.keys()) == 1 or \ aas == -1: show_colls = False show_title = False if cc == CFG_SITE_NAME: show_title = False if CFG_INSPIRE_SITE: show_title = False return websearch_templates.tmpl_search_box( ln = ln, aas = aas, cc_intl = cc_intl, cc = cc, ot = ot, sp = sp, action = action, fieldslist = get_searchwithin_fields(ln=ln, colID=cc_colID), f1 = f1, f2 = f2, f3 = f3, m1 = m1, m2 = m2, m3 = m3, p1 = p1, p2 = p2, p3 = p3, op1 = op1, op2 = op2, rm = rm, p = p, f = f, coll_selects = coll_selects, d1y = d1y, d2y = d2y, d1m = d1m, d2m = d2m, d1d = d1d, d2d = d2d, dt = dt, sort_fields = get_sortby_fields(ln=ln, colID=cc_colID), sf = sf, so = so, ranks = ranks, sc = sc, rg = rg, formats = formats, of = of, pl = pl, jrec = jrec, ec = ec, show_colls = show_colls, show_title = show_title, ) def create_navtrail_links(cc=CFG_SITE_NAME, aas=0, ln=CFG_SITE_LANG, self_p=1, tab=''): """Creates navigation trail links, i.e. links to collection ancestors (except Home collection). If aas==1, then links to Advanced Search interfaces; otherwise Simple Search. """ dads = [] for dad in get_coll_ancestors(cc): if dad != CFG_SITE_NAME: # exclude Home collection dads.append ((dad, get_coll_i18nname(dad, ln, False))) if self_p and cc != CFG_SITE_NAME: dads.append((cc, get_coll_i18nname(cc, ln, False))) return websearch_templates.tmpl_navtrail_links( aas=aas, ln=ln, dads=dads) def get_searchwithin_fields(ln='en', colID=None): """Retrieves the fields name used in the 'search within' selection box for the collection ID colID.""" res = None if colID: res = run_sql("""SELECT f.code,f.name FROM field AS f, collection_field_fieldvalue AS cff WHERE cff.type='sew' AND cff.id_collection=%s AND cff.id_field=f.id ORDER BY cff.score DESC, f.name ASC""", (colID,)) if not res: res = run_sql("SELECT code,name FROM field ORDER BY name ASC") fields = [{ 'value' : '', 'text' : get_field_i18nname("any field", ln, False) }] for field_code, field_name in res: if field_code and field_code != "anyfield": fields.append({ 'value' : field_code, 'text' : get_field_i18nname(field_name, ln, False) }) return fields def get_sortby_fields(ln='en', colID=None): """Retrieves the fields name used in the 'sort by' selection box for the collection ID colID.""" _ = gettext_set_language(ln) res = None if colID: res = run_sql("""SELECT DISTINCT(f.code),f.name FROM field AS f, collection_field_fieldvalue AS cff WHERE cff.type='soo' AND cff.id_collection=%s AND cff.id_field=f.id ORDER BY cff.score DESC, f.name ASC""", (colID,)) if not res: # no sort fields defined for this colID, try to take Home collection: res = run_sql("""SELECT DISTINCT(f.code),f.name FROM field AS f, collection_field_fieldvalue AS cff WHERE cff.type='soo' AND cff.id_collection=%s AND cff.id_field=f.id ORDER BY cff.score DESC, f.name ASC""", (1,)) if not res: # no sort fields defined for the Home collection, take all sort fields defined wherever they are: res = run_sql("""SELECT DISTINCT(f.code),f.name FROM field AS f, collection_field_fieldvalue AS cff WHERE cff.type='soo' AND cff.id_field=f.id ORDER BY cff.score DESC, f.name ASC""",) fields = [{ 'value' : '', 'text' : _("latest first") }] for field_code, field_name in res: if field_code and field_code != "anyfield": fields.append({ 'value' : field_code, 'text' : get_field_i18nname(field_name, ln, False) }) return fields def create_andornot_box(name='op', value='', ln='en'): "Returns HTML 
code for the AND/OR/NOT selection box." _ = gettext_set_language(ln) out = """ <select name="%s"> <option value="a"%s>%s <option value="o"%s>%s <option value="n"%s>%s </select> """ % (name, is_selected('a', value), _("AND"), is_selected('o', value), _("OR"), is_selected('n', value), _("AND NOT")) return out def create_matchtype_box(name='m', value='', ln='en'): "Returns HTML code for the 'match type' selection box." _ = gettext_set_language(ln) out = """ <select name="%s"> <option value="a"%s>%s <option value="o"%s>%s <option value="e"%s>%s <option value="p"%s>%s <option value="r"%s>%s </select> """ % (name, is_selected('a', value), _("All of the words:"), is_selected('o', value), _("Any of the words:"), is_selected('e', value), _("Exact phrase:"), is_selected('p', value), _("Partial phrase:"), is_selected('r', value), _("Regular expression:")) return out def is_selected(var, fld): "Checks if the two are equal, and if yes, returns ' selected'. Useful for select boxes." if type(var) is int and type(fld) is int: if var == fld: return " selected" elif str(var) == str(fld): return " selected" elif fld and len(fld)==3 and fld[0] == "w" and var == fld[1:]: return " selected" return "" def wash_colls(cc, c, split_colls=0, verbose=0): """Wash collection list by checking whether user has deselected anything under 'Narrow search'. Checks also if cc is a list or not. Return list of cc, colls_to_display, colls_to_search since the list of collections to display is different from that to search in. This is because users might have chosen 'split by collection' functionality. The behaviour of "collections to display" depends solely whether user has deselected a particular collection: e.g. if it started from 'Articles and Preprints' page, and deselected 'Preprints', then collection to display is 'Articles'. If he did not deselect anything, then collection to display is 'Articles & Preprints'. The behaviour of "collections to search in" depends on the 'split_colls' parameter: * if is equal to 1, then we can wash the colls list down and search solely in the collection the user started from; * if is equal to 0, then we are splitting to the first level of collections, i.e. collections as they appear on the page we started to search from; The function raises exception InvenioWebSearchUnknownCollectionError if cc or one of c collections is not known. 
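
## Worked example of the behaviour described above (collection names are
## hypothetical): a user starts from 'Articles & Preprints' and deselects
## 'Preprints' under Narrow search:
#
#   colls_to_display -> ['Articles']   # reflects the user's narrowing
#   split_colls=1    -> search solely the collection the user started from
#   split_colls=0    -> split the search across the first-level son
#                       collections as they appear on the starting page
#
## Hosted (external) collections are additionally split off into their own
## list so they can be queried separately.
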
""" colls_out = [] colls_out_for_display = [] # list to hold the hosted collections to be searched and displayed hosted_colls_out = [] debug = "" if verbose: debug += "<br />" debug += "<br />1) --- initial parameters ---" debug += "<br />cc : %s" % cc debug += "<br />c : %s" % c debug += "<br />" # check what type is 'cc': if type(cc) is list: for ci in cc: if collection_reclist_cache.cache.has_key(ci): # yes this collection is real, so use it: cc = ci break else: # check once if cc is real: if not collection_reclist_cache.cache.has_key(cc): if cc: raise InvenioWebSearchUnknownCollectionError(cc) else: cc = CFG_SITE_NAME # cc is not set, so replace it with Home collection # check type of 'c' argument: if type(c) is list: colls = c else: colls = [c] if verbose: debug += "<br />2) --- after check for the integrity of cc and the being or not c a list ---" debug += "<br />cc : %s" % cc debug += "<br />c : %s" % c debug += "<br />" # remove all 'unreal' collections: colls_real = [] for coll in colls: if collection_reclist_cache.cache.has_key(coll): colls_real.append(coll) else: if coll: raise InvenioWebSearchUnknownCollectionError(coll) colls = colls_real if verbose: debug += "<br />3) --- keeping only the real colls of c ---" debug += "<br />colls : %s" % colls debug += "<br />" # check if some real collections remain: if len(colls)==0: colls = [cc] if verbose: debug += "<br />4) --- in case no colls were left we use cc directly ---" debug += "<br />colls : %s" % colls debug += "<br />" # then let us check the list of non-restricted "real" sons of 'cc' and compare it to 'coll': res = run_sql("""SELECT c.name FROM collection AS c, collection_collection AS cc, collection AS ccc WHERE c.id=cc.id_son AND cc.id_dad=ccc.id AND ccc.name=%s AND cc.type='r'""", (cc,)) # list that holds all the non restricted sons of cc that are also not hosted collections l_cc_nonrestricted_sons_and_nonhosted_colls = [] res_hosted = run_sql("""SELECT c.name FROM collection AS c, collection_collection AS cc, collection AS ccc WHERE c.id=cc.id_son AND cc.id_dad=ccc.id AND ccc.name=%s AND cc.type='r' AND (c.dbquery NOT LIKE 'hostedcollection:%%' OR c.dbquery IS NULL)""", (cc,)) for row_hosted in res_hosted: l_cc_nonrestricted_sons_and_nonhosted_colls.append(row_hosted[0]) l_cc_nonrestricted_sons_and_nonhosted_colls.sort() l_cc_nonrestricted_sons = [] l_c = colls for row in res: if not collection_restricted_p(row[0]): l_cc_nonrestricted_sons.append(row[0]) l_c.sort() l_cc_nonrestricted_sons.sort() if l_cc_nonrestricted_sons == l_c: colls_out_for_display = [cc] # yep, washing permitted, it is sufficient to display 'cc' # the following elif is a hack that preserves the above funcionality when we start searching from # the frontpage with some hosted collections deselected (either by default or manually) elif set(l_cc_nonrestricted_sons_and_nonhosted_colls).issubset(set(l_c)): colls_out_for_display = colls split_colls = 0 else: colls_out_for_display = colls # nope, we need to display all 'colls' successively # remove duplicates: #colls_out_for_display_nondups=filter(lambda x, colls_out_for_display=colls_out_for_display: colls_out_for_display[x-1] not in colls_out_for_display[x:], range(1, len(colls_out_for_display)+1)) #colls_out_for_display = map(lambda x, colls_out_for_display=colls_out_for_display:colls_out_for_display[x-1], colls_out_for_display_nondups) colls_out_for_display = list(set(colls_out_for_display)) if verbose: debug += "<br />5) --- decide whether colls_out_for_diplay should be colls or is it sufficient for it 
to be cc; remove duplicates ---" debug += "<br />colls_out_for_display : %s" % colls_out_for_display debug += "<br />" # FIXME: The below quoted part of the code has been commented out # because it prevents searching in individual restricted daughter # collections when both parent and all its public daughter # collections were asked for, in addition to some restricted # daughter collections. The removal was introduced for hosted # collections, so we may want to double check in this context. # the following piece of code takes care of removing collections whose ancestors are going to be searched anyway # list to hold the collections to be removed #colls_to_be_removed = [] # first calculate the collections that can safely be removed #for coll in colls_out_for_display: # for ancestor in get_coll_ancestors(coll): # #if ancestor in colls_out_for_display: colls_to_be_removed.append(coll) # if ancestor in colls_out_for_display and not is_hosted_collection(coll): colls_to_be_removed.append(coll) # secondly remove the collections #for coll in colls_to_be_removed: # colls_out_for_display.remove(coll) if verbose: debug += "<br />6) --- remove collections that have ancestors about to be search, unless they are hosted ---" debug += "<br />colls_out_for_display : %s" % colls_out_for_display debug += "<br />" # calculate the hosted collections to be searched. if colls_out_for_display == [cc]: if is_hosted_collection(cc): hosted_colls_out.append(cc) else: for coll in get_coll_sons(cc): if is_hosted_collection(coll): hosted_colls_out.append(coll) else: for coll in colls_out_for_display: if is_hosted_collection(coll): hosted_colls_out.append(coll) if verbose: debug += "<br />7) --- calculate the hosted_colls_out ---" debug += "<br />hosted_colls_out : %s" % hosted_colls_out debug += "<br />" # second, let us decide on collection splitting: if split_colls == 0: # type A - no sons are wanted colls_out = colls_out_for_display else: # type B - sons (first-level descendants) are wanted for coll in colls_out_for_display: coll_sons = get_coll_sons(coll) if coll_sons == []: colls_out.append(coll) else: for coll_son in coll_sons: if not is_hosted_collection(coll_son): colls_out.append(coll_son) #else: # colls_out = colls_out + coll_sons # remove duplicates: #colls_out_nondups=filter(lambda x, colls_out=colls_out: colls_out[x-1] not in colls_out[x:], range(1, len(colls_out)+1)) #colls_out = map(lambda x, colls_out=colls_out:colls_out[x-1], colls_out_nondups) colls_out = list(set(colls_out)) if verbose: debug += "<br />8) --- calculate the colls_out; remove duplicates ---" debug += "<br />colls_out : %s" % colls_out debug += "<br />" # remove the hosted collections from the collections to be searched if hosted_colls_out: for coll in hosted_colls_out: try: colls_out.remove(coll) except ValueError: # in case coll was not found in colls_out pass if verbose: debug += "<br />9) --- remove the hosted_colls from the colls_out ---" debug += "<br />colls_out : %s" % colls_out return (cc, colls_out_for_display, colls_out, hosted_colls_out, debug) def strip_accents(x): """Strip accents in the input phrase X (assumed in UTF-8) by replacing accented characters with their unaccented cousins (e.g. é by e). 
Return such a stripped X."""
    x = re_latex_lowercase_a.sub("a", x)
    x = re_latex_lowercase_ae.sub("ae", x)
    x = re_latex_lowercase_e.sub("e", x)
    x = re_latex_lowercase_i.sub("i", x)
    x = re_latex_lowercase_o.sub("o", x)
    x = re_latex_lowercase_u.sub("u", x)
    x = re_latex_lowercase_y.sub("y", x)
    x = re_latex_lowercase_c.sub("c", x)
    x = re_latex_lowercase_n.sub("n", x)
    x = re_latex_uppercase_a.sub("A", x)
    x = re_latex_uppercase_ae.sub("AE", x)
    x = re_latex_uppercase_e.sub("E", x)
    x = re_latex_uppercase_i.sub("I", x)
    x = re_latex_uppercase_o.sub("O", x)
    x = re_latex_uppercase_u.sub("U", x)
    x = re_latex_uppercase_y.sub("Y", x)
    x = re_latex_uppercase_c.sub("C", x)
    x = re_latex_uppercase_n.sub("N", x)
    # convert input into Unicode string:
    try:
        y = unicode(x, "utf-8")
    except:
        return x # something went wrong, probably the input wasn't UTF-8
    # asciify Latin-1 lowercase characters:
    y = re_unicode_lowercase_a.sub("a", y)
    y = re_unicode_lowercase_ae.sub("ae", y)
    y = re_unicode_lowercase_e.sub("e", y)
    y = re_unicode_lowercase_i.sub("i", y)
    y = re_unicode_lowercase_o.sub("o", y)
    y = re_unicode_lowercase_u.sub("u", y)
    y = re_unicode_lowercase_y.sub("y", y)
    y = re_unicode_lowercase_c.sub("c", y)
    y = re_unicode_lowercase_n.sub("n", y)
    # asciify Latin-1 uppercase characters:
    y = re_unicode_uppercase_a.sub("A", y)
    y = re_unicode_uppercase_ae.sub("AE", y)
    y = re_unicode_uppercase_e.sub("E", y)
    y = re_unicode_uppercase_i.sub("I", y)
    y = re_unicode_uppercase_o.sub("O", y)
    y = re_unicode_uppercase_u.sub("U", y)
    y = re_unicode_uppercase_y.sub("Y", y)
    y = re_unicode_uppercase_c.sub("C", y)
    y = re_unicode_uppercase_n.sub("N", y)
    # return UTF-8 representation of the Unicode string:
    return y.encode("utf-8")

def wash_index_term(term, max_char_length=50, lower_term=True):
    """
    Return washed form of the index term TERM that would be suitable for
    storing into idxWORD* tables.  I.e., lower the TERM if LOWER_TERM is
    True, and truncate it safely to MAX_CHAR_LENGTH UTF-8 characters
    (meaning, in principle, 4*MAX_CHAR_LENGTH bytes).

    The function works by an internal conversion of TERM, when needed,
    from its input Python UTF-8 binary string format into Python Unicode
    format, and then truncating it safely to the given number of UTF-8
    characters, without possible mis-truncation in the middle of a
    multi-byte UTF-8 character that could otherwise happen if we would
    have been working with the UTF-8 binary representation directly.

    Note that MAX_CHAR_LENGTH corresponds to the length of the term
    column in idxINDEX* tables.
    """
    if lower_term:
        washed_term = unicode(term, 'utf-8').lower()
    else:
        washed_term = unicode(term, 'utf-8')
    if len(washed_term) <= max_char_length:
        # no need to truncate the term, because it will fit
        # nicely even if it uses four-byte UTF-8 characters
        return washed_term.encode('utf-8')
    else:
        # truncate the term in a safe position:
        return washed_term[:max_char_length].encode('utf-8')

def lower_index_term(term):
    """
    Return safely lowered index term TERM.  This is done by converting to
    UTF-8 first, because the standard Python lower() function is not
    UTF-8 safe.  To be called by both the search engine and the indexer
    when appropriate (e.g. before stemming).

    In case of problems with UTF-8 compliance, this function raises
    UnicodeDecodeError, so the client code may want to catch it.
    """
    return unicode(term, 'utf-8').lower().encode('utf-8')

def get_synonym_terms(term, kbr_name, match_type):
    """
    Return list of synonyms for TERM by looking in KBR_NAME in MATCH_TYPE style.
@param term: search-time term or index-time term @type term: str @param kbr_name: knowledge base name @type kbr_name: str @param match_type: specifies how the term matches against the KBR before doing the lookup. Could be `exact' (default), 'leading_to_comma', `leading_to_number'. @type match_type: str @return: list of term synonyms @rtype: list of strings """ dterms = {} ## exact match is default: term_for_lookup = term term_remainder = '' ## but maybe match different term: if match_type == 'leading_to_comma': mmm = re.match(r'^(.*?)(\s*,.*)$', term) if mmm: term_for_lookup = mmm.group(1) term_remainder = mmm.group(2) elif match_type == 'leading_to_number': mmm = re.match(r'^(.*?)(\s*\d.*)$', term) if mmm: term_for_lookup = mmm.group(1) term_remainder = mmm.group(2) ## FIXME: workaround: escaping SQL wild-card signs, since KBR's ## exact search is doing LIKE query, so would match everything: term_for_lookup = term_for_lookup.replace('%', '\%') ## OK, now find synonyms: for kbr_values in get_kbr_values(kbr_name, searchkey=term_for_lookup, searchtype='e'): for kbr_value in kbr_values: dterms[kbr_value + term_remainder] = 1 ## return list of term synonyms: return dterms.keys() def wash_output_format(format): """Wash output format FORMAT. Currently only prevents input like 'of=9' for backwards-compatible format that prints certain fields only. (for this task, 'of=tm' is preferred)""" if str(format[0:3]).isdigit() and len(format) != 6: # asked to print MARC tags, but not enough digits, # so let's switch back to HTML brief default return 'hb' else: return format def wash_pattern(p): """Wash pattern passed by URL. Check for sanity of the wildcard by removing wildcards if they are appended to extremely short words (1-3 letters). TODO: instead of this approximative treatment, it will be much better to introduce a temporal limit, e.g. to kill a query if it does not finish in 10 seconds.""" # strip accents: # p = strip_accents(p) # FIXME: when available, strip accents all the time # add leading/trailing whitespace for the two following wildcard-sanity checking regexps: p = " " + p + " " # replace spaces within quotes by __SPACE__ temporarily: p = re_pattern_single_quotes.sub(lambda x: "'"+string.replace(x.group(1), ' ', '__SPACE__')+"'", p) p = re_pattern_double_quotes.sub(lambda x: "\""+string.replace(x.group(1), ' ', '__SPACE__')+"\"", p) p = re_pattern_regexp_quotes.sub(lambda x: "/"+string.replace(x.group(1), ' ', '__SPACE__')+"/", p) # get rid of unquoted wildcards after spaces: p = re_pattern_wildcards_after_spaces.sub("\\1", p) # get rid of extremely short words (1-3 letters with wildcards): #p = re_pattern_short_words.sub("\\1", p) # replace back __SPACE__ by spaces: p = re_pattern_space.sub(" ", p) # replace special terms: p = re_pattern_today.sub(time.strftime("%Y-%m-%d", time.localtime()), p) # remove unnecessary whitespace: p = string.strip(p) # remove potentially wrong UTF-8 characters: p = wash_for_utf8(p) return p def wash_field(f): """Wash field passed by URL.""" if f: # get rid of unnecessary whitespace and make it lowercase # (e.g. Author -> author) to better suit iPhone etc input # mode: f = f.strip().lower() # wash legacy 'f' field names, e.g. 
replace 'wau' or `au' by # 'author', if applicable: if CFG_WEBSEARCH_FIELDS_CONVERT: f = CFG_WEBSEARCH_FIELDS_CONVERT.get(f, f) return f def wash_dates(d1="", d1y=0, d1m=0, d1d=0, d2="", d2y=0, d2m=0, d2d=0): """ Take user-submitted date arguments D1 (full datetime string) or (D1Y, D1M, D1Y) year, month, day tuple and D2 or (D2Y, D2M, D2Y) and return (YYY1-M1-D2 H1:M1:S2, YYY2-M2-D2 H2:M2:S2) datetime strings in the YYYY-MM-DD HH:MM:SS format suitable for time restricted searching. Note that when both D1 and (D1Y, D1M, D1D) parameters are present, the precedence goes to D1. Ditto for D2*. Note that when (D1Y, D1M, D1D) are taken into account, some values may be missing and are completed e.g. to 01 or 12 according to whether it is the starting or the ending date. """ datetext1, datetext2 = "", "" # sanity checking: if d1 == "" and d1y == 0 and d1m == 0 and d1d == 0 and d2 == "" and d2y == 0 and d2m == 0 and d2d == 0: return ("", "") # nothing selected, so return empty values # wash first (starting) date: if d1: # full datetime string takes precedence: datetext1 = d1 else: # okay, first date passed as (year,month,day): if d1y: datetext1 += "%04d" % d1y else: datetext1 += "0000" if d1m: datetext1 += "-%02d" % d1m else: datetext1 += "-01" if d1d: datetext1 += "-%02d" % d1d else: datetext1 += "-01" datetext1 += " 00:00:00" # wash second (ending) date: if d2: # full datetime string takes precedence: datetext2 = d2 else: # okay, second date passed as (year,month,day): if d2y: datetext2 += "%04d" % d2y else: datetext2 += "9999" if d2m: datetext2 += "-%02d" % d2m else: datetext2 += "-12" if d2d: datetext2 += "-%02d" % d2d else: datetext2 += "-31" # NOTE: perhaps we should add max(datenumber) in # given month, but for our quering it's not # needed, 31 will always do datetext2 += " 00:00:00" # okay, return constructed YYYY-MM-DD HH:MM:SS datetexts: return (datetext1, datetext2) def is_hosted_collection(coll): """Check if the given collection is a hosted one; i.e. its dbquery starts with hostedcollection: Returns True if it is, False if it's not or if the result is empty or if the query failed""" res = run_sql("SELECT dbquery FROM collection WHERE name=%s", (coll, )) try: return res[0][0].startswith("hostedcollection:") except: return False def get_colID(c): "Return collection ID for collection name C. Return None if no match found." colID = None res = run_sql("SELECT id FROM collection WHERE name=%s", (c,), 1) if res: colID = res[0][0] return colID def get_coll_normalised_name(c): """Returns normalised collection name (case sensitive) for collection name C (case insensitive). Returns None if no match found.""" try: return run_sql("SELECT name FROM collection WHERE name=%s", (c,))[0][0] except: return None def get_coll_ancestors(coll): "Returns a list of ancestors for collection 'coll'." coll_ancestors = [] coll_ancestor = coll while 1: res = run_sql("""SELECT c.name FROM collection AS c LEFT JOIN collection_collection AS cc ON c.id=cc.id_dad LEFT JOIN collection AS ccc ON ccc.id=cc.id_son WHERE ccc.name=%s ORDER BY cc.id_dad ASC LIMIT 1""", (coll_ancestor,)) if res: coll_name = res[0][0] coll_ancestors.append(coll_name) coll_ancestor = coll_name else: break # ancestors found, return reversed list: coll_ancestors.reverse() return coll_ancestors def get_coll_sons(coll, type='r', public_only=1): """Return a list of sons (first-level descendants) of type 'type' for collection 'coll'. If public_only, then return only non-restricted son collections. 
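
## Illustrative, self-contained model of the decomposition performed by
## get_coll_real_descendants() below: sons carrying a dbquery are "real"
## and returned as-is, while compound sons without one are recursed into.
## The toy tree and dbquery map are hypothetical, not Invenio data:
#
# _toy_sons = {'Articles & Preprints': ['Articles', 'Preprints'],
#              'Articles': ['Published Articles', 'Drafts'],
#              'Preprints': [], 'Published Articles': [], 'Drafts': []}
# _toy_dbquery = {'Articles & Preprints': None, 'Articles': None,
#                 'Preprints': 'collection:PREPRINT',
#                 'Published Articles': 'collection:PUBART',
#                 'Drafts': 'collection:DRAFT'}
#
# def _toy_real_descendants(coll):
#     out = []
#     for son in _toy_sons.get(coll, []):
#         if _toy_dbquery.get(son):
#             out.append(son)                          # 'real': keep it
#         else:
#             out.extend(_toy_real_descendants(son))   # compound: recurse
#     return out
#
# _toy_real_descendants('Articles & Preprints')
# # -> ['Published Articles', 'Drafts', 'Preprints']
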
""" coll_sons = [] query = "SELECT c.name FROM collection AS c "\ "LEFT JOIN collection_collection AS cc ON c.id=cc.id_son "\ "LEFT JOIN collection AS ccc ON ccc.id=cc.id_dad "\ "WHERE cc.type=%s AND ccc.name=%s" query += " ORDER BY cc.score DESC" res = run_sql(query, (type, coll)) for name in res: if not public_only or not collection_restricted_p(name[0]): coll_sons.append(name[0]) return coll_sons def get_coll_real_descendants(coll, type='_', get_hosted_colls=True): """Return a list of all descendants of collection 'coll' that are defined by a 'dbquery'. IOW, we need to decompose compound collections like "A & B" into "A" and "B" provided that "A & B" has no associated database query defined. """ coll_sons = [] res = run_sql("""SELECT c.name,c.dbquery FROM collection AS c LEFT JOIN collection_collection AS cc ON c.id=cc.id_son LEFT JOIN collection AS ccc ON ccc.id=cc.id_dad WHERE ccc.name=%s AND cc.type LIKE %s ORDER BY cc.score DESC""", (coll, type,)) for name, dbquery in res: if dbquery: # this is 'real' collection, so return it: if get_hosted_colls: coll_sons.append(name) else: if not dbquery.startswith("hostedcollection:"): coll_sons.append(name) else: # this is 'composed' collection, so recurse: coll_sons.extend(get_coll_real_descendants(name)) return coll_sons def browse_pattern(req, colls, p, f, rg, ln=CFG_SITE_LANG): """Browse either biliographic phrases or words indexes, and display it.""" # load the right message language _ = gettext_set_language(ln) ## is p enclosed in quotes? (coming from exact search) if p.startswith('"') and p.endswith('"'): p = p[1:-1] p_orig = p ## okay, "real browse" follows: ## FIXME: the maths in the get_nearest_terms_in_bibxxx is just a test if not f and string.find(p, ":") > 0: # does 'p' contain ':'? f, p = string.split(p, ":", 1) ## do we search in words indexes? if not f: return browse_in_bibwords(req, p, f) index_id = get_index_id_from_field(f) if index_id != 0: - coll = HitSet() + coll = intbitset() for coll_name in colls: coll |= get_collection_reclist(coll_name) browsed_phrases_in_colls = get_nearest_terms_in_idxphrase_with_collection(p, index_id, rg/2, rg/2, coll) else: browsed_phrases = get_nearest_terms_in_bibxxx(p, f, (rg+1)/2+1, (rg-1)/2+1) while not browsed_phrases: # try again and again with shorter and shorter pattern: try: p = p[:-1] browsed_phrases = get_nearest_terms_in_bibxxx(p, f, (rg+1)/2+1, (rg-1)/2+1) except: # probably there are no hits at all: req.write(_("No values found.")) return ## try to check hits in these particular collection selection: browsed_phrases_in_colls = [] if 0: for phrase in browsed_phrases: - phrase_hitset = HitSet() + phrase_hitset = intbitset() phrase_hitsets = search_pattern("", phrase, f, 'e') for coll in colls: phrase_hitset.union_update(phrase_hitsets[coll]) if len(phrase_hitset) > 0: # okay, this phrase has some hits in colls, so add it: browsed_phrases_in_colls.append([phrase, len(phrase_hitset)]) ## were there hits in collections? if browsed_phrases_in_colls == []: if browsed_phrases != []: #print_warning(req, """<p>No match close to <em>%s</em> found in given collections. 
#Please try different term.<p>Displaying matches in any collection...""" % p_orig) ## try to get nbhits for these phrases in any collection: for phrase in browsed_phrases: browsed_phrases_in_colls.append([phrase, get_nbhits_in_bibxxx(phrase, f)]) ## display results now: out = websearch_templates.tmpl_browse_pattern( f=f, fn=get_field_i18nname(get_field_name(f) or f, ln, False), ln=ln, browsed_phrases_in_colls=browsed_phrases_in_colls, colls=colls, rg=rg, ) req.write(out) return def browse_in_bibwords(req, p, f, ln=CFG_SITE_LANG): """Browse inside words indexes.""" if not p: return _ = gettext_set_language(ln) urlargd = {} urlargd.update(req.argd) urlargd['action'] = 'search' nearest_box = create_nearest_terms_box(urlargd, p, f, 'w', ln=ln, intro_text_p=0) req.write(websearch_templates.tmpl_search_in_bibwords( p = p, f = f, ln = ln, nearest_box = nearest_box )) return def search_pattern(req=None, p=None, f=None, m=None, ap=0, of="id", verbose=0, ln=CFG_SITE_LANG, display_nearest_terms_box=True, wl=0): """Search for complex pattern 'p' within field 'f' according to matching type 'm'. Return hitset of recIDs. The function uses multi-stage searching algorithm in case of no exact match found. See the Search Internals document for detailed description. The 'ap' argument governs whether an alternative patterns are to be used in case there is no direct hit for (p,f,m). For example, whether to replace non-alphanumeric characters by spaces if it would give some hits. See the Search Internals document for detailed description. (ap=0 forbits the alternative pattern usage, ap=1 permits it.) The 'of' argument governs whether to print or not some information to the user in case of no match found. (Usually it prints the information in case of HTML formats, otherwise it's silent). The 'verbose' argument controls the level of debugging information to be printed (0=least, 9=most). All the parameters are assumed to have been previously washed. This function is suitable as a mid-level API. """ _ = gettext_set_language(ln) - hitset_empty = HitSet() + hitset_empty = intbitset() # sanity check: if not p: - hitset_full = HitSet(trailing_bits=1) + hitset_full = intbitset(trailing_bits=1) hitset_full.discard(0) # no pattern, so return all universe return hitset_full # search stage 1: break up arguments into basic search units: if verbose and of.startswith("h"): t1 = os.times()[4] basic_search_units = create_basic_search_units(req, p, f, m, of) if verbose and of.startswith("h"): t2 = os.times()[4] print_warning(req, "Search stage 1: basic search units are: %s" % cgi.escape(repr(basic_search_units))) print_warning(req, "Search stage 1: execution took %.2f seconds." % (t2 - t1)) # search stage 2: do search for each search unit and verify hit presence: if verbose and of.startswith("h"): t1 = os.times()[4] basic_search_units_hitsets = [] #prepare hiddenfield-related.. 
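
## (worked example, tag values hypothetical) If CFG_BIBFORMAT_HIDDEN_TAGS
## contains '595' and a user lacking the 'runbibedit' authorization issues
## the query 595__a:"internal note", the guard below silently replaces the
## unit's hitset with an empty intbitset, so hidden MARC fields cannot be
## probed through search; privileged users (can_see_hidden) bypass it.
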
myhiddens = CFG_BIBFORMAT_HIDDEN_TAGS can_see_hidden = False if req: user_info = collect_user_info(req) can_see_hidden = (acc_authorize_action(user_info, 'runbibedit')[0] == 0) if can_see_hidden: myhiddens = [] if CFG_INSPIRE_SITE and of.startswith('h'): # fulltext/caption search warnings for INSPIRE: fields_to_be_searched = [f for o,p,f,m in basic_search_units] if 'fulltext' in fields_to_be_searched: print_warning(req, _("Warning: full-text search is only available for a subset of papers mostly from 2006-2011.")) elif 'caption' in fields_to_be_searched: print_warning(req, _("Warning: figure caption search is only available for a subset of papers mostly from 2008-2011.")) for idx_unit in xrange(len(basic_search_units)): bsu_o, bsu_p, bsu_f, bsu_m = basic_search_units[idx_unit] if bsu_f and len(bsu_f) < 2: if of.startswith("h"): print_warning(req, _("There is no index %s. Searching for %s in all fields." % (bsu_f, bsu_p))) bsu_f = '' bsu_m = 'w' if of.startswith("h") and verbose: print_warning(req, _('Instead searching %s.' % str([bsu_o, bsu_p, bsu_f, bsu_m]))) try: basic_search_unit_hitset = search_unit(bsu_p, bsu_f, bsu_m, wl) except InvenioWebSearchWildcardLimitError, excp: basic_search_unit_hitset = excp.res if of.startswith("h"): print_warning(req, _("Search term too generic, displaying only partial results...")) # FIXME: print warning if we use native full-text indexing if bsu_f == 'fulltext' and bsu_m != 'w' and of.startswith('h') and not CFG_SOLR_URL: print_warning(req, _("No phrase index available for fulltext yet, looking for word combination...")) #check that the user is allowed to search with this tag #if he/she tries it if bsu_f and len(bsu_f) > 1 and bsu_f[0].isdigit() and bsu_f[1].isdigit(): for htag in myhiddens: ltag = len(htag) samelenfield = bsu_f[0:ltag] if samelenfield == htag: #user searches by a hidden tag #we won't show you anything.. - basic_search_unit_hitset = HitSet() + basic_search_unit_hitset = intbitset() if verbose >= 9 and of.startswith("h"): print_warning(req, "Pattern %s hitlist omitted since \ it queries in a hidden tag %s" % (repr(bsu_p), repr(myhiddens))) display_nearest_terms_box=False #..and stop spying, too. if verbose >= 9 and of.startswith("h"): print_warning(req, "Search stage 1: pattern %s gave hitlist %s" % (cgi.escape(bsu_p), basic_search_unit_hitset)) if len(basic_search_unit_hitset) > 0 or \ ap==0 or \ bsu_o=="|" or \ ((idx_unit+1)<len(basic_search_units) and basic_search_units[idx_unit+1][0]=="|"): # stage 2-1: this basic search unit is retained, since # either the hitset is non-empty, or the approximate # pattern treatment is switched off, or the search unit # was joined by an OR operator to preceding/following # units so we do not require that it exists basic_search_units_hitsets.append(basic_search_unit_hitset) else: # stage 2-2: no hits found for this search unit, try to replace non-alphanumeric chars inside pattern: if re.search(r'[^a-zA-Z0-9\s\:]', bsu_p) and bsu_f != 'refersto' and bsu_f != 'citedby': if bsu_p.startswith('"') and bsu_p.endswith('"'): # is it ACC query? 
bsu_pn = re.sub(r'[^a-zA-Z0-9\s\:]+', "*", bsu_p) else: # it is WRD query bsu_pn = re.sub(r'[^a-zA-Z0-9\s\:]+', " ", bsu_p) if verbose and of.startswith('h') and req: print_warning(req, "Trying (%s,%s,%s)" % (cgi.escape(bsu_pn), cgi.escape(bsu_f), cgi.escape(bsu_m))) basic_search_unit_hitset = search_pattern(req=None, p=bsu_pn, f=bsu_f, m=bsu_m, of="id", ln=ln, wl=wl) if len(basic_search_unit_hitset) > 0: # we retain the new unit instead if of.startswith('h'): print_warning(req, _("No exact match found for %(x_query1)s, using %(x_query2)s instead...") % \ {'x_query1': "<em>" + cgi.escape(bsu_p) + "</em>", 'x_query2': "<em>" + cgi.escape(bsu_pn) + "</em>"}) basic_search_units[idx_unit][1] = bsu_pn basic_search_units_hitsets.append(basic_search_unit_hitset) else: # stage 2-3: no hits found either, propose nearest indexed terms: if of.startswith('h') and display_nearest_terms_box: if req: if bsu_f == "recid": print_warning(req, _("Requested record does not seem to exist.")) else: print_warning(req, create_nearest_terms_box(req.argd, bsu_p, bsu_f, bsu_m, ln=ln)) return hitset_empty else: # stage 2-3: no hits found either, propose nearest indexed terms: if of.startswith('h') and display_nearest_terms_box: if req: if bsu_f == "recid": print_warning(req, _("Requested record does not seem to exist.")) else: print_warning(req, create_nearest_terms_box(req.argd, bsu_p, bsu_f, bsu_m, ln=ln)) return hitset_empty if verbose and of.startswith("h"): t2 = os.times()[4] for idx_unit in range(0, len(basic_search_units)): print_warning(req, "Search stage 2: basic search unit %s gave %d hits." % (basic_search_units[idx_unit][1:], len(basic_search_units_hitsets[idx_unit]))) print_warning(req, "Search stage 2: execution took %.2f seconds." % (t2 - t1)) # search stage 3: apply boolean query for each search unit: if verbose and of.startswith("h"): t1 = os.times()[4] # let the initial set be the complete universe: - hitset_in_any_collection = HitSet(trailing_bits=1) + hitset_in_any_collection = intbitset(trailing_bits=1) hitset_in_any_collection.discard(0) for idx_unit in xrange(len(basic_search_units)): this_unit_operation = basic_search_units[idx_unit][0] this_unit_hitset = basic_search_units_hitsets[idx_unit] if this_unit_operation == '+': hitset_in_any_collection.intersection_update(this_unit_hitset) elif this_unit_operation == '-': hitset_in_any_collection.difference_update(this_unit_hitset) elif this_unit_operation == '|': hitset_in_any_collection.union_update(this_unit_hitset) else: if of.startswith("h"): print_warning(req, "Invalid set operation %s." % cgi.escape(this_unit_operation), "Error") if len(hitset_in_any_collection) == 0: # no hits found, propose alternative boolean query: if of.startswith('h') and display_nearest_terms_box: nearestterms = [] for idx_unit in range(0, len(basic_search_units)): bsu_o, bsu_p, bsu_f, bsu_m = basic_search_units[idx_unit] if bsu_p.startswith("%") and bsu_p.endswith("%"): bsu_p = "'" + bsu_p[1:-1] + "'" bsu_nbhits = len(basic_search_units_hitsets[idx_unit]) # create a similar query, but with the basic search unit only argd = {} argd.update(req.argd) argd['p'] = bsu_p argd['f'] = bsu_f nearestterms.append((bsu_p, bsu_nbhits, argd)) text = websearch_templates.tmpl_search_no_boolean_hits( ln=ln, nearestterms=nearestterms) print_warning(req, text) if verbose and of.startswith("h"): t2 = os.times()[4] print_warning(req, "Search stage 3: boolean query gave %d hits." % len(hitset_in_any_collection)) print_warning(req, "Search stage 3: execution took %.2f seconds." 
% (t2 - t1)) return hitset_in_any_collection def search_pattern_parenthesised(req=None, p=None, f=None, m=None, ap=0, of="id", verbose=0, ln=CFG_SITE_LANG, display_nearest_terms_box=True, wl=0): """Search for complex pattern 'p' containing parentheses within field 'f' according to matching type 'm'. Return hitset of recIDs. For more details on the parameters see 'search_pattern'. """ _ = gettext_set_language(ln) spires_syntax_converter = SpiresToInvenioSyntaxConverter() spires_syntax_query = False # if the pattern uses SPIRES search syntax, convert it to Invenio syntax if spires_syntax_converter.is_applicable(p): spires_syntax_query = True p = spires_syntax_converter.convert_query(p) # sanity check: do not call parenthesised parser for search terms # like U(1): if not re_pattern_parens.search(p): return search_pattern(req, p, f, m, ap, of, verbose, ln, display_nearest_terms_box=display_nearest_terms_box, wl=wl) # Try searching with parentheses try: parser = SearchQueryParenthesisedParser() # get a hitset with all recids - result_hitset = HitSet(trailing_bits=1) + result_hitset = intbitset(trailing_bits=1) # parse the query. The result is a list of [op1, expr1, op2, expr2, ..., opN, exprN] parsing_result = parser.parse_query(p) if verbose and of.startswith("h"): print_warning(req, "Search stage 1: search_pattern_parenthesised() searched %s." % repr(p)) print_warning(req, "Search stage 1: search_pattern_parenthesised() returned %s." % repr(parsing_result)) # go through every pattern # calculate hitset for it # combine pattern's hitset with the result using the corresponding operator for index in xrange(0, len(parsing_result)-1, 2): current_operator = parsing_result[index] current_pattern = parsing_result[index+1] if CFG_INSPIRE_SITE and spires_syntax_query: # setting ap=0 to turn off approximate matching for 0 results. # Doesn't work well in combinations. # FIXME: The right fix involves collecting statuses for each # hitset, then showing a nearest terms box exactly once, # outside this loop. ap = 0 display_nearest_terms_box=False # obtain a hitset for the current pattern current_hitset = search_pattern(req, current_pattern, f, m, ap, of, verbose, ln, display_nearest_terms_box=display_nearest_terms_box, wl=wl) # combine the current hitset with resulting hitset using the current operator if current_operator == '+': result_hitset = result_hitset & current_hitset elif current_operator == '-': result_hitset = result_hitset - current_hitset elif current_operator == '|': result_hitset = result_hitset | current_hitset else: assert False, "Unknown operator in search_pattern_parenthesised()" return result_hitset # If searching with parentheses fails, perform search ignoring parentheses except SyntaxError: print_warning(req, _("Search syntax misunderstood. Ignoring all parentheses in the query. If this doesn't help, please check your search and try again.")) # remove the parentheses in the query. The current implementation removes all the parentheses, # but it could be improved to remove only those that are not inside quotes p = p.replace('(', ' ') p = p.replace(')', ' ') return search_pattern(req, p, f, m, ap, of, verbose, ln, display_nearest_terms_box=display_nearest_terms_box, wl=wl) def search_unit(p, f=None, m=None, wl=0): """Search for a basic search unit defined by pattern 'p' and field 'f' and matching type 'm'. Return hitset of recIDs. All the parameters are assumed to have been previously washed.
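The set algebra used by stage 3 above and by the parenthesised combine loop is plain intbitset arithmetic; here it is in isolation, with invented record IDs (the import path is the one Invenio itself uses):

    from invenio.intbitset import intbitset

    universe = intbitset(trailing_bits=1)    # "all records": infinite tail of 1s
    universe.discard(0)                      # recID 0 is never assigned
    hits_a = intbitset([1, 2, 3, 5, 8])      # invented hits for unit A
    hits_b = intbitset([2, 3, 13])           # invented hits for unit B

    result = universe & hits_a               # '+' : intersection
    result = result - hits_b                 # '-' : difference
    result = result | intbitset([21])        # '|' : union
    print list(result)                       # -> [1, 5, 8, 21]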
'p' is assumed to be already a ``basic search unit'' so that it is searched as such and is not broken up in any way. Only wildcard and span queries are being detected inside 'p'. If CFG_WEBSEARCH_SYNONYM_KBRS is set and we are searching in one of the indexes that has defined runtime synonym knowledge base, then look up there and automatically enrich search results with results for synonyms. In case the wildcard limit (wl) is greater than 0 and this limit is reached an InvenioWebSearchWildcardLimitError will be raised. In case you want to call this function with no limit for the wildcard queries, wl should be 0. This function is suitable as a low-level API. """ ## create empty output results set: - hitset = HitSet() + hitset = intbitset() if not p: # sanity checking return hitset ## eventually look up runtime synonyms: - hitset_synonyms = HitSet() + hitset_synonyms = intbitset() if CFG_WEBSEARCH_SYNONYM_KBRS.has_key(f): for p_synonym in get_synonym_terms(p, CFG_WEBSEARCH_SYNONYM_KBRS[f][0], CFG_WEBSEARCH_SYNONYM_KBRS[f][1]): if p_synonym != p: hitset_synonyms |= search_unit(p_synonym, f, m, wl) ## look up hits: if CFG_SOLR_URL and f == 'fulltext': # redirect to Solr/Lucene return search_unit_in_solr(p, f, m) if f == 'datecreated': hitset = search_unit_in_bibrec(p, p, 'c') elif f == 'datemodified': hitset = search_unit_in_bibrec(p, p, 'm') elif f == 'refersto': # we are doing search by the citation count hitset = search_unit_refersto(p) elif f == 'citedby': # we are doing search by the citation count hitset = search_unit_citedby(p) elif m == 'a' or m == 'r': # we are doing either phrase search or regexp search if f == 'fulltext': # FIXME: workaround for not having phrase index yet return search_pattern(None, p, f, 'w') index_id = get_index_id_from_field(f) if index_id != 0: hitset = search_unit_in_idxphrases(p, f, m, wl) else: hitset = search_unit_in_bibxxx(p, f, m, wl) elif p.startswith("cited:"): # we are doing search by the citation count hitset = search_unit_by_times_cited(p[6:]) else: # we are doing bibwords search by default hitset = search_unit_in_bibwords(p, f, m, wl=wl) ## merge synonym results and return total: hitset |= hitset_synonyms return hitset def search_unit_in_bibwords(word, f, m=None, decompress=zlib.decompress, wl=0): """Searches for 'word' inside bibwordsX table for field 'f' and returns hitset of recIDs.""" - set = HitSet() # will hold output result set + set = intbitset() # will hold output result set set_used = 0 # not-yet-used flag, to be able to circumvent set operations limit_reached = 0 # flag for knowing if the query limit has been reached # deduce into which bibwordsX table we will search: stemming_language = get_index_stemming_language(get_index_id_from_field("anyfield")) bibwordsX = "idxWORD%02dF" % get_index_id_from_field("anyfield") if f: index_id = get_index_id_from_field(f) if index_id: bibwordsX = "idxWORD%02dF" % index_id stemming_language = get_index_stemming_language(index_id) else: - return HitSet() # word index f does not exist + return intbitset() # word index f does not exist # wash 'word' argument and run query: word = string.replace(word, '*', '%') # we now use '*' as the truncation character words = string.split(word, "->", 1) # check for span query if len(words) == 2: word0 = re_word.sub('', words[0]) word1 = re_word.sub('', words[1]) if stemming_language: word0 = lower_index_term(word0) word1 = lower_index_term(word1) word0 = stem(word0, stemming_language) word1 = stem(word1, stemming_language) try: res = run_sql_with_limit("SELECT 
term,hitlist FROM %s WHERE term BETWEEN %%s AND %%s" % bibwordsX, (wash_index_term(word0), wash_index_term(word1)), wildcard_limit = wl) except InvenioDbQueryWildcardLimitError, excp: res = excp.res limit_reached = 1 # set the limit reached flag to true else: if f == 'journal': pass # FIXME: quick hack for the journal index else: word = re_word.sub('', word) if stemming_language: word = lower_index_term(word) word = stem(word, stemming_language) if string.find(word, '%') >= 0: # do we have wildcard in the word? if f == 'journal': # FIXME: quick hack for the journal index # FIXME: we can run a sanity check here for all indexes res = () else: try: res = run_sql_with_limit("SELECT term,hitlist FROM %s WHERE term LIKE %%s" % bibwordsX, (wash_index_term(word),), wildcard_limit = wl) except InvenioDbQueryWildcardLimitError, excp: res = excp.res limit_reached = 1 # set the limit reached flag to true else: res = run_sql("SELECT term,hitlist FROM %s WHERE term=%%s" % bibwordsX, (wash_index_term(word),)) # fill the result set: for word, hitlist in res: - hitset_bibwrd = HitSet(hitlist) + hitset_bibwrd = intbitset(hitlist) # add the results: if set_used: set.union_update(hitset_bibwrd) else: set = hitset_bibwrd set_used = 1 #check to see if the query limit was reached if limit_reached: #raise an exception, so we can print a nice message to the user raise InvenioWebSearchWildcardLimitError(set) # okay, return result set: return set def search_unit_in_idxphrases(p, f, type, wl=0): """Searches for phrase 'p' inside idxPHRASE*F table for field 'f' and returns hitset of recIDs found. The search type is defined by 'type' (e.g. equals to 'r' for a regexp search).""" - set = HitSet() # will hold output result set + set = intbitset() # will hold output result set set_used = 0 # not-yet-used flag, to be able to circumvent set operations limit_reached = 0 # flag for knowing if the query limit has been reached use_query_limit = False # flag for knowing if to limit the query results or not # deduce in which idxPHRASE table we will search: idxphraseX = "idxPHRASE%02dF" % get_index_id_from_field("anyfield") if f: index_id = get_index_id_from_field(f) if index_id: idxphraseX = "idxPHRASE%02dF" % index_id else: - return HitSet() # phrase index f does not exist + return intbitset() # phrase index f does not exist # detect query type (exact phrase, partial phrase, regexp): if type == 'r': query_addons = "REGEXP %s" query_params = (p,) use_query_limit = True else: p = string.replace(p, '*', '%') # we now use '*' as the truncation character ps = string.split(p, "->", 1) # check for span query: if len(ps) == 2 and not (ps[0].endswith(' ') or ps[1].startswith(' ')): query_addons = "BETWEEN %s AND %s" query_params = (ps[0], ps[1]) use_query_limit = True else: if string.find(p, '%') > -1: query_addons = "LIKE %s" query_params = (p,) use_query_limit = True else: query_addons = "= %s" query_params = (p,) # special washing for fuzzy author index: if f in ('author', 'firstauthor', 'exactauthor', 'exactfirstauthor'): query_params_washed = () for query_param in query_params: query_params_washed += (wash_author_name(query_param),) query_params = query_params_washed # perform search: if use_query_limit: try: res = run_sql_with_limit("SELECT term,hitlist FROM %s WHERE term %s" % (idxphraseX, query_addons), query_params, wildcard_limit=wl) except InvenioDbQueryWildcardLimitError, excp: res = excp.res limit_reached = 1 # set the limit reached flag to true else: res = run_sql("SELECT term,hitlist FROM %s WHERE term %s" % (idxphraseX, 
query_addons), query_params) # fill the result set: for word, hitlist in res: - hitset_bibphrase = HitSet(hitlist) + hitset_bibphrase = intbitset(hitlist) # add the results: if set_used: set.union_update(hitset_bibphrase) else: set = hitset_bibphrase set_used = 1 #check to see if the query limit was reached if limit_reached: #raise an exception, so we can print a nice message to the user raise InvenioWebSearchWildcardLimitError(set) # okay, return result set: return set def search_unit_in_bibxxx(p, f, type, wl=0): """Searches for pattern 'p' inside bibxxx tables for field 'f' and returns hitset of recIDs found. The search type is defined by 'type' (e.g. equals to 'r' for a regexp search).""" # FIXME: quick hack for the journal index if f == 'journal': return search_unit_in_bibwords(p, f, wl=wl) p_orig = p # saving for eventual future 'no match' reporting limit_reached = 0 # flag for knowing if the query limit has been reached use_query_limit = False # flag for knowing if to limit the query results or not query_addons = "" # will hold additional SQL code for the query query_params = () # will hold parameters for the query (their number may vary depending on TYPE argument) # wash arguments: f = string.replace(f, '*', '%') # replace truncation char '*' in field definition if type == 'r': query_addons = "REGEXP %s" query_params = (p,) use_query_limit = True else: p = string.replace(p, '*', '%') # we now use '*' as the truncation character ps = string.split(p, "->", 1) # check for span query: if len(ps) == 2 and not (ps[0].endswith(' ') or ps[1].startswith(' ')): query_addons = "BETWEEN %s AND %s" query_params = (ps[0], ps[1]) use_query_limit = True else: if string.find(p, '%') > -1: query_addons = "LIKE %s" query_params = (p,) use_query_limit = True else: query_addons = "= %s" query_params = (p,) # construct 'tl' which defines the tag list (MARC tags) to search in: tl = [] if len(f) >= 2 and str(f[0]).isdigit() and str(f[1]).isdigit(): tl.append(f) # 'f' seems to be okay as it starts by two digits else: # deduce desired MARC tags on the basis of chosen 'f' tl = get_field_tags(f) if not tl: # f index does not exist, nevermind pass # okay, start search: l = [] # will hold list of recID that matched for t in tl: # deduce into which bibxxx table we will search: digit1, digit2 = int(t[0]), int(t[1]) bx = "bib%d%dx" % (digit1, digit2) bibx = "bibrec_bib%d%dx" % (digit1, digit2) # construct and run query: if t == "001": if query_addons.find('BETWEEN') > -1 or query_addons.find('=') > -1: # verify that the params are integers (to avoid returning record 123 when searching for 123foo) try: query_params = tuple(int(param) for param in query_params) except ValueError: - return HitSet() + return intbitset() if use_query_limit: try: res = run_sql_with_limit("SELECT id FROM bibrec WHERE id %s" % query_addons, query_params, wildcard_limit=wl) except InvenioDbQueryWildcardLimitError, excp: res = excp.res limit_reached = 1 # set the limit reached flag to true else: res = run_sql("SELECT id FROM bibrec WHERE id %s" % query_addons, query_params) else: query = "SELECT bibx.id_bibrec FROM %s AS bx LEFT JOIN %s AS bibx ON bx.id=bibx.id_bibxxx WHERE bx.value %s" % \ (bx, bibx, query_addons) if len(t) != 6 or t[-1:]=='%': # wildcard query, or only the beginning of field 't' # is defined, so add wildcard character: query += " AND bx.tag LIKE %s" query_params_and_tag = query_params + (t + '%',) else: # exact query for 't': query += " AND bx.tag=%s" query_params_and_tag = query_params + (t,) if use_query_limit: try: res 
= run_sql_with_limit(query, query_params_and_tag, wildcard_limit=wl) except InvenioDbQueryWildcardLimitError, excp: res = excp.res limit_reached = 1 # set the limit reached flag to true else: res = run_sql(query, query_params_and_tag) # fill the result set: for id_bibrec in res: if id_bibrec[0]: l.append(id_bibrec[0]) # check no of hits found: nb_hits = len(l) # okay, return result set: - set = HitSet(l) + set = intbitset(l) #check to see if the query limit was reached if limit_reached: #raise an exception, so we can print a nice message to the user raise InvenioWebSearchWildcardLimitError(set) return set def search_unit_in_solr(p, f=None, m=None): """ Query the Solr full-text index and return an intbitset corresponding to the result. Parameters (p,f,m) are the usual search unit ones. """ if m and (m == 'a' or m == 'r'): # phrase/regexp query if p.startswith('%') and p.endswith('%'): p = p[1:-1] # fix for partial phrase p = '"' + p + '"' return solr_get_bitset(p, CFG_SOLR_URL) def search_unit_in_bibrec(datetext1, datetext2, type='c'): """ Return hitset of recIDs found that were either created or modified (according to 'type' arg being 'c' or 'm') from datetext1 until datetext2, inclusive. Does not pay attention to pattern, collection, anything. Useful to intersect later on with the 'real' query. """ - set = HitSet() + set = intbitset() if type.startswith("m"): type = "modification_date" else: type = "creation_date" # by default we are searching for creation dates parts = datetext1.split('->') if len(parts) > 1 and datetext1 == datetext2: datetext1 = parts[0] datetext2 = parts[1] if datetext1 == datetext2: res = run_sql("SELECT id FROM bibrec WHERE %s LIKE %%s" % (type,), (datetext1 + '%',)) else: res = run_sql("SELECT id FROM bibrec WHERE %s>=%%s AND %s<=%%s" % (type, type), (datetext1, datetext2)) for row in res: set += row[0] return set def search_unit_by_times_cited(p): """ Return hitset of recIDs found that are cited P times. Usually P looks like '10->23'. """ numstr = '"'+p+'"' #this is sort of stupid but since we may need to #get the records that do _not_ have cites, we have to #know the ids of all records, too #but this is needed only if p is 0, starts with "0->" or ends with "->0" allrecs = [] if p == 0 or p == "0" or \ p.startswith("0->") or p.endswith("->0"): - allrecs = HitSet(run_sql("SELECT id FROM bibrec")) + allrecs = intbitset(run_sql("SELECT id FROM bibrec")) return get_records_with_num_cites(numstr, allrecs) def search_unit_refersto(query): """ Search for records satisfying the query (e.g. author:ellis) and return the hitset of records referred to by these records. """ if query: ahitset = search_pattern(p=query) if ahitset: return get_refersto_hitset(ahitset) else: - return HitSet([]) + return intbitset([]) else: - return HitSet([]) + return intbitset([]) def search_unit_citedby(query): """ Search for records satisfying the query (e.g. author:ellis) and return the hitset of records cited by these records.
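search_unit_by_times_cited() delegates the span arithmetic to get_records_with_num_cites(); the hypothetical helper below only illustrates what a span such as '10->23' means, using an invented recID-to-citation-count mapping:

    from invenio.intbitset import intbitset

    def toy_records_cited_n_times(numstr, citation_counts):
        """numstr is e.g. '"10->23"' or '"5"'; citation_counts maps recID -> count."""
        numstr = numstr.strip('"')
        if '->' in numstr:
            lo, hi = [int(x) for x in numstr.split('->', 1)]
        else:
            lo = hi = int(numstr)
        return intbitset([recid for recid, n in citation_counts.iteritems()
                          if lo <= n <= hi])

    print list(toy_records_cited_n_times('"10->23"', {1: 4, 2: 10, 3: 23, 4: 99}))
    # -> [2, 3]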
""" if query: ahitset = search_pattern(p=query) if ahitset: return get_citedby_hitset(ahitset) else: - return HitSet([]) + return intbitset([]) else: - return HitSet([]) + return intbitset([]) def intersect_results_with_collrecs(req, hitset_in_any_collection, colls, ap=0, of="hb", verbose=0, ln=CFG_SITE_LANG, display_nearest_terms_box=True): """Return dict of hitsets given by intersection of hitset with the collection universes.""" _ = gettext_set_language(ln) # search stage 4: intersect with the collection universe: if verbose and of.startswith("h"): t1 = os.times()[4] results = {} results_nbhits = 0 for coll in colls: results[coll] = hitset_in_any_collection & get_collection_reclist(coll) results_nbhits += len(results[coll]) if results_nbhits == 0: # no hits found, try to search in Home: results_in_Home = hitset_in_any_collection & get_collection_reclist(CFG_SITE_NAME) if len(results_in_Home) > 0: # some hits found in Home, so propose this search: if of.startswith("h") and display_nearest_terms_box: url = websearch_templates.build_search_url(req.argd, cc=CFG_SITE_NAME, c=[]) print_warning(req, _("No match found in collection %(x_collection)s. Other public collections gave %(x_url_open)s%(x_nb_hits)d hits%(x_url_close)s.") %\ {'x_collection': '<em>' + string.join([get_coll_i18nname(coll, ln, False) for coll in colls], ', ') + '</em>', 'x_url_open': '<a class="nearestterms" href="%s">' % (url), 'x_nb_hits': len(results_in_Home), 'x_url_close': '</a>'}) results = {} else: # no hits found in Home, recommend different search terms: if of.startswith("h") and display_nearest_terms_box: print_warning(req, _("No public collection matched your query. " "If you were looking for a non-public document, please choose " "the desired restricted collection first.")) results = {} if verbose and of.startswith("h"): t2 = os.times()[4] print_warning(req, "Search stage 4: intersecting with collection universe gave %d hits." % results_nbhits) print_warning(req, "Search stage 4: execution took %.2f seconds." % (t2 - t1)) return results def intersect_results_with_hitset(req, results, hitset, ap=0, aptext="", of="hb"): """Return intersection of search 'results' (a dict of hitsets with collection as key) with the 'hitset', i.e. apply 'hitset' intersection to each collection within search 'results'. If the final 'results' set is to be empty, and 'ap' (approximate pattern) is true, and then print the `warningtext' and return the original 'results' set unchanged. If 'ap' is false, then return empty results set. """ if ap: results_ap = copy.deepcopy(results) else: results_ap = {} # will return empty dict in case of no hits found nb_total = 0 for coll in results.keys(): results[coll].intersection_update(hitset) nb_total += len(results[coll]) if nb_total == 0: if of.startswith("h"): print_warning(req, aptext) results = results_ap return results def create_similarly_named_authors_link_box(author_name, ln=CFG_SITE_LANG): """Return a box similar to ``Not satisfied...'' one by proposing author searches for similar names. Namely, take AUTHOR_NAME and the first initial of the firstame (after comma) and look into author index whether authors with e.g. middle names exist. Useful mainly for CERN Library that sometimes contains name forms like Ellis-N, Ellis-Nick, Ellis-Nicolas all denoting the same person. The box isn't proposed if no similarly named authors are found to exist. 
""" # return nothing if not configured: if CFG_WEBSEARCH_CREATE_SIMILARLY_NAMED_AUTHORS_LINK_BOX == 0: return "" # return empty box if there is no initial: if re.match(r'[^ ,]+, [^ ]', author_name) is None: return "" # firstly find name comma initial: author_name_to_search = re.sub(r'^([^ ,]+, +[^ ,]).*$', '\\1', author_name) # secondly search for similar name forms: similar_author_names = {} for name in author_name_to_search, strip_accents(author_name_to_search): for tag in get_field_tags("author"): # deduce into which bibxxx table we will search: digit1, digit2 = int(tag[0]), int(tag[1]) bx = "bib%d%dx" % (digit1, digit2) bibx = "bibrec_bib%d%dx" % (digit1, digit2) if len(tag) != 6 or tag[-1:]=='%': # only the beginning of field 't' is defined, so add wildcard character: res = run_sql("""SELECT bx.value FROM %s AS bx WHERE bx.value LIKE %%s AND bx.tag LIKE %%s""" % bx, (name + "%", tag + "%")) else: res = run_sql("""SELECT bx.value FROM %s AS bx WHERE bx.value LIKE %%s AND bx.tag=%%s""" % bx, (name + "%", tag)) for row in res: similar_author_names[row[0]] = 1 # remove the original name and sort the list: try: del similar_author_names[author_name] except KeyError: pass # thirdly print the box: out = "" if similar_author_names: out_authors = similar_author_names.keys() out_authors.sort() tmp_authors = [] for out_author in out_authors: nbhits = get_nbhits_in_bibxxx(out_author, "author") if nbhits: tmp_authors.append((out_author, nbhits)) out += websearch_templates.tmpl_similar_author_names( authors=tmp_authors, ln=ln) return out def create_nearest_terms_box(urlargd, p, f, t='w', n=5, ln=CFG_SITE_LANG, intro_text_p=True): """Return text box containing list of 'n' nearest terms above/below 'p' for the field 'f' for matching type 't' (words/phrases) in language 'ln'. Propose new searches according to `urlargs' with the new words. If `intro_text_p' is true, then display the introductory message, otherwise print only the nearest terms in the box content. """ # load the right message language _ = gettext_set_language(ln) out = "" nearest_terms = [] if not p: # sanity check p = "." 
if p.startswith('%') and p.endswith('%'): p = p[1:-1] # fix for partial phrase index_id = get_index_id_from_field(f) if f == 'fulltext': if CFG_SOLR_URL: return _("No match found, please enter different search terms.") else: # FIXME: workaround for not having native phrase index yet t = 'w' # special indexes: if f == 'refersto': return _("There are no records referring to %s.") % cgi.escape(p) if f == 'citedby': return _("There are no records cited by %s.") % cgi.escape(p) # look for nearest terms: if t == 'w': nearest_terms = get_nearest_terms_in_bibwords(p, f, n, n) if not nearest_terms: return _("No word index is available for %s.") % \ ('<em>' + cgi.escape(get_field_i18nname(get_field_name(f) or f, ln, False)) + '</em>') else: nearest_terms = [] if index_id: nearest_terms = get_nearest_terms_in_idxphrase(p, index_id, n, n) if f == 'datecreated' or f == 'datemodified': nearest_terms = get_nearest_terms_in_bibrec(p, f, n, n) if not nearest_terms: nearest_terms = get_nearest_terms_in_bibxxx(p, f, n, n) if not nearest_terms: return _("No phrase index is available for %s.") % \ ('<em>' + cgi.escape(get_field_i18nname(get_field_name(f) or f, ln, False)) + '</em>') terminfo = [] for term in nearest_terms: if t == 'w': hits = get_nbhits_in_bibwords(term, f) else: if index_id: hits = get_nbhits_in_idxphrases(term, f) elif f == 'datecreated' or f == 'datemodified': hits = get_nbhits_in_bibrec(term, f) else: hits = get_nbhits_in_bibxxx(term, f) argd = {} argd.update(urlargd) # check which fields contained the requested parameter, and replace it. for (px, fx) in ('p', 'f'), ('p1', 'f1'), ('p2', 'f2'), ('p3', 'f3'): if px in argd: argd_px = argd[px] if t == 'w': # p was stripped of accents, to do the same: argd_px = strip_accents(argd_px) if f == argd[fx] or f == "anyfield" or f == "": if string.find(argd_px, p) > -1: argd[px] = string.replace(argd_px, p, term) break else: if string.find(argd_px, f+':'+p) > -1: if string.find(term.strip(), ' ') > -1: term = '"' + term + '"' argd[px] = string.replace(argd_px, f+':'+p, f+':'+term) break elif string.find(argd_px, f+':"'+p+'"') > -1: argd[px] = string.replace(argd_px, f+':"'+p+'"', f+':"'+term+'"') break elif string.find(argd_px, f+':\''+p+'\'') > -1: argd[px] = string.replace(argd_px, f+':\''+p+'\'', f+':\''+term+'\'') break terminfo.append((term, hits, argd)) intro = "" if intro_text_p: # add full leading introductory text if f: intro = _("Search term %(x_term)s inside index %(x_index)s did not match any record. Nearest terms in any collection are:") % \ {'x_term': "<em>" + cgi.escape(p.startswith("%") and p.endswith("%") and p[1:-1] or p) + "</em>", 'x_index': "<em>" + cgi.escape(get_field_i18nname(get_field_name(f) or f, ln, False)) + "</em>"} else: intro = _("Search term %s did not match any record. 
Nearest terms in any collection are:") % \ ("<em>" + cgi.escape(p.startswith("%") and p.endswith("%") and p[1:-1] or p) + "</em>") return websearch_templates.tmpl_nearest_term_box(p=p, ln=ln, f=f, terminfo=terminfo, intro=intro) def get_nearest_terms_in_bibwords(p, f, n_below, n_above): """Return list of +n -n nearest terms to word `p' in index for field `f'.""" nearest_words = [] # will hold the (sorted) list of nearest words to return # deduce into which bibwordsX table we will search: bibwordsX = "idxWORD%02dF" % get_index_id_from_field("anyfield") if f: index_id = get_index_id_from_field(f) if index_id: bibwordsX = "idxWORD%02dF" % index_id else: return nearest_words # firstly try to get `n' closest words above `p': res = run_sql("SELECT term FROM %s WHERE term<%%s ORDER BY term DESC LIMIT %%s" % bibwordsX, (p, n_above)) for row in res: nearest_words.append(row[0]) nearest_words.reverse() # secondly insert given word `p': nearest_words.append(p) # finally try to get `n' closest words below `p': res = run_sql("SELECT term FROM %s WHERE term>%%s ORDER BY term ASC LIMIT %%s" % bibwordsX, (p, n_below)) for row in res: nearest_words.append(row[0]) return nearest_words def get_nearest_terms_in_idxphrase(p, index_id, n_below, n_above): """Browse (-n_above, +n_below) closest bibliographic phrases for the given pattern p in the given field idxPHRASE table, regardless of collection. Return list of [phrase1, phrase2, ... , phrase_n].""" if CFG_INSPIRE_SITE and index_id in (3, 15): # FIXME: workaround due to new fuzzy index return [p,] idxphraseX = "idxPHRASE%02dF" % index_id res_above = run_sql("SELECT term FROM %s WHERE term<%%s ORDER BY term DESC LIMIT %%s" % idxphraseX, (p, n_above)) res_above = map(lambda x: x[0], res_above) res_above.reverse() res_below = run_sql("SELECT term FROM %s WHERE term>=%%s ORDER BY term ASC LIMIT %%s" % idxphraseX, (p, n_below)) res_below = map(lambda x: x[0], res_below) return res_above + res_below def get_nearest_terms_in_idxphrase_with_collection(p, index_id, n_below, n_above, collection): """Browse (-n_above, +n_below) closest bibliographic phrases for the given pattern p in the given field idxPHRASE table, - considering the collection (HitSet). + considering the collection (intbitset). Return list of [(phrase1, hitset), (phrase2, hitset), ... , (phrase_n, hitset)].""" idxphraseX = "idxPHRASE%02dF" % index_id res_above = run_sql("SELECT term,hitlist FROM %s WHERE term<%%s ORDER BY term DESC LIMIT %%s" % idxphraseX, (p, n_above * 3)) - res_above = [(term, HitSet(hitlist) & collection) for term, hitlist in res_above] + res_above = [(term, intbitset(hitlist) & collection) for term, hitlist in res_above] res_above = [(term, len(hitlist)) for term, hitlist in res_above if hitlist] res_below = run_sql("SELECT term,hitlist FROM %s WHERE term>=%%s ORDER BY term ASC LIMIT %%s" % idxphraseX, (p, n_below * 3)) - res_below = [(term, HitSet(hitlist) & collection) for term, hitlist in res_below] + res_below = [(term, intbitset(hitlist) & collection) for term, hitlist in res_below] res_below = [(term, len(hitlist)) for term, hitlist in res_below if hitlist] res_above.reverse() return res_above[-n_above:] + res_below[:n_below] def get_nearest_terms_in_bibxxx(p, f, n_below, n_above): """Browse (-n_above, +n_below) closest bibliographic phrases for the given pattern p in the given field f, regardless of collection. Return list of [phrase1, phrase2, ... , phrase_n].""" ## determine browse field: if not f and string.find(p, ":") > 0: # does 'p' contain ':'? 
f, p = string.split(p, ":", 1) # FIXME: quick hack for the journal index if f == 'journal': return get_nearest_terms_in_bibwords(p, f, n_below, n_above) ## We are going to take max(n_below, n_above) as the number of ## values to ferch from bibXXx. This is needed to work around ## MySQL UTF-8 sorting troubles in 4.0.x. Proper solution is to ## use MySQL 4.1.x or our own idxPHRASE in the future. index_id = get_index_id_from_field(f) if index_id: return get_nearest_terms_in_idxphrase(p, index_id, n_below, n_above) n_fetch = 2*max(n_below, n_above) ## construct 'tl' which defines the tag list (MARC tags) to search in: tl = [] if str(f[0]).isdigit() and str(f[1]).isdigit(): tl.append(f) # 'f' seems to be okay as it starts by two digits else: # deduce desired MARC tags on the basis of chosen 'f' tl = get_field_tags(f) ## start browsing to fetch list of hits: browsed_phrases = {} # will hold {phrase1: 1, phrase2: 1, ..., phraseN: 1} dict of browsed phrases (to make them unique) # always add self to the results set: browsed_phrases[p.startswith("%") and p.endswith("%") and p[1:-1] or p] = 1 for t in tl: # deduce into which bibxxx table we will search: digit1, digit2 = int(t[0]), int(t[1]) bx = "bib%d%dx" % (digit1, digit2) bibx = "bibrec_bib%d%dx" % (digit1, digit2) # firstly try to get `n' closest phrases above `p': if len(t) != 6 or t[-1:]=='%': # only the beginning of field 't' is defined, so add wildcard character: res = run_sql("""SELECT bx.value FROM %s AS bx WHERE bx.value<%%s AND bx.tag LIKE %%s ORDER BY bx.value DESC LIMIT %%s""" % bx, (p, t + "%", n_fetch)) else: res = run_sql("""SELECT bx.value FROM %s AS bx WHERE bx.value<%%s AND bx.tag=%%s ORDER BY bx.value DESC LIMIT %%s""" % bx, (p, t, n_fetch)) for row in res: browsed_phrases[row[0]] = 1 # secondly try to get `n' closest phrases equal to or below `p': if len(t) != 6 or t[-1:]=='%': # only the beginning of field 't' is defined, so add wildcard character: res = run_sql("""SELECT bx.value FROM %s AS bx WHERE bx.value>=%%s AND bx.tag LIKE %%s ORDER BY bx.value ASC LIMIT %%s""" % bx, (p, t + "%", n_fetch)) else: res = run_sql("""SELECT bx.value FROM %s AS bx WHERE bx.value>=%%s AND bx.tag=%%s ORDER BY bx.value ASC LIMIT %%s""" % bx, (p, t, n_fetch)) for row in res: browsed_phrases[row[0]] = 1 # select first n words only: (this is needed as we were searching # in many different tables and so aren't sure we have more than n # words right; this of course won't be needed when we shall have # one ACC table only for given field): phrases_out = browsed_phrases.keys() phrases_out.sort(lambda x, y: cmp(string.lower(strip_accents(x)), string.lower(strip_accents(y)))) # find position of self: try: idx_p = phrases_out.index(p) except: idx_p = len(phrases_out)/2 # return n_above and n_below: return phrases_out[max(0, idx_p-n_above):idx_p+n_below] def get_nearest_terms_in_bibrec(p, f, n_below, n_above): """Return list of nearest terms and counts from bibrec table. p is usually a date, and f either datecreated or datemodified. Note: below/above count is very approximative, not really respected. 
""" col = 'creation_date' if f == 'datemodified': col = 'modification_date' res_above = run_sql("""SELECT DATE_FORMAT(%s,'%%%%Y-%%%%m-%%%%d %%%%H:%%%%i:%%%%s') FROM bibrec WHERE %s < %%s ORDER BY %s DESC LIMIT %%s""" % (col, col, col), (p, n_above)) res_below = run_sql("""SELECT DATE_FORMAT(%s,'%%%%Y-%%%%m-%%%%d %%%%H:%%%%i:%%%%s') FROM bibrec WHERE %s > %%s ORDER BY %s ASC LIMIT %%s""" % (col, col, col), (p, n_below)) out = set([]) for row in res_above: out.add(row[0]) for row in res_below: out.add(row[0]) out_list = list(out) out_list.sort() return list(out_list) def get_nbhits_in_bibrec(term, f): """Return number of hits in bibrec table. term is usually a date, and f is either 'datecreated' or 'datemodified'.""" col = 'creation_date' if f == 'datemodified': col = 'modification_date' res = run_sql("SELECT COUNT(*) FROM bibrec WHERE %s LIKE %%s" % (col,), (term + '%',)) return res[0][0] def get_nbhits_in_bibwords(word, f): """Return number of hits for word 'word' inside words index for field 'f'.""" out = 0 # deduce into which bibwordsX table we will search: bibwordsX = "idxWORD%02dF" % get_index_id_from_field("anyfield") if f: index_id = get_index_id_from_field(f) if index_id: bibwordsX = "idxWORD%02dF" % index_id else: return 0 if word: res = run_sql("SELECT hitlist FROM %s WHERE term=%%s" % bibwordsX, (word,)) for hitlist in res: - out += len(HitSet(hitlist[0])) + out += len(intbitset(hitlist[0])) return out def get_nbhits_in_idxphrases(word, f): """Return number of hits for word 'word' inside phrase index for field 'f'.""" out = 0 # deduce into which bibwordsX table we will search: idxphraseX = "idxPHRASE%02dF" % get_index_id_from_field("anyfield") if f: index_id = get_index_id_from_field(f) if index_id: idxphraseX = "idxPHRASE%02dF" % index_id else: return 0 if word: res = run_sql("SELECT hitlist FROM %s WHERE term=%%s" % idxphraseX, (word,)) for hitlist in res: - out += len(HitSet(hitlist[0])) + out += len(intbitset(hitlist[0])) return out def get_nbhits_in_bibxxx(p, f): """Return number of hits for word 'word' inside words index for field 'f'.""" ## determine browse field: if not f and string.find(p, ":") > 0: # does 'p' contain ':'? f, p = string.split(p, ":", 1) # FIXME: quick hack for the journal index if f == 'journal': return get_nbhits_in_bibwords(p, f) ## construct 'tl' which defines the tag list (MARC tags) to search in: tl = [] if str(f[0]).isdigit() and str(f[1]).isdigit(): tl.append(f) # 'f' seems to be okay as it starts by two digits else: # deduce desired MARC tags on the basis of chosen 'f' tl = get_field_tags(f) # start searching: recIDs = {} # will hold dict of {recID1: 1, recID2: 1, ..., } (unique recIDs, therefore) for t in tl: # deduce into which bibxxx table we will search: digit1, digit2 = int(t[0]), int(t[1]) bx = "bib%d%dx" % (digit1, digit2) bibx = "bibrec_bib%d%dx" % (digit1, digit2) if len(t) != 6 or t[-1:]=='%': # only the beginning of field 't' is defined, so add wildcard character: res = run_sql("""SELECT bibx.id_bibrec FROM %s AS bibx, %s AS bx WHERE bx.value=%%s AND bx.tag LIKE %%s AND bibx.id_bibxxx=bx.id""" % (bibx, bx), (p, t + "%")) else: res = run_sql("""SELECT bibx.id_bibrec FROM %s AS bibx, %s AS bx WHERE bx.value=%%s AND bx.tag=%%s AND bibx.id_bibxxx=bx.id""" % (bibx, bx), (p, t)) for row in res: recIDs[row[0]] = 1 return len(recIDs) def get_mysql_recid_from_aleph_sysno(sysno): """Returns DB's recID for ALEPH sysno passed in the argument (e.g. "002379334CER"). 
Returns None in case of failure.""" out = None res = run_sql("""SELECT bb.id_bibrec FROM bibrec_bib97x AS bb, bib97x AS b WHERE b.value=%s AND b.tag='970__a' AND bb.id_bibxxx=b.id""", (sysno,)) if res: out = res[0][0] return out def guess_primary_collection_of_a_record(recID): """Return primary collection name a record recid belongs to, by testing 980 identifier. May lead to bad guesses when a collection is defined dynamically via dbquery. In that case, return 'CFG_SITE_NAME'.""" out = CFG_SITE_NAME dbcollids = get_fieldvalues(recID, "980__a") if dbcollids: for dbcollid in dbcollids: dbquery = "collection:" + dbcollid res = run_sql("SELECT name FROM collection WHERE dbquery=%s", (dbquery,)) if res: out = res[0][0] break if CFG_CERN_SITE: # dirty hack for ATLAS collections at CERN: if out in ('ATLAS Communications', 'ATLAS Internal Notes'): for alternative_collection in ('ATLAS Communications Physics', 'ATLAS Communications General', 'ATLAS Internal Notes Physics', 'ATLAS Internal Notes General',): if recID in get_collection_reclist(alternative_collection): out = alternative_collection break return out _re_collection_url = re.compile('/collection/(.+)') def guess_collection_of_a_record(recID, referer=None, recreate_cache_if_needed=True): """Return collection name a record recid belongs to, by first testing the referer URL if provided and otherwise returning the primary collection.""" if referer: dummy, hostname, path, dummy, query, dummy = urlparse.urlparse(referer) #requests can come from different invenio installations, with different collections if CFG_SITE_URL.find(hostname) < 0: return guess_primary_collection_of_a_record(recID) g = _re_collection_url.match(path) if g: name = urllib.unquote_plus(g.group(1)) #check if this collection actually exist (also normalize the name if case-insensitive) name = get_coll_normalised_name(name) if name and recID in get_collection_reclist(name): return name elif path.startswith('/search'): if recreate_cache_if_needed: collection_reclist_cache.recreate_cache_if_needed() query = cgi.parse_qs(query) for name in query.get('cc', []) + query.get('c', []): name = get_coll_normalised_name(name) if name and recID in get_collection_reclist(name, recreate_cache_if_needed=False): return name return guess_primary_collection_of_a_record(recID) def is_record_in_any_collection(recID, recreate_cache_if_needed=True): """Return True if the record belongs to at least one collection. This is a good, although not perfect, indicator to guess if webcoll has already run after this record has been entered into the system. """ if recreate_cache_if_needed: collection_reclist_cache.recreate_cache_if_needed() for name in collection_reclist_cache.cache.keys(): if recID in get_collection_reclist(name, recreate_cache_if_needed=False): return True return False def get_all_collections_of_a_record(recID, recreate_cache_if_needed=True): """Return all the collection names a record belongs to. Note this function is O(n_collections).""" ret = [] if recreate_cache_if_needed: collection_reclist_cache.recreate_cache_if_needed() for name in collection_reclist_cache.cache.keys(): if recID in get_collection_reclist(name, recreate_cache_if_needed=False): ret.append(name) return ret def get_tag_name(tag_value, prolog="", epilog=""): """Return tag name from the known tag value, by looking up the 'tag' table. Return empty string in case of failure. 
Example: input='100__%', output='first author'.""" out = "" res = run_sql("SELECT name FROM tag WHERE value=%s", (tag_value,)) if res: out = prolog + res[0][0] + epilog return out def get_fieldcodes(): """Returns a list of field codes that may have been passed as 'search options' in URL. Example: output=['subject','division'].""" out = [] res = run_sql("SELECT DISTINCT(code) FROM field") for row in res: out.append(row[0]) return out def get_field_name(code): """Return the corresponding field_name given the field code. e.g. reportnumber -> report number.""" res = run_sql("SELECT name FROM field WHERE code=%s", (code, )) if res: return res[0][0] else: return "" def get_field_tags(field): """Returns a list of MARC tags for the field code 'field'. Returns empty list in case of error. Example: field='author', output=['100__%','700__%'].""" out = [] query = """SELECT t.value FROM tag AS t, field_tag AS ft, field AS f WHERE f.code=%s AND ft.id_field=f.id AND t.id=ft.id_tag ORDER BY ft.score DESC""" res = run_sql(query, (field, )) for val in res: out.append(val[0]) return out def get_fieldvalues_alephseq_like(recID, tags_in, can_see_hidden=False): """Return buffer of ALEPH sequential-like textual format with fields found in the list TAGS_IN for record RECID. If can_see_hidden is True, just print everything. Otherwise hide fields from CFG_BIBFORMAT_HIDDEN_TAGS. """ out = "" if type(tags_in) is not list: tags_in = [tags_in,] if len(tags_in) == 1 and len(tags_in[0]) == 6: ## case A: one concrete subfield asked, so print its value if found ## (use with care: can mislead if field has multiple occurrences) out += string.join(get_fieldvalues(recID, tags_in[0]),"\n") else: ## case B: print our "text MARC" format; works safely all the time # find out which tags to output: dict_of_tags_out = {} if not tags_in: for i in range(0, 10): for j in range(0, 10): dict_of_tags_out["%d%d%%" % (i, j)] = 1 else: for tag in tags_in: if len(tag) == 0: for i in range(0, 10): for j in range(0, 10): dict_of_tags_out["%d%d%%" % (i, j)] = 1 elif len(tag) == 1: for j in range(0, 10): dict_of_tags_out["%s%d%%" % (tag, j)] = 1 elif len(tag) < 5: dict_of_tags_out["%s%%" % tag] = 1 elif len(tag) >= 5: dict_of_tags_out[tag[0:5]] = 1 tags_out = dict_of_tags_out.keys() tags_out.sort() # search all bibXXx tables as needed: for tag in tags_out: digits = tag[0:2] try: intdigits = int(digits) if intdigits < 0 or intdigits > 99: raise ValueError except ValueError: # invalid tag value asked for continue if tag.startswith("001") or tag.startswith("00%"): if out: out += "\n" out += "%09d %s %d" % (recID, "001__", recID) bx = "bib%sx" % digits bibx = "bibrec_bib%sx" % digits query = "SELECT b.tag,b.value,bb.field_number FROM %s AS b, %s AS bb "\ "WHERE bb.id_bibrec=%%s AND b.id=bb.id_bibxxx AND b.tag LIKE %%s "\ "ORDER BY bb.field_number, b.tag ASC" % (bx, bibx) res = run_sql(query, (recID, str(tag)+'%')) # go through fields: field_number_old = -999 field_old = "" for row in res: field, value, field_number = row[0], row[1], row[2] ind1, ind2 = field[3], field[4] printme = True #check the stuff in hiddenfields if not can_see_hidden: for htag in CFG_BIBFORMAT_HIDDEN_TAGS: ltag = len(htag) samelenfield = field[0:ltag] if samelenfield == htag: printme = False if ind1 == "_": ind1 = "" if ind2 == "_": ind2 = "" # print field tag if printme: if field_number != field_number_old or field[:-1] != field_old[:-1]: if out: out += "\n" out += "%09d %s " % (recID, field[:5]) field_number_old = field_number field_old = field # print subfield value if field[0:2] ==
"00" and field[-1:] == "_": out += value else: out += "$$%s%s" % (field[-1:], value) return out def record_exists(recID): """Return 1 if record RECID exists. Return 0 if it doesn't exist. Return -1 if it exists but is marked as deleted. """ out = 0 res = run_sql("SELECT id FROM bibrec WHERE id=%s", (recID,), 1) if res: try: # if recid is '123foo', mysql will return id=123, and we don't want that recID = int(recID) except ValueError: return 0 # record exists; now check whether it isn't marked as deleted: dbcollids = get_fieldvalues(recID, "980__%") if ("DELETED" in dbcollids) or (CFG_CERN_SITE and "DUMMY" in dbcollids): out = -1 # exists, but marked as deleted else: out = 1 # exists fine return out def record_empty(recID): """ Is this record empty, e.g. has only 001, waiting for integration? @param recID: the record identifier. @type recID: int @return: 1 if the record is empty, 0 otherwise. @rtype: int """ record = get_record(recID) if record is None or len(record) < 2: return 1 else: return 0 def record_public_p(recID, recreate_cache_if_needed=True): """Return 1 if the record is public, i.e. if it can be found in the Home collection. Return 0 otherwise. """ return recID in get_collection_reclist(CFG_SITE_NAME, recreate_cache_if_needed=recreate_cache_if_needed) def get_creation_date(recID, fmt="%Y-%m-%d"): "Returns the creation date of the record 'recID'." out = "" res = run_sql("SELECT DATE_FORMAT(creation_date,%s) FROM bibrec WHERE id=%s", (fmt, recID), 1) if res: out = res[0][0] return out def get_modification_date(recID, fmt="%Y-%m-%d"): "Returns the date of last modification for the record 'recID'." out = "" res = run_sql("SELECT DATE_FORMAT(modification_date,%s) FROM bibrec WHERE id=%s", (fmt, recID), 1) if res: out = res[0][0] return out def print_warning(req, msg, msg_type='', prologue='<br />', epilogue='<br />'): "Prints warning message and flushes output." if req and msg: req.write(websearch_templates.tmpl_print_warning( msg = msg, type = msg_type, prologue = prologue, epilogue = epilogue, )) return def print_search_info(p, f, sf, so, sp, rm, of, ot, collection=CFG_SITE_NAME, nb_found=-1, jrec=1, rg=10, aas=0, ln=CFG_SITE_LANG, p1="", p2="", p3="", f1="", f2="", f3="", m1="", m2="", m3="", op1="", op2="", sc=1, pl_in_url="", d1y=0, d1m=0, d1d=0, d2y=0, d2m=0, d2d=0, dt="", cpu_time=-1, middle_only=0): """Prints stripe with the information on 'collection' and 'nb_found' results and CPU time. Also, prints navigation links (beg/next/prev/end) inside the results set. If middle_only is set to 1, it will only print the middle box information (beg/netx/prev/end/etc) links. 
This is suitable for displaying navigation links at the bottom of the search results page.""" # sanity check: if jrec < 1: jrec = 1 if jrec > nb_found: jrec = max(nb_found-rg+1, 1) return websearch_templates.tmpl_print_search_info( ln = ln, collection = collection, aas = aas, collection_name = get_coll_i18nname(collection, ln, False), collection_id = get_colID(collection), middle_only = middle_only, rg = rg, nb_found = nb_found, sf = sf, so = so, rm = rm, of = of, ot = ot, p = p, f = f, p1 = p1, p2 = p2, p3 = p3, f1 = f1, f2 = f2, f3 = f3, m1 = m1, m2 = m2, m3 = m3, op1 = op1, op2 = op2, pl_in_url = pl_in_url, d1y = d1y, d1m = d1m, d1d = d1d, d2y = d2y, d2m = d2m, d2d = d2d, dt = dt, jrec = jrec, sc = sc, sp = sp, all_fieldcodes = get_fieldcodes(), cpu_time = cpu_time, ) def print_hosted_search_info(p, f, sf, so, sp, rm, of, ot, collection=CFG_SITE_NAME, nb_found=-1, jrec=1, rg=10, aas=0, ln=CFG_SITE_LANG, p1="", p2="", p3="", f1="", f2="", f3="", m1="", m2="", m3="", op1="", op2="", sc=1, pl_in_url="", d1y=0, d1m=0, d1d=0, d2y=0, d2m=0, d2d=0, dt="", cpu_time=-1, middle_only=0): """Prints a stripe with the information on 'collection' and 'nb_found' results and CPU time. Also, prints navigation links (beg/next/prev/end) inside the results set. If middle_only is set to 1, it will only print the middle box information (beg/next/prev/end/etc) links. This is suitable for displaying navigation links at the bottom of the search results page.""" out = "" # sanity check: if jrec < 1: jrec = 1 if jrec > nb_found: jrec = max(nb_found-rg+1, 1) return websearch_templates.tmpl_print_hosted_search_info( ln = ln, collection = collection, aas = aas, collection_name = get_coll_i18nname(collection, ln, False), collection_id = get_colID(collection), middle_only = middle_only, rg = rg, nb_found = nb_found, sf = sf, so = so, rm = rm, of = of, ot = ot, p = p, f = f, p1 = p1, p2 = p2, p3 = p3, f1 = f1, f2 = f2, f3 = f3, m1 = m1, m2 = m2, m3 = m3, op1 = op1, op2 = op2, pl_in_url = pl_in_url, d1y = d1y, d1m = d1m, d1d = d1d, d2y = d2y, d2m = d2m, d2d = d2d, dt = dt, jrec = jrec, sc = sc, sp = sp, all_fieldcodes = get_fieldcodes(), cpu_time = cpu_time, ) def print_results_overview(colls, results_final_nb_total, results_final_nb, cpu_time, ln=CFG_SITE_LANG, ec=[], hosted_colls_potential_results_p=False): """Prints results overview box with links to particular collections below.""" out = "" new_colls = [] for coll in colls: new_colls.append({ 'id': get_colID(coll), 'code': coll, 'name': get_coll_i18nname(coll, ln, False), }) return websearch_templates.tmpl_print_results_overview( ln = ln, results_final_nb_total = results_final_nb_total, results_final_nb = results_final_nb, cpu_time = cpu_time, colls = new_colls, ec = ec, hosted_colls_potential_results_p = hosted_colls_potential_results_p, ) def print_hosted_results(url_and_engine, ln=CFG_SITE_LANG, of=None, req=None, no_records_found=False, search_timed_out=False, limit=CFG_EXTERNAL_COLLECTION_MAXRESULTS): """Prints the full results of a hosted collection""" if of.startswith("h"): if no_records_found: return "<br />No results found." if search_timed_out: return "<br />The search engine did not respond in time." return websearch_templates.tmpl_print_hosted_results( url_and_engine=url_and_engine, ln=ln, of=of, req=req, limit=limit ) def sort_records(req, recIDs, sort_field='', sort_order='d', sort_pattern='', verbose=0, of='hb', ln=CFG_SITE_LANG): """Sort records in 'recIDs' list according to sort field 'sort_field' in order 'sort_order'.
If more than one instance of 'sort_field' is found for a given record, try to choose the one given by 'sort_pattern', for example "sort by report number that starts with CERN-PS". Note that 'sort_field' can be a field code like 'author' or a MARC tag like '100__a' directly.""" _ = gettext_set_language(ln) ## check arguments: if not sort_field: return recIDs if len(recIDs) > CFG_WEBSEARCH_NB_RECORDS_TO_SORT: if of.startswith('h'): print_warning(req, _("Sorry, sorting is allowed on sets of up to %d records only. Using default sort order.") % CFG_WEBSEARCH_NB_RECORDS_TO_SORT, "Warning") return recIDs sort_fields = string.split(sort_field, ",") recIDs_dict = {} recIDs_out = [] ## first deduce sorting MARC tag out of the 'sort_field' argument: tags = [] for sort_field in sort_fields: if sort_field and str(sort_field[0:2]).isdigit(): # sort_field starts with two digits, so this is probably a MARC tag already tags.append(sort_field) else: # let us check the 'field' table query = """SELECT DISTINCT(t.value) FROM tag AS t, field_tag AS ft, field AS f WHERE f.code=%s AND ft.id_field=f.id AND t.id=ft.id_tag ORDER BY ft.score DESC""" res = run_sql(query, (sort_field, )) if res: for row in res: tags.append(row[0]) else: if of.startswith('h'): print_warning(req, _("Sorry, %s does not seem to be a valid sort option. Choosing title sort instead.") % cgi.escape(sort_field), "Error") tags.append("245__a") if verbose >= 3: print_warning(req, "Sorting by tags %s." % cgi.escape(repr(tags))) if sort_pattern: print_warning(req, "Sorting preferentially by %s." % cgi.escape(sort_pattern)) ## check if we have sorting tag defined: if tags: # fetch the necessary field values: for recID in recIDs: val = "" # will hold the value according to which recID will sort vals = [] # will hold all values found in sorting tag for recID for tag in tags: if CFG_CERN_SITE and tag == '773__c': # CERN hack: journal sorting # 773__c contains page numbers, e.g. 3-13, and we want to sort by 3, and numerically: vals.extend(["%050s" % x.split("-",1)[0] for x in get_fieldvalues(recID, tag)]) else: vals.extend(get_fieldvalues(recID, tag)) if sort_pattern: # try to pick the tag value that corresponds to the sort pattern bingo = 0 for v in vals: if v.lower().startswith(sort_pattern.lower()): # bingo! bingo = 1 val = v break if not bingo: # sort_pattern not present, so add other vals after spaces val = sort_pattern + " " + string.join(vals) else: # no sort pattern defined, so join them all together val = string.join(vals) val = strip_accents(val.lower()) # sort values regardless of accents and case if recIDs_dict.has_key(val): recIDs_dict[val].append(recID) else: recIDs_dict[val] = [recID] # sort them: recIDs_dict_keys = recIDs_dict.keys() recIDs_dict_keys.sort() # now that keys are sorted, create output array: for k in recIDs_dict_keys: for s in recIDs_dict[k]: recIDs_out.append(s) # ascending or descending? if sort_order == 'a': recIDs_out.reverse() # okay, we are done return recIDs_out else: # good, no sort needed return recIDs def print_records(req, recIDs, jrec=1, rg=10, format='hb', ot='', ln=CFG_SITE_LANG, relevances=[], relevances_prologue="(", relevances_epilogue="%%)", decompress=zlib.decompress, search_pattern='', print_records_prologue_p=True, print_records_epilogue_p=True, verbose=0, tab='', sf='', so='d', sp='', rm=''): """ Prints list of records 'recIDs' formatted according to 'format' in groups of 'rg' starting from 'jrec'. Assumes that the input list 'recIDs' is sorted in reverse order, so it counts records from tail to head.
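The heart of sort_records() above is an accent- and case-insensitive sort key; the sketch below uses a simplified unicodedata-based stand-in for Invenio's own strip_accents(), with invented field values:

    import unicodedata

    def toy_strip_accents(s):
        # simplified stand-in for Invenio's strip_accents():
        nfkd = unicodedata.normalize('NFKD', s.decode('utf-8'))
        return ''.join(c for c in nfkd if not unicodedata.combining(c)).encode('utf-8')

    records = {10: ['Ellis, J.'], 11: ['\xc3\x89cole thesis'], 12: ['apple']}
    keys = sorted((toy_strip_accents(' '.join(vals)).lower(), recid)
                  for recid, vals in records.iteritems())
    print [recid for (val, recid) in keys]   # -> [12, 11, 10]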
A value of 'rg=-9999' means to print all records: to be used with care. Print also list of RELEVANCES for each record (if defined), in between RELEVANCE_PROLOGUE and RELEVANCE_EPILOGUE. Print prologue and/or epilogue specific to 'format' if 'print_records_prologue_p' and/or print_records_epilogue_p' are True. 'sf' is sort field and 'rm' is ranking method that are passed here only for proper linking purposes: e.g. when a certain ranking method or a certain sort field was selected, keep it selected in any dynamic search links that may be printed. """ # load the right message language _ = gettext_set_language(ln) # sanity checking: if req is None: return # get user_info (for formatting based on user) if isinstance(req, cStringIO.OutputType): user_info = {} else: user_info = collect_user_info(req) if len(recIDs): nb_found = len(recIDs) if rg == -9999: # print all records rg = nb_found else: rg = abs(rg) if jrec < 1: # sanity checks jrec = 1 if jrec > nb_found: jrec = max(nb_found-rg+1, 1) # will print records from irec_max to irec_min excluded: irec_max = nb_found - jrec irec_min = nb_found - jrec - rg if irec_min < 0: irec_min = -1 if irec_max >= nb_found: irec_max = nb_found - 1 #req.write("%s:%d-%d" % (recIDs, irec_min, irec_max)) if format.startswith('x'): # print header if needed if print_records_prologue_p: print_records_prologue(req, format) # print records recIDs_to_print = [recIDs[x] for x in range(irec_max, irec_min, -1)] format_records(recIDs_to_print, format, ln=ln, search_pattern=search_pattern, record_separator="\n", user_info=user_info, req=req) # print footer if needed if print_records_epilogue_p: print_records_epilogue(req, format) elif format.startswith('t') or str(format[0:3]).isdigit(): # we are doing plain text output: for irec in range(irec_max, irec_min, -1): x = print_record(recIDs[irec], format, ot, ln, search_pattern=search_pattern, user_info=user_info, verbose=verbose, sf=sf, so=so, sp=sp, rm=rm) req.write(x) if x: req.write('\n') elif format == 'excel': recIDs_to_print = [recIDs[x] for x in range(irec_max, irec_min, -1)] create_excel(recIDs=recIDs_to_print, req=req, ln=ln, ot=ot) else: # we are doing HTML output: if format == 'hp' or format.startswith("hb_") or format.startswith("hd_"): # portfolio and on-the-fly formats: for irec in range(irec_max, irec_min, -1): req.write(print_record(recIDs[irec], format, ot, ln, search_pattern=search_pattern, user_info=user_info, verbose=verbose, sf=sf, so=so, sp=sp, rm=rm)) elif format.startswith("hb"): # HTML brief format: display_add_to_basket = True if user_info: if user_info['email'] == 'guest': if CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS > 4: display_add_to_basket = False else: if not user_info['precached_usebaskets']: display_add_to_basket = False req.write(websearch_templates.tmpl_record_format_htmlbrief_header( ln = ln)) for irec in range(irec_max, irec_min, -1): row_number = jrec+irec_max-irec recid = recIDs[irec] if relevances and relevances[irec]: relevance = relevances[irec] else: relevance = '' record = print_record(recIDs[irec], format, ot, ln, search_pattern=search_pattern, user_info=user_info, verbose=verbose, sf=sf, so=so, sp=sp, rm=rm) req.write(websearch_templates.tmpl_record_format_htmlbrief_body( ln = ln, recid = recid, row_number = row_number, relevance = relevance, record = record, relevances_prologue = relevances_prologue, relevances_epilogue = relevances_epilogue, display_add_to_basket = display_add_to_basket )) req.write(websearch_templates.tmpl_record_format_htmlbrief_footer( ln = ln, display_add_to_basket = 
display_add_to_basket)) elif format.startswith("hd"): # HTML detailed format: for irec in range(irec_max, irec_min, -1): if record_exists(recIDs[irec]) == -1: print_warning(req, _("The record has been deleted.")) continue unordered_tabs = get_detailed_page_tabs(get_colID(guess_primary_collection_of_a_record(recIDs[irec])), recIDs[irec], ln=ln) ordered_tabs_id = [(tab_id, values['order']) for (tab_id, values) in unordered_tabs.iteritems()] ordered_tabs_id.sort(lambda x,y: cmp(x[1],y[1])) link_ln = '' if ln != CFG_SITE_LANG: link_ln = '?ln=%s' % ln recid = recIDs[irec] recid_to_display = recid # Record ID used to build the URL. if CFG_WEBSEARCH_USE_ALEPH_SYSNOS: try: recid_to_display = get_fieldvalues(recid, CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG)[0] except IndexError: # No external sysno is available, keep using # internal recid. pass tabs = [(unordered_tabs[tab_id]['label'], \ '%s/%s/%s/%s%s' % (CFG_SITE_URL, CFG_SITE_RECORD, recid_to_display, tab_id, link_ln), \ tab_id == tab, unordered_tabs[tab_id]['enabled']) \ for (tab_id, order) in ordered_tabs_id if unordered_tabs[tab_id]['visible'] == True] tabs_counts = get_detailed_page_tabs_counts(recid) citedbynum = tabs_counts['Citations'] references = tabs_counts['References'] discussions = tabs_counts['Discussions'] # load content if tab == 'usage': req.write(webstyle_templates.detailed_record_container_top(recIDs[irec], tabs, ln, citationnum=citedbynum, referencenum=references, discussionnum=discussions)) r = calculate_reading_similarity_list(recIDs[irec], "downloads") downloadsimilarity = None downloadhistory = None #if r: # downloadsimilarity = r if CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS: downloadhistory = create_download_history_graph_and_box(recIDs[irec], ln) r = calculate_reading_similarity_list(recIDs[irec], "pageviews") viewsimilarity = None if r: viewsimilarity = r content = websearch_templates.tmpl_detailed_record_statistics(recIDs[irec], ln, downloadsimilarity=downloadsimilarity, downloadhistory=downloadhistory, viewsimilarity=viewsimilarity) req.write(content) req.write(webstyle_templates.detailed_record_container_bottom(recIDs[irec], tabs, ln)) elif tab == 'citations': recid = recIDs[irec] req.write(webstyle_templates.detailed_record_container_top(recid, tabs, ln, citationnum=citedbynum, referencenum=references, discussionnum=discussions)) req.write(websearch_templates.tmpl_detailed_record_citations_prologue(recid, ln)) # Citing citinglist = calculate_cited_by_list(recid) req.write(websearch_templates.tmpl_detailed_record_citations_citing_list(recid, ln, citinglist, sf=sf, so=so, sp=sp, rm=rm)) # Self-cited selfcited = get_self_cited_by(recid) req.write(websearch_templates.tmpl_detailed_record_citations_self_cited(recid, ln, selfcited=selfcited, citinglist=citinglist)) # Co-cited s = calculate_co_cited_with_list(recid) cociting = None if s: cociting = s req.write(websearch_templates.tmpl_detailed_record_citations_co_citing(recid, ln, cociting=cociting)) # Citation history, if needed citationhistory = None if citinglist: citationhistory = create_citation_history_graph_and_box(recid, ln) #debug if verbose > 3: print_warning(req, "Citation graph debug: " + \ str(len(citationhistory))) req.write(websearch_templates.tmpl_detailed_record_citations_citation_history(recid, ln, citationhistory)) req.write(websearch_templates.tmpl_detailed_record_citations_epilogue(recid, ln)) req.write(webstyle_templates.detailed_record_container_bottom(recid, tabs, ln)) elif tab == 'references': 
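## NB: the 'references' tab computes nothing itself: it simply renders the
## record through BibFormat's HDREF output format, wrapped in the same
## detailed-record container used by the other tabs below.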
req.write(webstyle_templates.detailed_record_container_top(recIDs[irec], tabs, ln, citationnum=citedbynum, referencenum=references, discussionnum=discussions)) req.write(format_record(recIDs[irec], 'HDREF', ln=ln, user_info=user_info, verbose=verbose)) req.write(webstyle_templates.detailed_record_container_bottom(recIDs[irec], tabs, ln)) elif tab == 'keywords': import bibclassify_webinterface recid = recIDs[irec] bibclassify_webinterface.main_page(req, recid, tabs, ln, webstyle_templates) elif tab == 'plots': req.write(webstyle_templates.detailed_record_container_top(recIDs[irec], tabs, ln)) content = websearch_templates.tmpl_record_plots(recID=recIDs[irec], ln=ln) req.write(content) req.write(webstyle_templates.detailed_record_container_bottom(recIDs[irec], tabs, ln)) else: # Metadata tab req.write(webstyle_templates.detailed_record_container_top(recIDs[irec], tabs, ln, show_short_rec_p=False, citationnum=citedbynum, referencenum=references, discussionnum=discussions)) creationdate = None modificationdate = None if record_exists(recIDs[irec]) == 1: creationdate = get_creation_date(recIDs[irec]) modificationdate = get_modification_date(recIDs[irec]) content = print_record(recIDs[irec], format, ot, ln, search_pattern=search_pattern, user_info=user_info, verbose=verbose, sf=sf, so=so, sp=sp, rm=rm) content = websearch_templates.tmpl_detailed_record_metadata( recID = recIDs[irec], ln = ln, format = format, creationdate = creationdate, modificationdate = modificationdate, content = content) # display of the next-hit/previous-hit/back-to-search links # on the detailed record pages content += websearch_templates.tmpl_display_back_to_search(req, recIDs[irec], ln) req.write(content) req.write(webstyle_templates.detailed_record_container_bottom(recIDs[irec], tabs, ln, creationdate=creationdate, modificationdate=modificationdate, show_short_rec_p=False)) if len(tabs) > 0: # Add the mini box at bottom of the page if CFG_WEBCOMMENT_ALLOW_REVIEWS: from invenio.webcomment import get_mini_reviews reviews = get_mini_reviews(recid = recIDs[irec], ln=ln) else: reviews = '' actions = format_record(recIDs[irec], 'HDACT', ln=ln, user_info=user_info, verbose=verbose) files = format_record(recIDs[irec], 'HDFILE', ln=ln, user_info=user_info, verbose=verbose) req.write(webstyle_templates.detailed_record_mini_panel(recIDs[irec], ln, format, files=files, reviews=reviews, actions=actions)) else: # Other formats for irec in range(irec_max, irec_min, -1): req.write(print_record(recIDs[irec], format, ot, ln, search_pattern=search_pattern, user_info=user_info, verbose=verbose, sf=sf, so=so, sp=sp, rm=rm)) else: print_warning(req, _("Use different search terms.")) def print_records_prologue(req, format, cc=None): """ Print the appropriate prologue for list of records in the given format. 
""" prologue = "" # no prologue needed for HTML or Text formats if format.startswith('xm'): prologue = websearch_templates.tmpl_xml_marc_prologue() elif format.startswith('xn'): prologue = websearch_templates.tmpl_xml_nlm_prologue() elif format.startswith('xw'): prologue = websearch_templates.tmpl_xml_refworks_prologue() elif format.startswith('xr'): prologue = websearch_templates.tmpl_xml_rss_prologue(cc=cc) elif format.startswith('xe'): prologue = websearch_templates.tmpl_xml_endnote_prologue() elif format.startswith('xo'): prologue = websearch_templates.tmpl_xml_mods_prologue() elif format.startswith('xp'): prologue = websearch_templates.tmpl_xml_podcast_prologue(cc=cc) elif format.startswith('x'): prologue = websearch_templates.tmpl_xml_default_prologue() req.write(prologue) def print_records_epilogue(req, format): """ Print the appropriate epilogue for list of records in the given format. """ epilogue = "" # no epilogue needed for HTML or Text formats if format.startswith('xm'): epilogue = websearch_templates.tmpl_xml_marc_epilogue() elif format.startswith('xn'): epilogue = websearch_templates.tmpl_xml_nlm_epilogue() elif format.startswith('xw'): epilogue = websearch_templates.tmpl_xml_refworks_epilogue() elif format.startswith('xr'): epilogue = websearch_templates.tmpl_xml_rss_epilogue() elif format.startswith('xe'): epilogue = websearch_templates.tmpl_xml_endnote_epilogue() elif format.startswith('xo'): epilogue = websearch_templates.tmpl_xml_mods_epilogue() elif format.startswith('xp'): epilogue = websearch_templates.tmpl_xml_podcast_epilogue() elif format.startswith('x'): epilogue = websearch_templates.tmpl_xml_default_epilogue() req.write(epilogue) def get_record(recid): """Directly the record object corresponding to the recid.""" if CFG_BIBUPLOAD_SERIALIZE_RECORD_STRUCTURE: value = run_sql("SELECT value FROM bibfmt WHERE id_bibrec=%s AND FORMAT='recstruct'", (recid, )) if value: try: return deserialize_via_marshal(value[0][0]) except: ### In case of corruption, let's rebuild it! pass return create_record(print_record(recid, 'xm'))[0] def print_record(recID, format='hb', ot='', ln=CFG_SITE_LANG, decompress=zlib.decompress, search_pattern=None, user_info=None, verbose=0, sf='', so='d', sp='', rm=''): """ Prints record 'recID' formatted according to 'format'. 'sf' is sort field and 'rm' is ranking method that are passed here only for proper linking purposes: e.g. when a certain ranking method or a certain sort field was selected, keep it selected in any dynamic search links that may be printed. """ if format == 'recstruct': return get_record(recID) _ = gettext_set_language(ln) display_claim_this_paper = False try: display_claim_this_paper = user_info["precached_viewclaimlink"] except (KeyError, TypeError): display_claim_this_paper = False #check from user information if the user has the right to see hidden fields/tags in the #records as well can_see_hidden = (acc_authorize_action(user_info, 'runbibedit')[0] == 0) out = "" # sanity check: record_exist_p = record_exists(recID) if record_exist_p == 0: # doesn't exist return out # New Python BibFormat procedure for formatting # Old procedure follows further below # We must still check some special formats, but these # should disappear when BibFormat improves. 
if not (CFG_BIBFORMAT_USE_OLD_BIBFORMAT \ or format.lower().startswith('t') \ or format.lower().startswith('hm') \ or str(format[0:3]).isdigit() \ or ot): # Unspecified format is hd if format == '': format = 'hd' if record_exist_p == -1 and get_output_format_content_type(format) == 'text/html': # HTML output displays a default value for deleted records. # Other format have to deal with it. out += _("The record has been deleted.") else: out += call_bibformat(recID, format, ln, search_pattern=search_pattern, user_info=user_info, verbose=verbose) # at the end of HTML brief mode, print the "Detailed record" functionality: if format.lower().startswith('hb') and \ format.lower() != 'hb_p': out += websearch_templates.tmpl_print_record_brief_links(ln=ln, recID=recID, sf=sf, so=so, sp=sp, rm=rm, display_claim_link=display_claim_this_paper) return out # Old PHP BibFormat procedure for formatting # print record opening tags, if needed: if format == "marcxml" or format == "oai_dc": out += " <record>\n" out += " <header>\n" for oai_id in get_fieldvalues(recID, CFG_OAI_ID_FIELD): out += " <identifier>%s</identifier>\n" % oai_id out += " <datestamp>%s</datestamp>\n" % get_modification_date(recID) out += " </header>\n" out += " <metadata>\n" if format.startswith("xm") or format == "marcxml": # look for detailed format existence: query = "SELECT value FROM bibfmt WHERE id_bibrec=%s AND format=%s" res = run_sql(query, (recID, format), 1) if res and record_exist_p == 1: # record 'recID' is formatted in 'format', so print it out += "%s" % decompress(res[0][0]) else: # record 'recID' is not formatted in 'format' -- they are not in "bibfmt" table; so fetch all the data from "bibXXx" tables: if format == "marcxml": out += """ <record xmlns="http://www.loc.gov/MARC21/slim">\n""" out += " <controlfield tag=\"001\">%d</controlfield>\n" % int(recID) elif format.startswith("xm"): out += """ <record>\n""" out += " <controlfield tag=\"001\">%d</controlfield>\n" % int(recID) if record_exist_p == -1: # deleted record, so display only OAI ID and 980: oai_ids = get_fieldvalues(recID, CFG_OAI_ID_FIELD) if oai_ids: out += "<datafield tag=\"%s\" ind1=\"%s\" ind2=\"%s\"><subfield code=\"%s\">%s</subfield></datafield>\n" % \ (CFG_OAI_ID_FIELD[0:3], CFG_OAI_ID_FIELD[3:4], CFG_OAI_ID_FIELD[4:5], CFG_OAI_ID_FIELD[5:6], oai_ids[0]) out += "<datafield tag=\"980\" ind1=\"\" ind2=\"\"><subfield code=\"c\">DELETED</subfield></datafield>\n" else: # controlfields query = "SELECT b.tag,b.value,bb.field_number FROM bib00x AS b, bibrec_bib00x AS bb "\ "WHERE bb.id_bibrec=%s AND b.id=bb.id_bibxxx AND b.tag LIKE '00%%' "\ "ORDER BY bb.field_number, b.tag ASC" res = run_sql(query, (recID, )) for row in res: field, value = row[0], row[1] value = encode_for_xml(value) out += """ <controlfield tag="%s" >%s</controlfield>\n""" % \ (encode_for_xml(field[0:3]), value) # datafields i = 1 # Do not process bib00x and bibrec_bib00x, as # they are controlfields. 
So start at bib01x and bibrec_bib01x (and set i = 0 at the end of the first loop) for digit1 in range(0, 10): for digit2 in range(i, 10): bx = "bib%d%dx" % (digit1, digit2) bibx = "bibrec_bib%d%dx" % (digit1, digit2) query = "SELECT b.tag,b.value,bb.field_number FROM %s AS b, %s AS bb "\ "WHERE bb.id_bibrec=%%s AND b.id=bb.id_bibxxx AND b.tag LIKE %%s "\ "ORDER BY bb.field_number, b.tag ASC" % (bx, bibx) res = run_sql(query, (recID, str(digit1)+str(digit2)+'%')) field_number_old = -999 field_old = "" for row in res: field, value, field_number = row[0], row[1], row[2] ind1, ind2 = field[3], field[4] if ind1 == "_" or ind1 == "": ind1 = " " if ind2 == "_" or ind2 == "": ind2 = " " # print field tag, unless hidden printme = True if not can_see_hidden: for htag in CFG_BIBFORMAT_HIDDEN_TAGS: ltag = len(htag) samelenfield = field[0:ltag] if samelenfield == htag: printme = False if printme: if field_number != field_number_old or field[:-1] != field_old[:-1]: if field_number_old != -999: out += """ </datafield>\n""" out += """ <datafield tag="%s" ind1="%s" ind2="%s">\n""" % \ (encode_for_xml(field[0:3]), encode_for_xml(ind1), encode_for_xml(ind2)) field_number_old = field_number field_old = field # print subfield value value = encode_for_xml(value) out += """ <subfield code="%s">%s</subfield>\n""" % \ (encode_for_xml(field[-1:]), value) # all fields/subfields printed in this run, so close the tag: if field_number_old != -999: out += """ </datafield>\n""" i = 0 # Next loop should start looking at bib00x and bibrec_bib00x # we are at the end of printing the record: out += " </record>\n" elif format == "xd" or format == "oai_dc": # XML Dublin Core format, possibly OAI -- select only some bibXXx fields: out += """ <dc xmlns="http://purl.org/dc/elements/1.1/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://purl.org/dc/elements/1.1/ http://www.openarchives.org/OAI/1.1/dc.xsd">\n""" if record_exist_p == -1: out += "" else: for f in get_fieldvalues(recID, "041__a"): out += " <language>%s</language>\n" % f for f in get_fieldvalues(recID, "100__a"): out += " <creator>%s</creator>\n" % encode_for_xml(f) for f in get_fieldvalues(recID, "700__a"): out += " <creator>%s</creator>\n" % encode_for_xml(f) for f in get_fieldvalues(recID, "245__a"): out += " <title>%s</title>\n" % encode_for_xml(f) for f in get_fieldvalues(recID, "65017a"): out += " <subject>%s</subject>\n" % encode_for_xml(f) for f in get_fieldvalues(recID, "8564_u"): if f.split('.')[-1] == 'png': continue out += " <identifier>%s</identifier>\n" % encode_for_xml(f) for f in get_fieldvalues(recID, "520__a"): out += " <description>%s</description>\n" % encode_for_xml(f) out += " <date>%s</date>\n" % get_creation_date(recID) out += " </dc>\n" elif len(format) == 6 and str(format[0:3]).isdigit(): # user has asked to print some fields only if format == "001": out += "<!--%s-begin-->%s<!--%s-end-->\n" % (format, recID, format) else: vals = get_fieldvalues(recID, format) for val in vals: out += "<!--%s-begin-->%s<!--%s-end-->\n" % (format, val, format) elif format.startswith('t'): ## user directly asked for some tags to be displayed only if record_exist_p == -1: out += get_fieldvalues_alephseq_like(recID, ["001", CFG_OAI_ID_FIELD, "980"], can_see_hidden) else: out += get_fieldvalues_alephseq_like(recID, ot, can_see_hidden) elif format == "hm": if record_exist_p == -1: out += "\n<pre>" + cgi.escape(get_fieldvalues_alephseq_like(recID, ["001", CFG_OAI_ID_FIELD, "980"], can_see_hidden)) + "</pre>" else: out += "\n<pre>" + cgi.escape(get_fieldvalues_alephseq_like(recID, ot, can_see_hidden)) + "</pre>" elif format.startswith("h") and ot: ## user directly asked for some tags to be displayed only if record_exist_p == -1: out += "\n<pre>" + get_fieldvalues_alephseq_like(recID, ["001", CFG_OAI_ID_FIELD, "980"], can_see_hidden) + "</pre>" else: out += "\n<pre>" + get_fieldvalues_alephseq_like(recID, ot, can_see_hidden) + "</pre>" elif format == "hd": # HTML detailed format if record_exist_p == -1: out += _("The record has been deleted.") else: # look for detailed format existence: query = "SELECT value FROM bibfmt WHERE id_bibrec=%s AND format=%s" res = run_sql(query, (recID, format), 1) if res: # record 'recID' is formatted in 'format', so print it out += "%s" % decompress(res[0][0]) else: # record 'recID' is not formatted in 'format', so try to call BibFormat on the fly or use default format: out_record_in_format = call_bibformat(recID, format, ln, search_pattern=search_pattern, user_info=user_info, verbose=verbose) if out_record_in_format: out += out_record_in_format else: out += websearch_templates.tmpl_print_record_detailed( ln = ln, recID = recID, ) elif format.startswith("hb_") or format.startswith("hd_"): # underscore means that HTML brief/detailed formats should be called on-the-fly; suitable for testing formats if record_exist_p == -1: out += _("The record has been deleted.") else: out += call_bibformat(recID, format, ln, search_pattern=search_pattern, user_info=user_info, verbose=verbose) elif format.startswith("hx"): # BibTeX format, called on the fly: if record_exist_p == -1: out += _("The record has been deleted.") else: out += call_bibformat(recID, format, ln, search_pattern=search_pattern, user_info=user_info, verbose=verbose) elif format.startswith("hs"): # for citation/download similarity navigation links: if record_exist_p == -1: out += _("The record has been deleted.") else: out += '<a href="%s">' % websearch_templates.build_search_url(recid=recID, ln=ln) # firstly, title: titles = get_fieldvalues(recID, "245__a") if titles: for title in titles: out += "<strong>%s</strong>" % title else: # usual title not found, try conference title: titles = get_fieldvalues(recID, "111__a") if titles: for title in titles: out += "<strong>%s</strong>" % title else: # just print record ID: out += "<strong>%s %d</strong>" % (get_field_i18nname("record ID", ln, False), recID) out += "</a>" # secondly, authors: authors = get_fieldvalues(recID, "100__a") + get_fieldvalues(recID, "700__a") if authors: out += " - %s" % authors[0] if len(authors) > 1: out += " <em>et al</em>" # thirdly publication info: publinfos = get_fieldvalues(recID, "773__s") if not publinfos: publinfos = get_fieldvalues(recID, "909C4s") if not publinfos: publinfos = get_fieldvalues(recID, "037__a") if not publinfos: publinfos = get_fieldvalues(recID, "088__a") if publinfos: out += " - %s" % publinfos[0] else: # fourthly publication year (if not publication info): years = get_fieldvalues(recID, "773__y") if not years: years = get_fieldvalues(recID, "909C4y") if not years: years = get_fieldvalues(recID, "260__c") if years: out += " (%s)" % years[0] else: # HTML brief format by default if record_exist_p == -1: out += _("The record has been deleted.") else: query = "SELECT value FROM bibfmt WHERE id_bibrec=%s AND format=%s" res = run_sql(query, (recID, format)) if res: # record 'recID' is formatted in 'format', so print it out += "%s" % decompress(res[0][0]) else: # record 'recID' is not formatted in 'format', so try to call BibFormat on the fly or use default format: if CFG_WEBSEARCH_CALL_BIBFORMAT: out_record_in_format = call_bibformat(recID, format, ln, search_pattern=search_pattern, user_info=user_info, verbose=verbose) if out_record_in_format: out += out_record_in_format else: out += websearch_templates.tmpl_print_record_brief( ln = ln, recID = recID, ) else: out += websearch_templates.tmpl_print_record_brief( ln = ln, recID = recID, ) # at the end of HTML brief mode, print the "Detailed record" functionality: if
format == 'hp' or format.startswith("hb_") or format.startswith("hd_"): pass # do nothing for portfolio and on-the-fly formats else: out += websearch_templates.tmpl_print_record_brief_links(ln=ln, recID=recID, sf=sf, so=so, sp=sp, rm=rm, display_claim_link=display_claim_this_paper) # print record closing tags, if needed: if format == "marcxml" or format == "oai_dc": out += " </metadata>\n" out += " </record>\n" return out def call_bibformat(recID, format="HD", ln=CFG_SITE_LANG, search_pattern=None, user_info=None, verbose=0): """ Calls BibFormat and returns formatted record. BibFormat will decide by itself if old or new BibFormat must be used. """ from invenio.bibformat_utils import get_pdf_snippets keywords = [] if search_pattern is not None: units = create_basic_search_units(None, str(search_pattern), None) keywords = [unit[1] for unit in units if (unit[0] != '-' and unit[2] in [None, 'fulltext'])] out = format_record(recID, of=format, ln=ln, search_pattern=keywords, user_info=user_info, verbose=verbose) if CFG_WEBSEARCH_FULLTEXT_SNIPPETS and user_info and \ 'fulltext' in user_info['uri']: # check snippets only if URL contains fulltext # FIXME: make it work for CLI too, via new function arg if keywords: snippets = get_pdf_snippets(recID, keywords) if snippets: out += snippets return out def log_query(hostname, query_args, uid=-1): """ Log query into the query and user_query tables. Return id_query or None in case of problems. """ id_query = None if uid >= 0: # log the query only if uid is reasonable res = run_sql("SELECT id FROM query WHERE urlargs=%s", (query_args,), 1) try: id_query = res[0][0] except: id_query = run_sql("INSERT INTO query (type, urlargs) VALUES ('r', %s)", (query_args,)) if id_query: run_sql("INSERT INTO user_query (id_user, id_query, hostname, date) VALUES (%s, %s, %s, %s)", (uid, id_query, hostname, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))) return id_query def log_query_info(action, p, f, colls, nb_records_found_total=-1): """Write some info to the log file for later analysis.""" try: log = open(CFG_LOGDIR + "/search.log", "a") log.write(time.strftime("%Y%m%d%H%M%S#", time.localtime())) log.write(action+"#") log.write(p+"#") log.write(f+"#") for coll in colls[:-1]: log.write("%s," % coll) log.write("%s#" % colls[-1]) log.write("%d" % nb_records_found_total) log.write("\n") log.close() except: pass return ### CALLABLES def perform_request_search(req=None, cc=CFG_SITE_NAME, c=None, p="", f="", rg=CFG_WEBSEARCH_DEF_RECORDS_IN_GROUPS, sf="", so="d", sp="", rm="", of="id", ot="", aas=0, p1="", f1="", m1="", op1="", p2="", f2="", m2="", op2="", p3="", f3="", m3="", sc=0, jrec=0, recid=-1, recidb=-1, sysno="", id=-1, idb=-1, sysnb="", action="", d1="", d1y=0, d1m=0, d1d=0, d2="", d2y=0, d2m=0, d2d=0, dt="", verbose=0, ap=0, ln=CFG_SITE_LANG, ec=None, tab="", wl=CFG_WEBSEARCH_WILDCARD_LIMIT): """Perform search or browse request, without checking for authentication. Return list of recIDs found, if of=id. Otherwise create web page. The arguments are as follows: req - mod_python Request class instance. cc - current collection (e.g. "ATLAS"). The collection the user started to search/browse from. c - collection list (e.g. ["Theses", "Books"]). The collections the user may have selected/deselected when starting to search from 'cc'. p - pattern to search for (e.g. "ellis and muon or kaon"). f - field to search within (e.g. "author"). rg - records in groups of (e.g. "10"). Defines how many hits per collection in the search results page are displayed. sf - sort field (e.g. "title").
so - sort order ("a"=ascending, "d"=descending). sp - sort pattern (e.g. "CERN-") -- in case there are more values in a sort field, this argument tells which one to prefer. rm - ranking method (e.g. "jif"). Defines whether results should be ranked by some known ranking method. of - output format (e.g. "hb"). Usually starting "h" means HTML output (and "hb" for HTML brief, "hd" for HTML detailed), "x" means XML output, "t" means plain text output, "id" means no output at all but to return list of recIDs found. (Suitable for high-level API.) ot - output only these MARC tags (e.g. "100,700,909C0b"). Useful if only some fields are to be shown in the output, e.g. for library to control some fields. aas - advanced search ("0" means no, "1" means yes). Whether search was called from within the advanced search interface. p1 - first pattern to search for in the advanced search interface. Much like 'p'. f1 - first field to search within in the advanced search interface. Much like 'f'. m1 - first matching type in the advanced search interface. ("a" all of the words, "o" any of the words, "e" exact phrase, "p" partial phrase, "r" regular expression). op1 - first operator, to join the first and the second unit in the advanced search interface. ("a" add, "o" or, "n" not). p2 - second pattern to search for in the advanced search interface. Much like 'p'. f2 - second field to search within in the advanced search interface. Much like 'f'. m2 - second matching type in the advanced search interface. ("a" all of the words, "o" any of the words, "e" exact phrase, "p" partial phrase, "r" regular expression). op2 - second operator, to join the second and the third unit in the advanced search interface. ("a" add, "o" or, "n" not). p3 - third pattern to search for in the advanced search interface. Much like 'p'. f3 - third field to search within in the advanced search interface. Much like 'f'. m3 - third matching type in the advanced search interface. ("a" all of the words, "o" any of the words, "e" exact phrase, "p" partial phrase, "r" regular expression). sc - split by collection ("0" no, "1" yes). Governs whether we want to present the results in a single huge list, or split by collection. jrec - jump to record (e.g. "234"). Used for navigation inside the search results. recid - display record ID (e.g. "20000"). Do not search/browse but go straight away to the Detailed record page for the given recID. recidb - display record ID bis (e.g. "20010"). If greater than 'recid', then display records from recid to recidb. Useful for example for dumping records from the database for reformatting. sysno - display old system SYS number (e.g. ""). If you migrate to Invenio from another system, and store your old SYS call numbers, you can use them instead of recid if you wish so. id - the same as recid, in case recid is not set. For backwards compatibility. idb - the same as recidb, in case recidb is not set. For backwards compatibility. sysnb - the same as sysno, in case sysno is not set. For backwards compatibility. action - action to do. "SEARCH" for searching, "Browse" for browsing. Default is to search. d1 - first datetime in full YYYY-mm-dd HH:MM:SS format (e.g. "1998-08-23 12:34:56"). Useful for search limits on creation/modification date (see 'dt' argument below). Note that 'd1' takes precedence over d1y, d1m, d1d if these are defined. d1y - first date's year (e.g. "1998"). Useful for search limits on creation/modification date. d1m - first date's month (e.g. "08"). Useful for search limits on creation/modification date. d1d - first date's day (e.g. "23"). Useful for search limits on creation/modification date. d2 - second datetime in full YYYY-mm-dd HH:MM:SS format (e.g. "1998-09-02 12:34:56"). Useful for search limits on creation/modification date (see 'dt' argument below). Note that 'd2' takes precedence over d2y, d2m, d2d if these are defined. d2y - second date's year (e.g. "1998"). Useful for search limits on creation/modification date. d2m - second date's month (e.g. "09"). Useful for search limits on creation/modification date. d2d - second date's day (e.g. "02"). Useful for search limits on creation/modification date. dt - first and second date's type (e.g. "c"). Specifies whether to search in creation dates ("c") or in modification dates ("m"). When dt is not set and d1* and d2* are set, the default is "c". verbose - verbose level (0=min, 9=max). Useful to print some internal information on the searching process in case something goes wrong. ap - alternative patterns (0=no, 1=yes). In case no exact match is found, the search engine can try alternative patterns, e.g. to replace non-alphanumeric characters by a boolean query. ap defines if this is wanted. ln - language of the search interface (e.g. "en"). Useful for internationalization. ec - list of external search engines to search as well (e.g. "SPIRES HEP"). wl - wildcard limit (e.g. "100"). Wildcard queries will be limited to this number of results. """ selected_external_collections_infos = None # wash output format: of = wash_output_format(of) # raise an exception when trying to print out HTML from the CLI: if of.startswith("h"): assert req # for every search engine request asking for an HTML output, we # first regenerate cache of collection and field I18N names if # needed; so that later we won't bother checking timestamps for # I18N names at all: if of.startswith("h"): collection_i18nname_cache.recreate_cache_if_needed() field_i18nname_cache.recreate_cache_if_needed() # wash all arguments requiring special care: try: (cc, colls_to_display, colls_to_search, hosted_colls, wash_colls_debug) = wash_colls(cc, c, sc, verbose) # which colls to search and to display?
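## wash_colls() raises InvenioWebSearchUnknownCollectionError whenever 'cc'
## does not name a known collection; each output format gets a graceful
## fallback below (error page for HTML, empty list for 'id', empty but
## valid XML for 'x...').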
except InvenioWebSearchUnknownCollectionError, exc: colname = exc.colname if of.startswith("h"): page_start(req, of, cc, aas, ln, getUid(req), websearch_templates.tmpl_collection_not_found_page_title(colname, ln)) req.write(websearch_templates.tmpl_collection_not_found_page_body(colname, ln)) return page_end(req, of, ln) elif of == "id": return [] elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) return page_end(req, of, ln) else: return page_end(req, of, ln) p = wash_pattern(p) f = wash_field(f) p1 = wash_pattern(p1) f1 = wash_field(f1) p2 = wash_pattern(p2) f2 = wash_field(f2) p3 = wash_pattern(p3) f3 = wash_field(f3) datetext1, datetext2 = wash_dates(d1, d1y, d1m, d1d, d2, d2y, d2m, d2d) # wash ranking method: if not is_method_valid(None, rm): rm = "" _ = gettext_set_language(ln) # backwards compatibility: id, idb, sysnb -> recid, recidb, sysno (if applicable) if sysnb != "" and sysno == "": sysno = sysnb if id > 0 and recid == -1: recid = id if idb > 0 and recidb == -1: recidb = idb # TODO deduce passed search limiting criterias (if applicable) pl, pl_in_url = "", "" # no limits by default if action != "browse" and req and not isinstance(req, cStringIO.OutputType) \ and req.args: # we do not want to add options while browsing or while calling via command-line fieldargs = cgi.parse_qs(req.args) for fieldcode in get_fieldcodes(): if fieldargs.has_key(fieldcode): for val in fieldargs[fieldcode]: pl += "+%s:\"%s\" " % (fieldcode, val) pl_in_url += "&%s=%s" % (urllib.quote(fieldcode), urllib.quote(val)) # deduce recid from sysno argument (if applicable): if sysno: # ALEPH SYS number was passed, so deduce DB recID for the record: recid = get_mysql_recid_from_aleph_sysno(sysno) if recid is None: recid = 0 # use recid 0 to indicate that this sysno does not exist # deduce collection we are in (if applicable): if recid > 0: referer = None if req: referer = req.headers_in.get('Referer') cc = guess_collection_of_a_record(recid, referer) # deduce user id (if applicable): try: uid = getUid(req) except: uid = 0 ## 0 - start output if recid >= 0: # recid can be 0 if deduced from sysno and if such sysno does not exist ## 1 - detailed record display title, description, keywords = \ websearch_templates.tmpl_record_page_header_content(req, recid, ln) if req is not None and not req.header_only: page_start(req, of, cc, aas, ln, uid, title, description, keywords, recid, tab) # Default format is hb but we are in detailed -> change 'of' if of == "hb": of = "hd" if record_exists(recid): if recidb <= recid: # sanity check recidb = recid + 1 if of == "id": return [recidx for recidx in range(recid, recidb) if record_exists(recidx)] else: print_records(req, range(recid, recidb), -1, -9999, of, ot, ln, search_pattern=p, verbose=verbose, tab=tab, sf=sf, so=so, sp=sp, rm=rm) if req and of.startswith("h"): # register detailed record page view event client_ip_address = str(req.remote_ip) register_page_view_event(recid, uid, client_ip_address) else: # record does not exist if of == "id": return [] elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) elif of.startswith("h"): if req.header_only: raise apache.SERVER_RETURN, apache.HTTP_NOT_FOUND else: print_warning(req, _("Requested record does not seem to exist.")) elif action == "browse": ## 2 - browse needed of = 'hb' page_start(req, of, cc, aas, ln, uid, _("Browse"), p=create_page_title_search_pattern_info(p, p1, p2, p3)) 
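## browsing always happens in the HTML brief context ('of' was forced to
## 'hb' above); each non-empty pattern/field pair coming from the simple or
## advanced interface is browsed separately below.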
req.write(create_search_box(cc, colls_to_display, p, f, rg, sf, so, sp, rm, of, ot, aas, ln, p1, f1, m1, op1, p2, f2, m2, op2, p3, f3, m3, sc, pl, d1y, d1m, d1d, d2y, d2m, d2d, dt, jrec, ec, action)) try: if aas == 1 or (p1 or p2 or p3): browse_pattern(req, colls_to_search, p1, f1, rg, ln) browse_pattern(req, colls_to_search, p2, f2, rg, ln) browse_pattern(req, colls_to_search, p3, f3, rg, ln) else: browse_pattern(req, colls_to_search, p, f, rg, ln) except: register_exception(req=req, alert_admin=True) if of.startswith("h"): req.write(create_error_box(req, verbose=verbose, ln=ln)) elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) return page_end(req, of, ln) elif rm and p.startswith("recid:"): ## 3-ter - similarity search (or old-style citation search) needed if req and not req.header_only: page_start(req, of, cc, aas, ln, uid, _("Search Results"), p=create_page_title_search_pattern_info(p, p1, p2, p3)) if of.startswith("h"): req.write(create_search_box(cc, colls_to_display, p, f, rg, sf, so, sp, rm, of, ot, aas, ln, p1, f1, m1, op1, p2, f2, m2, op2, p3, f3, m3, sc, pl, d1y, d1m, d1d, d2y, d2m, d2d, dt, jrec, ec, action)) if record_exists(p[6:]) != 1: # record does not exist if of.startswith("h"): if req.header_only: raise apache.SERVER_RETURN, apache.HTTP_NOT_FOUND else: print_warning(req, _("Requested record does not seem to exist.")) if of == "id": return [] elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) else: # record well exists, so find similar ones to it t1 = os.times()[4] results_similar_recIDs, results_similar_relevances, results_similar_relevances_prologue, results_similar_relevances_epilogue, results_similar_comments = \ rank_records(rm, 0, get_collection_reclist(cc), string.split(p), verbose) if results_similar_recIDs: t2 = os.times()[4] cpu_time = t2 - t1 if of.startswith("h"): req.write(print_search_info(p, f, sf, so, sp, rm, of, ot, cc, len(results_similar_recIDs), jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, sc, pl_in_url, d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time)) print_warning(req, results_similar_comments) print_records(req, results_similar_recIDs, jrec, rg, of, ot, ln, results_similar_relevances, results_similar_relevances_prologue, results_similar_relevances_epilogue, search_pattern=p, verbose=verbose, sf=sf, so=so, sp=sp, rm=rm) elif of=="id": return results_similar_recIDs elif of.startswith("x"): print_records(req, results_similar_recIDs, jrec, rg, of, ot, ln, results_similar_relevances, results_similar_relevances_prologue, results_similar_relevances_epilogue, search_pattern=p, verbose=verbose, sf=sf, so=so, sp=sp, rm=rm) else: # rank_records failed and returned some error message to display: if of.startswith("h"): print_warning(req, results_similar_relevances_prologue) print_warning(req, results_similar_relevances_epilogue) print_warning(req, results_similar_comments) if of == "id": return [] elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) elif p.startswith("cocitedwith:"): #WAS EXPERIMENTAL ## 3-terter - cited by search needed page_start(req, of, cc, aas, ln, uid, _("Search Results"), p=create_page_title_search_pattern_info(p, p1, p2, p3)) if of.startswith("h"): req.write(create_search_box(cc, colls_to_display, p, f, rg, sf, so, sp, rm, of, ot, aas, ln, p1, f1, m1, op1, p2, f2, m2, op2, p3, f3, m3, sc, pl, d1y, d1m, d1d, d2y, d2m, d2d, 
dt, jrec, ec, action)) recID = p[12:] if record_exists(recID) != 1: # record does not exist if of.startswith("h"): print_warning(req, _("Requested record does not seem to exist.")) if of == "id": return [] elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) else: # record well exists, so find co-cited ones: t1 = os.times()[4] results_cocited_recIDs = map(lambda x: x[0], calculate_co_cited_with_list(int(recID))) if results_cocited_recIDs: t2 = os.times()[4] cpu_time = t2 - t1 if of.startswith("h"): req.write(print_search_info(p, f, sf, so, sp, rm, of, ot, CFG_SITE_NAME, len(results_cocited_recIDs), jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, sc, pl_in_url, d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time)) print_records(req, results_cocited_recIDs, jrec, rg, of, ot, ln, search_pattern=p, verbose=verbose, sf=sf, so=so, sp=sp, rm=rm) elif of=="id": return results_cocited_recIDs elif of.startswith("x"): print_records(req, results_cocited_recIDs, jrec, rg, of, ot, ln, search_pattern=p, verbose=verbose, sf=sf, so=so, sp=sp, rm=rm) else: # cited rank_records failed and returned some error message to display: if of.startswith("h"): print_warning(req, "nothing found") if of == "id": return [] elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) else: ## 3 - common search needed query_in_cache = False query_representation_in_cache = repr((p,f,colls_to_search, wl)) page_start(req, of, cc, aas, ln, uid, p=create_page_title_search_pattern_info(p, p1, p2, p3)) if of.startswith("h") and verbose and wash_colls_debug: print_warning(req, "wash_colls debugging info : %s" % wash_colls_debug) # search into the hosted collections only if the output format is html or xml if hosted_colls and (of.startswith("h") or of.startswith("x")) and not p.startswith("recid:"): # hosted_colls_results : the hosted collections' searches that did not timeout # hosted_colls_timeouts : the hosted collections' searches that timed out and will be searched later on again (hosted_colls_results, hosted_colls_timeouts) = calculate_hosted_collections_results(req, [p, p1, p2, p3], f, hosted_colls, verbose, ln, CFG_HOSTED_COLLECTION_TIMEOUT_ANTE_SEARCH) # successful searches if hosted_colls_results: hosted_colls_true_results = [] for result in hosted_colls_results: # if the number of results is None or 0 (or False) then just do nothing if result[1] == None or result[1] == False: # these are the searches the returned no or zero results if verbose: print_warning(req, "Hosted collections (perform_search_request): %s returned no results" % result[0][1].name) else: # these are the searches that actually returned results on time hosted_colls_true_results.append(result) if verbose: print_warning(req, "Hosted collections (perform_search_request): %s returned %s results in %s seconds" % (result[0][1].name, result[1], result[2])) else: if verbose: print_warning(req, "Hosted collections (perform_search_request): there were no hosted collections results to be printed at this time") if hosted_colls_timeouts: if verbose: for timeout in hosted_colls_timeouts: print_warning(req, "Hosted collections (perform_search_request): %s timed out and will be searched again later" % timeout[0][1].name) # we need to know for later use if there were any hosted collections to be searched even if they weren't in the end elif hosted_colls and ((not (of.startswith("h") or of.startswith("x"))) or p.startswith("recid:")): 
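## hosted collections exist, but either the output format cannot carry
## their results or this is a direct recid: query, so both result sets are
## explicitly emptied below: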
(hosted_colls_results, hosted_colls_timeouts) = (None, None) else: if verbose: print_warning(req, "Hosted collections (perform_search_request): there were no hosted collections to be searched") ## let's define some useful boolean variables: # True means there are actual or potential hosted collections results to be printed hosted_colls_actual_or_potential_results_p = not (not hosted_colls or not ((hosted_colls_results and hosted_colls_true_results) or hosted_colls_timeouts)) # True means there are hosted collections timeouts to take care of later # (useful for more accurate printing of results later) hosted_colls_potential_results_p = not (not hosted_colls or not hosted_colls_timeouts) # True means we only have hosted collections to deal with only_hosted_colls_actual_or_potential_results_p = not colls_to_search and hosted_colls_actual_or_potential_results_p if of.startswith("h"): req.write(create_search_box(cc, colls_to_display, p, f, rg, sf, so, sp, rm, of, ot, aas, ln, p1, f1, m1, op1, p2, f2, m2, op2, p3, f3, m3, sc, pl, d1y, d1m, d1d, d2y, d2m, d2d, dt, jrec, ec, action)) t1 = os.times()[4] - results_in_any_collection = HitSet() + results_in_any_collection = intbitset() if aas == 1 or (p1 or p2 or p3): ## 3A - advanced search try: results_in_any_collection = search_pattern_parenthesised(req, p1, f1, m1, ap=ap, of=of, verbose=verbose, ln=ln, wl=wl) if len(results_in_any_collection) == 0: if of.startswith("h"): perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) return page_end(req, of, ln) if p2: results_tmp = search_pattern_parenthesised(req, p2, f2, m2, ap=ap, of=of, verbose=verbose, ln=ln, wl=wl) if op1 == "a": # add results_in_any_collection.intersection_update(results_tmp) elif op1 == "o": # or results_in_any_collection.union_update(results_tmp) elif op1 == "n": # not results_in_any_collection.difference_update(results_tmp) else: if of.startswith("h"): print_warning(req, "Invalid set operation %s." % cgi.escape(op1), "Error") if len(results_in_any_collection) == 0: if of.startswith("h"): perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) return page_end(req, of, ln) if p3: results_tmp = search_pattern_parenthesised(req, p3, f3, m3, ap=ap, of=of, verbose=verbose, ln=ln, wl=wl) if op2 == "a": # add results_in_any_collection.intersection_update(results_tmp) elif op2 == "o": # or results_in_any_collection.union_update(results_tmp) elif op2 == "n": # not results_in_any_collection.difference_update(results_tmp) else: if of.startswith("h"): print_warning(req, "Invalid set operation %s." 
% cgi.escape(op2), "Error") except: register_exception(req=req, alert_admin=True) if of.startswith("h"): req.write(create_error_box(req, verbose=verbose, ln=ln)) perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) return page_end(req, of, ln) else: ## 3B - simple search if search_results_cache.cache.has_key(query_representation_in_cache): # query is not in the cache already, so reuse it: query_in_cache = True results_in_any_collection = search_results_cache.cache[query_representation_in_cache] if verbose and of.startswith("h"): print_warning(req, "Search stage 0: query found in cache, reusing cached results.") else: try: # added the display_nearest_terms_box parameter to avoid printing out the "Nearest terms in any collection" # recommendations when there are results only in the hosted collections. Also added the if clause to avoid # searching in case we know we only have actual or potential hosted collections results if not only_hosted_colls_actual_or_potential_results_p: results_in_any_collection = search_pattern_parenthesised(req, p, f, ap=ap, of=of, verbose=verbose, ln=ln, display_nearest_terms_box=not hosted_colls_actual_or_potential_results_p, wl=wl) except: register_exception(req=req, alert_admin=True) if of.startswith("h"): req.write(create_error_box(req, verbose=verbose, ln=ln)) perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) return page_end(req, of, ln) if len(results_in_any_collection) == 0 and not hosted_colls_actual_or_potential_results_p: if of.startswith("h"): perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) return page_end(req, of, ln) # store this search query results into search results cache if needed: if CFG_WEBSEARCH_SEARCH_CACHE_SIZE and not query_in_cache: if len(search_results_cache.cache) > CFG_WEBSEARCH_SEARCH_CACHE_SIZE: search_results_cache.clear() search_results_cache.cache[query_representation_in_cache] = results_in_any_collection if verbose and of.startswith("h"): print_warning(req, "Search stage 3: storing query results in cache.") # search stage 4: intersection with collection universe: try: # added the display_nearest_terms_box parameter to avoid printing out the "Nearest terms in any collection" # recommendations when there results only in the hosted collections. 
Also added the if clause to avoid # searching in case we know since the last stage that we have no results in any collection if len(results_in_any_collection) != 0: results_final = intersect_results_with_collrecs(req, results_in_any_collection, colls_to_search, ap, of, verbose, ln, display_nearest_terms_box=not hosted_colls_actual_or_potential_results_p) else: results_final = {} except: register_exception(req=req, alert_admin=True) if of.startswith("h"): req.write(create_error_box(req, verbose=verbose, ln=ln)) perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) return page_end(req, of, ln) if results_final == {} and not hosted_colls_actual_or_potential_results_p: if of.startswith("h"): perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) if of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) return page_end(req, of, ln) # search stage 5: apply search option limits and restrictions: if datetext1 != "" and results_final != {}: if verbose and of.startswith("h"): print_warning(req, "Search stage 5: applying time etc limits, from %s until %s..." % (datetext1, datetext2)) try: results_final = intersect_results_with_hitset(req, results_final, search_unit_in_bibrec(datetext1, datetext2, dt), ap, aptext= _("No match within your time limits, " "discarding this condition..."), of=of) except: register_exception(req=req, alert_admin=True) if of.startswith("h"): req.write(create_error_box(req, verbose=verbose, ln=ln)) perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) return page_end(req, of, ln) if results_final == {} and not hosted_colls_actual_or_potential_results_p: if of.startswith("h"): perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) #if of.startswith("x"): # # Print empty, but valid XML # print_records_prologue(req, of) # print_records_epilogue(req, of) return page_end(req, of, ln) if pl and results_final != {}: pl = wash_pattern(pl) if verbose and of.startswith("h"): print_warning(req, "Search stage 5: applying search pattern limit %s..." 
% cgi.escape(pl)) try: results_final = intersect_results_with_hitset(req, results_final, search_pattern_parenthesised(req, pl, ap=0, ln=ln, wl=wl), ap, aptext=_("No match within your search limits, " "discarding this condition..."), of=of) except: register_exception(req=req, alert_admin=True) if of.startswith("h"): req.write(create_error_box(req, verbose=verbose, ln=ln)) perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) return page_end(req, of, ln) if results_final == {} and not hosted_colls_actual_or_potential_results_p: if of.startswith("h"): perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) if of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) return page_end(req, of, ln) t2 = os.times()[4] cpu_time = t2 - t1 ## search stage 6: display results: results_final_nb_total = 0 results_final_nb = {} # will hold number of records found in each collection # (in simple dict to display overview more easily) for coll in results_final.keys(): results_final_nb[coll] = len(results_final[coll]) #results_final_nb_total += results_final_nb[coll] # Now let us calculate results_final_nb_total more precisely, # in order to get the total number of "distinct" hits across # searched collections; this is useful because a record might # have been attributed to more than one primary collection; so # we have to avoid counting it multiple times. The price to # pay for this accuracy of results_final_nb_total is somewhat # increased CPU time. if results_final.keys() == 1: # only one collection; no need to union them results_final_for_all_selected_colls = results_final.values()[0] results_final_nb_total = results_final_nb.values()[0] else: # okay, some work ahead to union hits across collections: - results_final_for_all_selected_colls = HitSet() + results_final_for_all_selected_colls = intbitset() for coll in results_final.keys(): results_final_for_all_selected_colls.union_update(results_final[coll]) results_final_nb_total = len(results_final_for_all_selected_colls) #if hosted_colls and (of.startswith("h") or of.startswith("x")): if hosted_colls_actual_or_potential_results_p: if hosted_colls_results: for result in hosted_colls_true_results: colls_to_search.append(result[0][1].name) results_final_nb[result[0][1].name] = result[1] results_final_nb_total += result[1] cpu_time += result[2] if hosted_colls_timeouts: for timeout in hosted_colls_timeouts: colls_to_search.append(timeout[1].name) # use -963 as a special number to identify the collections that timed out results_final_nb[timeout[1].name] = -963 # we continue past this point only if there is a hosted collection that has timed out and might offer potential results if results_final_nb_total ==0 and not hosted_colls_potential_results_p: if of.startswith("h"): print_warning(req, "No match found, please enter different search terms.") elif of.startswith("x"): # Print empty, but valid XML print_records_prologue(req, of) print_records_epilogue(req, of) else: # yes, some hits found: good! # collection list may have changed due to not-exact-match-found policy so check it out: for coll in results_final.keys(): if coll not in colls_to_search: colls_to_search.append(coll) # print results overview: if of == "id": # we have been asked to return list of recIDs recIDs = list(results_final_for_all_selected_colls) if sf: # do we have to sort? 
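## sorting and ranking are mutually exclusive here: an explicit sort field
## 'sf' wins over the ranking method 'rm' when building the of=id list.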
recIDs = sort_records(req, recIDs, sf, so, sp, verbose, of) elif rm: # do we have to rank? results_final_for_all_colls_rank_records_output = rank_records(rm, 0, results_final_for_all_selected_colls, string.split(p) + string.split(p1) + string.split(p2) + string.split(p3), verbose) if results_final_for_all_colls_rank_records_output[0]: recIDs = results_final_for_all_colls_rank_records_output[0] return recIDs elif of.startswith("h"): if of not in ['hcs']: # added the hosted_colls_potential_results_p parameter to help print out the overview more accurately req.write(print_results_overview(colls_to_search, results_final_nb_total, results_final_nb, cpu_time, ln, ec, hosted_colls_potential_results_p=hosted_colls_potential_results_p)) selected_external_collections_infos = print_external_results_overview(req, cc, [p, p1, p2, p3], f, ec, verbose, ln) # print number of hits found for XML outputs: if of.startswith("x"): req.write("<!-- Search-Engine-Total-Number-Of-Results: %s -->\n" % results_final_nb_total) # print records: if of in ['hcs']: # feed the current search to be summarized: from invenio.search_engine_summarizer import summarize_records search_p = p search_f = f if not p and (aas == 1 or p1 or p2 or p3): op_d = {'n': ' and not ', 'a': ' and ', 'o': ' or ', '': ''} triples = ziplist([f1, f2, f3], [p1, p2, p3], [op1, op2, '']) triples_len = len(triples) for i in range(triples_len): fi, pi, oi = triples[i] # e.g.: if i < triples_len-1 and not triples[i+1][1]: # if p2 empty triples[i+1][0] = '' # f2 must be too oi = '' # and o1 if ' ' in pi: pi = '"'+pi+'"' if fi: fi = fi + ':' search_p += fi + pi + op_d[oi] search_f = '' summarize_records(results_final_for_all_selected_colls, 'hcs', ln, search_p, search_f, req) else: if len(colls_to_search) > 1: cpu_time = -1 # we do not want to have search time printed on each collection print_records_prologue(req, of, cc=cc) results_final_colls = [] wlqh_results_overlimit = 0 for coll in colls_to_search: if results_final.has_key(coll) and len(results_final[coll]): if of.startswith("h"): req.write(print_search_info(p, f, sf, so, sp, rm, of, ot, coll, results_final_nb[coll], jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, sc, pl_in_url, d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time)) results_final_recIDs = list(results_final[coll]) results_final_relevances = [] results_final_relevances_prologue = "" results_final_relevances_epilogue = "" if sf: # do we have to sort? results_final_recIDs = sort_records(req, results_final_recIDs, sf, so, sp, verbose, of) elif rm: # do we have to rank?
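## rank_records() returns a 5-tuple; a minimal sketch of how it is consumed
## below (same names as in the code; 'wrd' and 'ellis' are illustrative
## values only):
##
##   (recids, relevances, rel_prologue, rel_epilogue, comments) = \
##       rank_records('wrd', 0, hitset, ['ellis'], 0)
##   if recids:   # ranking succeeded: use the reordered hits
##       ...
##   else:        # ranking failed: 'comments' carries the message to show
##       ...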
results_final_recIDs_ranked, results_final_relevances, results_final_relevances_prologue, results_final_relevances_epilogue, results_final_comments = \ rank_records(rm, 0, results_final[coll], string.split(p) + string.split(p1) + string.split(p2) + string.split(p3), verbose) if of.startswith("h"): print_warning(req, results_final_comments) if results_final_recIDs_ranked: results_final_recIDs = results_final_recIDs_ranked else: # rank_records failed and returned some error message to display: print_warning(req, results_final_relevances_prologue) print_warning(req, results_final_relevances_epilogue) if len(results_final_recIDs) < CFG_WEBSEARCH_PREV_NEXT_HIT_LIMIT: results_final_colls.append(results_final_recIDs) else: wlqh_results_overlimit = 1 print_records(req, results_final_recIDs, jrec, rg, of, ot, ln, results_final_relevances, results_final_relevances_prologue, results_final_relevances_epilogue, search_pattern=p, print_records_prologue_p=False, print_records_epilogue_p=False, verbose=verbose, sf=sf, so=so, sp=sp, rm=rm) if of.startswith("h"): req.write(print_search_info(p, f, sf, so, sp, rm, of, ot, coll, results_final_nb[coll], jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, sc, pl_in_url, d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time, 1)) # store the last search results page if req and not isinstance(req, cStringIO.OutputType): session_param_set(req, 'websearch-last-query', req.unparsed_uri) if not wlqh_results_overlimit: # store list of results if user wants to display hits # in a single list, or store list of collections of records # if user displays hits split by collections: session_param_set(req, 'websearch-last-query-hits', results_final_colls) else: results_final_colls = [] session_param_set(req, 'websearch-last-query-hits', results_final_colls) #if hosted_colls and (of.startswith("h") or of.startswith("x")): if hosted_colls_actual_or_potential_results_p: if hosted_colls_results: # TODO: add a verbose message here for result in hosted_colls_true_results: if of.startswith("h"): req.write(print_hosted_search_info(p, f, sf, so, sp, rm, of, ot, result[0][1].name, results_final_nb[result[0][1].name], jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, sc, pl_in_url, d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time)) req.write(print_hosted_results(url_and_engine=result[0], ln=ln, of=of, req=req, limit=rg)) if of.startswith("h"): req.write(print_hosted_search_info(p, f, sf, so, sp, rm, of, ot, result[0][1].name, results_final_nb[result[0][1].name], jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, sc, pl_in_url, d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time, 1)) if hosted_colls_timeouts: # TODO: add a verbose message here # TODO: check if verbose messages still work when dealing with (re)calculations of timeouts (hosted_colls_timeouts_results, hosted_colls_timeouts_timeouts) = do_calculate_hosted_collections_results(req, ln, None, verbose, None, hosted_colls_timeouts, CFG_HOSTED_COLLECTION_TIMEOUT_POST_SEARCH) if hosted_colls_timeouts_results: for result in hosted_colls_timeouts_results: if result[1] == None or result[1] == False: ## these are the searches the returned no or zero results ## also print a nearest terms box, in case this is the only ## collection being searched and it returns no results? 
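## NB: -963 is the sentinel used throughout this function as the "number of
## results" of a hosted collection whose search timed out, so that
## print_hosted_search_info() can label it accordingly.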
if of.startswith("h"): req.write(print_hosted_search_info(p, f, sf, so, sp, rm, of, ot, result[0][1].name, -963, jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, sc, pl_in_url, d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time)) req.write(print_hosted_results(url_and_engine=result[0], ln=ln, of=of, req=req, no_records_found=True, limit=rg)) req.write(print_hosted_search_info(p, f, sf, so, sp, rm, of, ot, result[0][1].name, -963, jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, sc, pl_in_url, d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time, 1)) else: # these are the searches that actually returned results on time if of.startswith("h"): req.write(print_hosted_search_info(p, f, sf, so, sp, rm, of, ot, result[0][1].name, result[1], jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, sc, pl_in_url, d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time)) req.write(print_hosted_results(url_and_engine=result[0], ln=ln, of=of, req=req, limit=rg)) if of.startswith("h"): req.write(print_hosted_search_info(p, f, sf, so, sp, rm, of, ot, result[0][1].name, result[1], jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, sc, pl_in_url, d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time, 1)) if hosted_colls_timeouts_timeouts: for timeout in hosted_colls_timeouts_timeouts: if of.startswith("h"): req.write(print_hosted_search_info(p, f, sf, so, sp, rm, of, ot, timeout[1].name, -963, jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, sc, pl_in_url, d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time)) req.write(print_hosted_results(url_and_engine=timeout[0], ln=ln, of=of, req=req, search_timed_out=True, limit=rg)) req.write(print_hosted_search_info(p, f, sf, so, sp, rm, of, ot, timeout[1].name, -963, jrec, rg, aas, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2, sc, pl_in_url, d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time, 1)) print_records_epilogue(req, of) if f == "author" and of.startswith("h"): req.write(create_similarly_named_authors_link_box(p, ln)) # log query: try: id_query = log_query(req.remote_host, req.args, uid) if of.startswith("h") and id_query: if not of in ['hcs']: # display alert/RSS teaser for non-summary formats: user_info = collect_user_info(req) display_email_alert_part = True if user_info: if user_info['email'] == 'guest': if CFG_ACCESS_CONTROL_LEVEL_ACCOUNTS > 4: display_email_alert_part = False else: if not user_info['precached_usealerts']: display_email_alert_part = False req.write(websearch_templates.tmpl_alert_rss_teaser_box_for_query(id_query, \ ln=ln, display_email_alert_part=display_email_alert_part)) except: # do not log query if req is None (used by CLI interface) pass log_query_info("ss", p, f, colls_to_search, results_final_nb_total) # External searches if of.startswith("h"): if not of in ['hcs']: perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos) return page_end(req, of, ln) def perform_request_cache(req, action="show"): """Manipulates the search engine cache.""" req.content_type = "text/html" req.send_http_header() req.write("") out = "" out += "
def perform_request_cache(req, action="show"):
    """Manipulates the search engine cache."""
    req.content_type = "text/html"
    req.send_http_header()
    req.write("<html>")
    out = ""
    out += "<h1>Search Cache</h1>"
    # clear cache if requested:
    if action == "clear":
        search_results_cache.clear()
    req.write(out)
    # show collection reclist cache:
    out = "<h3>Collection reclist cache</h3>"
    out += "- collection table last updated: %s" % get_table_update_time('collection')
    out += "<br />- reclist cache timestamp: %s" % collection_reclist_cache.timestamp
    out += "<br />- reclist cache contents:"
    out += "<blockquote>"
    for coll in collection_reclist_cache.cache.keys():
        if collection_reclist_cache.cache[coll]:
            out += "%s (%d)<br />" % (coll, len(collection_reclist_cache.cache[coll]))
    out += "</blockquote>"
    req.write(out)
    # show search results cache:
    out = "<h3>Search Cache</h3>"
    out += "- search cache usage: %d queries cached (max. ~%d)" % \
           (len(search_results_cache.cache), CFG_WEBSEARCH_SEARCH_CACHE_SIZE)
    if len(search_results_cache.cache):
        out += "<br />- search cache contents:"
        out += "<blockquote>"
        for query, hitset in search_results_cache.cache.items():
            out += "<br />%s ... %s" % (query, hitset)
        out += """<p><a href="%s/search/cache?action=clear">clear search results cache</a>""" % CFG_SITE_URL
        out += "</blockquote>"
    req.write(out)
    # show field i18nname cache:
    out = "<h3>Field I18N names cache</h3>"
    out += "- fieldname table last updated: %s" % get_table_update_time('fieldname')
    out += "<br />- i18nname cache timestamp: %s" % field_i18nname_cache.timestamp
    out += "<br />- i18nname cache contents:"
    out += "<blockquote>"
    for field in field_i18nname_cache.cache.keys():
        for ln in field_i18nname_cache.cache[field].keys():
            out += "%s, %s = %s<br />" % (field, ln, field_i18nname_cache.cache[field][ln])
    out += "</blockquote>"
    req.write(out)
    # show collection i18nname cache:
    out = "<h3>Collection I18N names cache</h3>"
    out += "- collectionname table last updated: %s" % get_table_update_time('collectionname')
    out += "<br />- i18nname cache timestamp: %s" % collection_i18nname_cache.timestamp
    out += "<br />- i18nname cache contents:"
    out += "<blockquote>"
    for coll in collection_i18nname_cache.cache.keys():
        for ln in collection_i18nname_cache.cache[coll].keys():
            out += "%s, %s = %s<br />" % (coll, ln, collection_i18nname_cache.cache[coll][ln])
    out += "</blockquote>"
    req.write(out)
    req.write("</html>")
    return "\n"
def perform_request_log(req, date=""):
    """Display search log information for given date."""
    req.content_type = "text/html"
    req.send_http_header()
    req.write("<html>")
    req.write("<h1>Search Log</h1>")
    if date: # case A: display stats for a day
        yyyymmdd = string.atoi(date)
        req.write("<p><big><strong>Date: %d</strong></big></p>" % yyyymmdd)
        req.write("""<table border="1">""")
        req.write("<tr><td><strong>%s</strong></td><td><strong>%s</strong></td><td><strong>%s</strong></td><td><strong>%s</strong></td><td><strong>%s</strong></td><td><strong>%s</strong></td></tr>" % \
                  ("No.", "Time", "Pattern", "Field", "Collection", "Number of Hits"))
        # read file:
        p = os.popen("grep ^%d %s/search.log" % (yyyymmdd, CFG_LOGDIR), 'r')
        lines = p.readlines()
        p.close()
        # process lines:
        i = 0
        for line in lines:
            try:
                datetime, dummy_aas, p, f, c, nbhits = string.split(line, "#")
                i += 1
                req.write("<tr><td>#%d</td><td>%s:%s:%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>" \
                          % (i, datetime[8:10], datetime[10:12], datetime[12:], p, f, c, nbhits))
            except:
                pass # ignore eventual wrong log lines
        req.write("</table>")
    else: # case B: display summary stats per day
        yyyymm01 = int(time.strftime("%Y%m01", time.localtime()))
        yyyymmdd = int(time.strftime("%Y%m%d", time.localtime()))
        req.write("""<table border="1">""")
        req.write("<tr><td><strong>%s</strong></td><td><strong>%s</strong></td></tr>" % ("Day", "Number of Queries"))
        for day in range(yyyymm01, yyyymmdd + 1):
            p = os.popen("grep -c ^%d %s/search.log" % (day, CFG_LOGDIR), 'r')
            for line in p.readlines():
                req.write("""<tr><td>%s</td><td align="right"><a href="%s/search/log?date=%d">%s</a></td></tr>""" % \
                          (day, CFG_SITE_URL, day, line))
            p.close()
        req.write("</table>")
    req.write("</html>")
    return "\n"
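
# Each search.log line consumed by the grep above is '#'-separated, matching
# the string.split(line, "#") unpacking: a YYYYMMDDHHMMSS timestamp, the
# advanced-search flag, pattern, field, collection and hit count.  A small
# doctest-style sketch (the field values are invented for illustration):
#
#     >>> line = "20111103142512#0#ellis#author#Articles#12\n"
#     >>> datetime, dummy_aas, p, f, c, nbhits = line.split("#")
#     >>> (datetime[8:10], datetime[10:12], datetime[12:])
#     ('14', '25', '12')
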
    ") req.write("") return "\n" +def get_all_field_values(tag): + """ + Return all existing values stored for a given tag. + @param tag: the full tag, e.g. 909C0b + @type tag: string + @return: the list of values + @rtype: list of strings + """ + table = 'bib%2dx' % int(tag[:2]) + return [row[0] for row in run_sql("SELECT DISTINCT(value) FROM %s WHERE tag=%%s" % table, (tag, ))] def get_most_popular_field_values(recids, tags, exclude_values=None, count_repetitive_values=True): """ Analyze RECIDS and look for TAGS and return most popular values and the frequency with which they occur sorted according to descending frequency. If a value is found in EXCLUDE_VALUES, then do not count it. If COUNT_REPETITIVE_VALUES is True, then we count every occurrence of value in the tags. If False, then we count the value only once regardless of the number of times it may appear in a record. (But, if the same value occurs in another record, we count it, of course.) Example: >>> get_most_popular_field_values(range(11,20), '980__a') (('PREPRINT', 10), ('THESIS', 7), ...) >>> get_most_popular_field_values(range(11,20), ('100__a', '700__a')) (('Ellis, J', 10), ('Ellis, N', 7), ...) >>> get_most_popular_field_values(range(11,20), ('100__a', '700__a'), ('Ellis, J')) (('Ellis, N', 7), ...) """ def _get_most_popular_field_values_helper_sorter(val1, val2): "Compare VAL1 and VAL2 according to, firstly, frequency, then secondly, alphabetically." compared_via_frequencies = cmp(valuefreqdict[val2], valuefreqdict[val1]) if compared_via_frequencies == 0: return cmp(val1.lower(), val2.lower()) else: return compared_via_frequencies valuefreqdict = {} ## sanity check: if not exclude_values: exclude_values = [] if isinstance(tags, str): tags = (tags,) ## find values to count: vals_to_count = [] displaytmp = {} if count_repetitive_values: # counting technique A: can look up many records at once: (very fast) for tag in tags: vals_to_count.extend(get_fieldvalues(recids, tag)) else: # counting technique B: must count record-by-record: (slow) for recid in recids: vals_in_rec = [] for tag in tags: for val in get_fieldvalues(recid, tag, False): vals_in_rec.append(val) # do not count repetitive values within this record # (even across various tags, so need to unify again): dtmp = {} for val in vals_in_rec: dtmp[val.lower()] = 1 displaytmp[val.lower()] = val vals_in_rec = dtmp.keys() vals_to_count.extend(vals_in_rec) ## are we to exclude some of found values? 
def get_most_popular_field_values(recids, tags, exclude_values=None, count_repetitive_values=True):
    """
    Analyze RECIDS and look for TAGS and return most popular values
    and the frequency with which they occur sorted according to
    descending frequency.

    If a value is found in EXCLUDE_VALUES, then do not count it.

    If COUNT_REPETITIVE_VALUES is True, then we count every occurrence
    of value in the tags.  If False, then we count the value only once
    regardless of the number of times it may appear in a record.
    (But, if the same value occurs in another record, we count it, of
    course.)

    Example:
     >>> get_most_popular_field_values(range(11,20), '980__a')
     (('PREPRINT', 10), ('THESIS', 7), ...)
     >>> get_most_popular_field_values(range(11,20), ('100__a', '700__a'))
     (('Ellis, J', 10), ('Ellis, N', 7), ...)
     >>> get_most_popular_field_values(range(11,20), ('100__a', '700__a'), ('Ellis, J'))
     (('Ellis, N', 7), ...)
    """
    def _get_most_popular_field_values_helper_sorter(val1, val2):
        "Compare VAL1 and VAL2 according to, firstly, frequency, then secondly, alphabetically."
        compared_via_frequencies = cmp(valuefreqdict[val2], valuefreqdict[val1])
        if compared_via_frequencies == 0:
            return cmp(val1.lower(), val2.lower())
        else:
            return compared_via_frequencies

    valuefreqdict = {}
    ## sanity check:
    if not exclude_values:
        exclude_values = []
    if isinstance(tags, str):
        tags = (tags,)
    ## find values to count:
    vals_to_count = []
    displaytmp = {}
    if count_repetitive_values:
        # counting technique A: can look up many records at once: (very fast)
        for tag in tags:
            vals_to_count.extend(get_fieldvalues(recids, tag))
    else:
        # counting technique B: must count record-by-record: (slow)
        for recid in recids:
            vals_in_rec = []
            for tag in tags:
                for val in get_fieldvalues(recid, tag, False):
                    vals_in_rec.append(val)
            # do not count repetitive values within this record
            # (even across various tags, so need to unify again):
            dtmp = {}
            for val in vals_in_rec:
                dtmp[val.lower()] = 1
                displaytmp[val.lower()] = val
            vals_in_rec = dtmp.keys()
            vals_to_count.extend(vals_in_rec)
    ## are we to exclude some of found values?
    for val in vals_to_count:
        if val not in exclude_values:
            if valuefreqdict.has_key(val):
                valuefreqdict[val] += 1
            else:
                valuefreqdict[val] = 1
    ## sort by descending frequency of values:
    out = ()
    vals = valuefreqdict.keys()
    vals.sort(_get_most_popular_field_values_helper_sorter)
    for val in vals:
        tmpdisplv = ''
        if displaytmp.has_key(val):
            tmpdisplv = displaytmp[val]
        else:
            tmpdisplv = val
        out += (tmpdisplv, valuefreqdict[val]),
    return out

def profile(p="", f="", c=CFG_SITE_NAME):
    """Profile search time."""
    import profile
    import pstats
    profile.run("perform_request_search(p='%s',f='%s', c='%s')" % (p, f, c), "perform_request_search_profile")
    p = pstats.Stats("perform_request_search_profile")
    p.strip_dirs().sort_stats("cumulative").print_stats()
    return 0
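
# Usage sketch for get_most_popular_field_values above, mirroring the
# regression tests further below (record 41 of the Atlantis demo data); it
# shows how count_repetitive_values changes the tallies for the same input:
#
#     >>> get_most_popular_field_values((41,), ('690C_a', '980__a'),
#     ...                               count_repetitive_values=True)
#     (('THESIS', 2), ('REPORT', 1))
#     >>> get_most_popular_field_values((41,), ('690C_a', '980__a'),
#     ...                               count_repetitive_values=False)
#     (('REPORT', 1), ('THESIS', 1))
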
diff --git a/modules/websearch/lib/websearch_regression_tests.py b/modules/websearch/lib/websearch_regression_tests.py
index 41c182f72..5ad8e6d0e 100644
--- a/modules/websearch/lib/websearch_regression_tests.py
+++ b/modules/websearch/lib/websearch_regression_tests.py
@@ -1,1958 +1,1958 @@
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

# pylint: disable=C0301
# pylint: disable=E1102

"""WebSearch module regression tests."""

__revision__ = "$Id$"

import unittest
import re
import urlparse, cgi
import sys

if sys.hexversion < 0x2040000:
    # pylint: disable=W0622
    from sets import Set as set
    # pylint: enable=W0622

from mechanize import Browser, LinkNotFoundError

from invenio.config import CFG_SITE_URL, CFG_SITE_NAME, CFG_SITE_LANG, \
     CFG_SITE_RECORD, CFG_SITE_LANGS
from invenio.testutils import make_test_suite, \
                              run_test_suite, \
                              make_url, make_surl, test_web_page_content, \
                              merge_error_messages
from invenio.urlutils import same_urls_p
from invenio.search_engine import perform_request_search, \
     guess_primary_collection_of_a_record, guess_collection_of_a_record, \
     collection_restricted_p, get_permitted_restricted_collections, \
     search_pattern, search_unit, search_unit_in_bibrec, \
     wash_colls
from invenio.search_engine_utils import get_fieldvalues

if 'fr' in CFG_SITE_LANGS:
    lang_french_configured = True
else:
    lang_french_configured = False

def parse_url(url):
    parts = urlparse.urlparse(url)
    query = cgi.parse_qs(parts[4], True)
    return parts[2].split('/')[1:], query
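
# parse_url returns the path components after the site root plus the parsed
# query arguments; a doctest-style sketch (URL invented for illustration):
#
#     >>> parse_url('http://example.org/search?p=ellis&of=id')
#     (['search'], {'p': ['ellis'], 'of': ['id']})
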
"""websearch - availability of Search Tips in french""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help/search-tips?ln=fr', expected_text="Conseils de recherche")) def test_search_guide_page_availability(self): """websearch - availability of Search Guide""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help/search-guide', expected_text="Search Guide")) if lang_french_configured: def test_search_guide_page_availability_fr(self): """websearch - availability of Search Guide in french""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help/search-guide?ln=fr', expected_text="Guide de recherche")) class WebSearchTestLegacyURLs(unittest.TestCase): """ Check that the application still responds to legacy URLs for navigating, searching and browsing.""" def test_legacy_collections(self): """ websearch - collections handle legacy urls """ browser = Browser() def check(legacy, new, browser=browser): browser.open(legacy) got = browser.geturl() self.failUnless(same_urls_p(got, new), got) # Use the root URL unless we need more check(make_url('/', c=CFG_SITE_NAME), make_url('/', ln=CFG_SITE_LANG)) # Other collections are redirected in the /collection area check(make_url('/', c='Poetry'), make_url('/collection/Poetry', ln=CFG_SITE_LANG)) # Drop unnecessary arguments, like ln and as (when they are # the default value) args = {'as': 0} check(make_url('/', c='Poetry', **args), make_url('/collection/Poetry', ln=CFG_SITE_LANG)) # Otherwise, keep them args = {'as': 1, 'ln': CFG_SITE_LANG} check(make_url('/', c='Poetry', **args), make_url('/collection/Poetry', **args)) # Support the /index.py addressing too check(make_url('/index.py', c='Poetry'), make_url('/collection/Poetry', ln=CFG_SITE_LANG)) def test_legacy_search(self): """ websearch - search queries handle legacy urls """ browser = Browser() def check(legacy, new, browser=browser): browser.open(legacy) got = browser.geturl() self.failUnless(same_urls_p(got, new), got) # /search.py is redirected on /search # Note that `as' is a reserved word in Python 2.5 check(make_url('/search.py', p='nuclear', ln='en') + 'as=1', make_url('/search', p='nuclear', ln='en') + 'as=1') if lang_french_configured: def test_legacy_search_fr(self): """ websearch - search queries handle legacy urls """ browser = Browser() def check(legacy, new, browser=browser): browser.open(legacy) got = browser.geturl() self.failUnless(same_urls_p(got, new), got) # direct recid searches are redirected to /CFG_SITE_RECORD check(make_url('/search.py', recid=1, ln='fr'), make_url('/%s/1' % CFG_SITE_RECORD, ln='fr')) def test_legacy_search_help_link(self): """websearch - legacy Search Help page link""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help/search/index.en.html', expected_text="Help Central")) if lang_french_configured: def test_legacy_search_tips_link(self): """websearch - legacy Search Tips page link""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help/search/tips.fr.html', expected_text="Conseils de recherche")) def test_legacy_search_guide_link(self): """websearch - legacy Search Guide page link""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/help/search/guide.en.html', expected_text="Search Guide")) class WebSearchTestRecord(unittest.TestCase): """ Check the interface of the /CFG_SITE_RECORD results """ def test_format_links(self): """ websearch - check format links for records """ browser = Browser() # We open the record in all known HTML formats for hformat in ('hd', 'hx', 'hm'): 
browser.open(make_url('/%s/1' % CFG_SITE_RECORD, of=hformat)) if hformat == 'hd': # hd format should have a link to the following # formats for oformat in ('hx', 'hm', 'xm', 'xd'): target = make_url('/%s/1/export/%s?ln=en' % (CFG_SITE_RECORD, oformat)) try: browser.find_link(url=target) except LinkNotFoundError: self.fail('link %r should be in page' % target) else: # non-hd HTML formats should have a link back to # the main detailed record target = make_url('/%s/1' % CFG_SITE_RECORD) try: browser.find_link(url=target) except LinkNotFoundError: self.fail('link %r should be in page' % target) return def test_exported_formats(self): """ websearch - check formats exported through /CFG_SITE_RECORD/1/export/ URLs""" self.assertEqual([], test_web_page_content(make_url('/%s/1/export/hm' % CFG_SITE_RECORD), expected_text='245__ $$aALEPH experiment')) self.assertEqual([], test_web_page_content(make_url('/%s/1/export/hd' % CFG_SITE_RECORD), expected_text='ALEPH experiment')) self.assertEqual([], test_web_page_content(make_url('/%s/1/export/xm' % CFG_SITE_RECORD), expected_text='ALEPH experiment')) self.assertEqual([], test_web_page_content(make_url('/%s/1/export/xd' % CFG_SITE_RECORD), expected_text='ALEPH experiment')) self.assertEqual([], test_web_page_content(make_url('/%s/1/export/hs' % CFG_SITE_RECORD), expected_text='ALEPH experiment' % \ (CFG_SITE_RECORD, CFG_SITE_LANG))) self.assertEqual([], test_web_page_content(make_url('/%s/1/export/hx' % CFG_SITE_RECORD), expected_text='title = "ALEPH experiment')) self.assertEqual([], test_web_page_content(make_url('/%s/1/export/t?ot=245' % CFG_SITE_RECORD), expected_text='245__ $$aALEPH experiment')) self.assertNotEqual([], test_web_page_content(make_url('/%s/1/export/t?ot=245' % CFG_SITE_RECORD), expected_text='001__')) self.assertEqual([], test_web_page_content(make_url('/%s/1/export/h?ot=245' % CFG_SITE_RECORD), expected_text='245__ $$aALEPH experiment')) self.assertNotEqual([], test_web_page_content(make_url('/%s/1/export/h?ot=245' % CFG_SITE_RECORD), expected_text='001__')) return def test_plots_tab(self): """ websearch - test to ensure the plots tab is working """ self.assertEqual([], test_web_page_content(make_url('/%s/8/plots' % CFG_SITE_RECORD), expected_text='div id="clip"', unexpected_text='Abstract')) class WebSearchTestCollections(unittest.TestCase): def test_traversal_links(self): """ websearch - traverse all the publications of a collection """ browser = Browser() try: for aas in (0, 1): args = {'as': aas} browser.open(make_url('/collection/Preprints', **args)) for jrec in (11, 21, 11, 28): args = {'jrec': jrec, 'cc': 'Preprints'} if aas: args['as'] = aas url = make_url('/search', **args) try: browser.follow_link(url=url) except LinkNotFoundError: args['ln'] = CFG_SITE_LANG url = make_url('/search', **args) browser.follow_link(url=url) except LinkNotFoundError: self.fail('no link %r in %r' % (url, browser.geturl())) def test_collections_links(self): """ websearch - enter in collections and subcollections """ browser = Browser() def tryfollow(url): cur = browser.geturl() body = browser.response().read() try: browser.follow_link(url=url) except LinkNotFoundError: print body self.fail("in %r: could not find %r" % ( cur, url)) return for aas in (0, 1): if aas: kargs = {'as': 1} else: kargs = {} kargs['ln'] = CFG_SITE_LANG # We navigate from immediate son to immediate son... 
browser.open(make_url('/', **kargs)) tryfollow(make_url('/collection/Articles%20%26%20Preprints', **kargs)) tryfollow(make_url('/collection/Articles', **kargs)) # But we can also jump to a grandson immediately browser.back() browser.back() tryfollow(make_url('/collection/ALEPH', **kargs)) return def test_records_links(self): """ websearch - check the links toward records in leaf collections """ browser = Browser() browser.open(make_url('/collection/Preprints')) def harvest(): """ Parse all the links in the page, and check that for each link to a detailed record, we also have the corresponding link to the similar records.""" records = set() similar = set() for link in browser.links(): path, q = parse_url(link.url) if not path: continue if path[0] == CFG_SITE_RECORD: records.add(int(path[1])) continue if path[0] == 'search': if not q.get('rm') == ['wrd']: continue recid = q['p'][0].split(':')[1] similar.add(int(recid)) self.failUnlessEqual(records, similar) return records # We must have 10 links to the corresponding /CFG_SITE_RECORD found = harvest() self.failUnlessEqual(len(found), 10) # When clicking on the "Search" button, we must also have # these 10 links on the records. browser.select_form(name="search") browser.submit() found = harvest() self.failUnlessEqual(len(found), 10) return class WebSearchTestBrowse(unittest.TestCase): def test_browse_field(self): """ websearch - check that browsing works """ browser = Browser() browser.open(make_url('/')) browser.select_form(name='search') browser['f'] = ['title'] browser.submit(name='action_browse') def collect(): # We'll get a few links to search for the actual hits, plus a # link to the following results. res = [] for link in browser.links(url_regex=re.compile(CFG_SITE_URL + r'/search\?')): if link.text == 'Advanced Search': continue dummy, q = parse_url(link.url) res.append((link, q)) return res # if we follow the last link, we should get another # batch. There is an overlap of one item. 
batch_1 = collect() browser.follow_link(link=batch_1[-1][0]) batch_2 = collect() # FIXME: we cannot compare the whole query, as the collection # set is not equal self.failUnlessEqual(batch_1[-2][1]['p'], batch_2[0][1]['p']) class WebSearchTestOpenURL(unittest.TestCase): def test_isbn_01(self): """ websearch - isbn query via OpenURL 0.1""" browser = Browser() # We do a precise search in an isolated collection browser.open(make_url('/openurl', isbn='0387940758')) dummy, current_q = parse_url(browser.geturl()) self.failUnlessEqual(current_q, { 'sc' : ['1'], 'p' : ['isbn:"0387940758"'], 'of' : ['hd'] }) def test_isbn_10_rft_id(self): """ websearch - isbn query via OpenURL 1.0 - rft_id""" browser = Browser() # We do a precise search in an isolated collection browser.open(make_url('/openurl', rft_id='urn:ISBN:0387940758')) dummy, current_q = parse_url(browser.geturl()) self.failUnlessEqual(current_q, { 'sc' : ['1'], 'p' : ['isbn:"0387940758"'], 'of' : ['hd'] }) def test_isbn_10(self): """ websearch - isbn query via OpenURL 1.0""" browser = Browser() # We do a precise search in an isolated collection browser.open(make_url('/openurl?rft.isbn=0387940758')) dummy, current_q = parse_url(browser.geturl()) self.failUnlessEqual(current_q, { 'sc' : ['1'], 'p' : ['isbn:"0387940758"'], 'of' : ['hd'] }) class WebSearchTestSearch(unittest.TestCase): def test_hits_in_other_collection(self): """ websearch - check extension of a query to the home collection """ browser = Browser() # We do a precise search in an isolated collection browser.open(make_url('/collection/ISOLDE', ln='en')) browser.select_form(name='search') browser['f'] = ['author'] browser['p'] = 'matsubara' browser.submit() dummy, current_q = parse_url(browser.geturl()) link = browser.find_link(text_regex=re.compile('.*hit', re.I)) dummy, target_q = parse_url(link.url) # the target query should be the current query without any c # or cc specified. for f in ('cc', 'c', 'action_search'): if f in current_q: del current_q[f] self.failUnlessEqual(current_q, target_q) def test_nearest_terms(self): """ websearch - provide a list of nearest terms """ browser = Browser() browser.open(make_url('')) # Search something weird browser.select_form(name='search') browser['p'] = 'gronf' browser.submit() dummy, original = parse_url(browser.geturl()) for to_drop in ('cc', 'action_search', 'f'): if to_drop in original: del original[to_drop] if 'ln' not in original: original['ln'] = [CFG_SITE_LANG] # we should get a few searches back, which are identical # except for the p field being substituted (and the cc field # being dropped). 
class WebSearchTestSearch(unittest.TestCase):

    def test_hits_in_other_collection(self):
        """ websearch - check extension of a query to the home collection """
        browser = Browser()

        # We do a precise search in an isolated collection
        browser.open(make_url('/collection/ISOLDE', ln='en'))

        browser.select_form(name='search')
        browser['f'] = ['author']
        browser['p'] = 'matsubara'
        browser.submit()

        dummy, current_q = parse_url(browser.geturl())

        link = browser.find_link(text_regex=re.compile('.*hit', re.I))
        dummy, target_q = parse_url(link.url)

        # the target query should be the current query without any c
        # or cc specified.
        for f in ('cc', 'c', 'action_search'):
            if f in current_q:
                del current_q[f]

        self.failUnlessEqual(current_q, target_q)

    def test_nearest_terms(self):
        """ websearch - provide a list of nearest terms """
        browser = Browser()
        browser.open(make_url(''))

        # Search something weird
        browser.select_form(name='search')
        browser['p'] = 'gronf'
        browser.submit()

        dummy, original = parse_url(browser.geturl())

        for to_drop in ('cc', 'action_search', 'f'):
            if to_drop in original:
                del original[to_drop]

        if 'ln' not in original:
            original['ln'] = [CFG_SITE_LANG]

        # we should get a few searches back, which are identical
        # except for the p field being substituted (and the cc field
        # being dropped).
        if 'cc' in original:
            del original['cc']

        for link in browser.links(url_regex=re.compile(CFG_SITE_URL + r'/search\?')):
            if link.text == 'Advanced Search':
                continue

            dummy, target = parse_url(link.url)

            if 'ln' not in target:
                target['ln'] = [CFG_SITE_LANG]

            original['p'] = [link.text]
            self.failUnlessEqual(original, target)

        return

    def test_switch_to_simple_search(self):
        """ websearch - switch to simple search """
        browser = Browser()
        args = {'as': 1}
        browser.open(make_url('/collection/ISOLDE', **args))

        browser.select_form(name='search')
        browser['p1'] = 'tandem'
        browser['f1'] = ['title']
        browser.submit()

        browser.follow_link(text='Simple Search')

        dummy, q = parse_url(browser.geturl())

        self.failUnlessEqual(q, {'cc': ['ISOLDE'],
                                 'p': ['tandem'],
                                 'f': ['title'],
                                 'ln': ['en']})

    def test_switch_to_advanced_search(self):
        """ websearch - switch to advanced search """
        browser = Browser()
        browser.open(make_url('/collection/ISOLDE'))

        browser.select_form(name='search')
        browser['p'] = 'tandem'
        browser['f'] = ['title']
        browser.submit()

        browser.follow_link(text='Advanced Search')

        dummy, q = parse_url(browser.geturl())

        self.failUnlessEqual(q, {'cc': ['ISOLDE'],
                                 'p1': ['tandem'],
                                 'f1': ['title'],
                                 'as': ['1'],
                                 'ln' : ['en']})

    def test_no_boolean_hits(self):
        """ websearch - check the 'no boolean hits' proposed links """
        browser = Browser()
        browser.open(make_url(''))

        browser.select_form(name='search')
        browser['p'] = 'quasinormal muon'
        browser.submit()

        dummy, q = parse_url(browser.geturl())

        for to_drop in ('cc', 'action_search', 'f'):
            if to_drop in q:
                del q[to_drop]

        for bsu in ('quasinormal', 'muon'):
            l = browser.find_link(text=bsu)
            q['p'] = bsu

            if not same_urls_p(l.url, make_url('/search', **q)):
                self.fail(repr((l.url, make_url('/search', **q))))

    def test_similar_authors(self):
        """ websearch - test similar authors box """
        browser = Browser()
        browser.open(make_url(''))

        browser.select_form(name='search')
        browser['p'] = 'Ellis, R K'
        browser['f'] = ['author']
        browser.submit()

        l = browser.find_link(text="Ellis, R S")
        self.failUnless(same_urls_p(l.url, make_url('/search',
                                                    p="Ellis, R S",
                                                    f='author',
                                                    ln='en')))

class WebSearchTestWildcardLimit(unittest.TestCase):
    """Checks if the wildcard limit is correctly passed and that
    users without authorization can not exploit it"""

    def test_wildcard_limit_correctly_passed_when_not_set(self):
        """websearch - wildcard limit is correctly passed when default"""
        self.assertEqual(search_pattern(p='e*', f='author'),
                         search_pattern(p='e*', f='author', wl=1000))

    def test_wildcard_limit_correctly_passed_when_set(self):
        """websearch - wildcard limit is correctly passed when set"""
        self.assertEqual([],
                         test_web_page_content(CFG_SITE_URL + '/search?p=e*&f=author&of=id&wl=5',
                                               expected_text="[9, 10, 11, 17, 46, 48, 50, 51, 52, 53, 54, 67, 72, 74, 81, 88, 92, 96]"))

    def test_wildcard_limit_correctly_not_active(self):
        """websearch - wildcard limit is not active when there is no wildcard query"""
        self.assertEqual(search_pattern(p='ellis', f='author'),
                         search_pattern(p='ellis', f='author', wl=1))

    def test_wildcard_limit_increased_by_authorized_users(self):
        """websearch - wildcard limit increased by authorized user"""

        browser = Browser()

        #try a search query, with no wildcard limit set by the user
        browser.open(make_url('/search?p=a*&of=id'))
        recid_list_guest_no_limit = browser.response().read() # so the limit is CFG_WEBSEARCH_WILDCARD_LIMIT

        #try a search query, with a wildcard limit imposed by the user
        #wl=1000000 - a very high limit, higher than what the CFG_WEBSEARCH_WILDCARD_LIMIT might be
        browser.open(make_url('/search?p=a*&of=id&wl=1000000'))
        recid_list_guest_with_limit = browser.response().read()

        #same results should be returned for a search without the wildcard limit set by the user
        #and for a search with a large limit set by the user
        #in this way we know that no matter how large the limit is, the wildcard query will be
        #limited by CFG_WEBSEARCH_WILDCARD_LIMIT (for a guest user)
        self.failIf(len(recid_list_guest_no_limit.split(',')) != len(recid_list_guest_with_limit.split(',')))

        ##login as admin
        browser.open(make_surl('/youraccount/login'))
        browser.select_form(nr=0)
        browser['p_un'] = 'admin'
        browser['p_pw'] = ''
        browser.submit()

        #try a search query, with a wildcard limit imposed by an authorized user
        #wl = 10000 a very high limit, higher than what the CFG_WEBSEARCH_WILDCARD_LIMIT might be
        browser.open(make_surl('/search?p=a*&of=id&wl=10000'))
        recid_list_authuser_with_limit = browser.response().read()

        #the authorized user can set whatever limit he might wish
        #so, the results returned for the auth. users should exceed the results returned for unauth. users
        self.failUnless(len(recid_list_guest_no_limit.split(',')) <= len(recid_list_authuser_with_limit.split(',')))

        #logout
        browser.open(make_surl('/youraccount/logout'))
        browser.response().read()
        browser.close()
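
# The wl URL argument caps how many index terms a wildcard like 'a*' may
# expand to.  What the class above relies on, in short (URLs relative to
# CFG_SITE_URL):
#
#     /search?p=a*&of=id             guest: capped at CFG_WEBSEARCH_WILDCARD_LIMIT
#     /search?p=a*&of=id&wl=1000000  guest: still capped at the site-wide limit
#     /search?p=a*&of=id&wl=10000    authorized user: the requested limit is honoured
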
record")) def test_nearest_terms_box_in_unsuccessful_phrase_query(self): """ websearch - nearest terms box for unsuccessful phrase query """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=author%3A%22Ellis%2C+Z%22', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=author%3A%22Enqvist%2C+K%22", expected_link_label='Enqvist, K')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=%22ellisz%22&f=author', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=%22Enqvist%2C+K%22&f=author", expected_link_label='Enqvist, K')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=%22elliszà%22&f=author', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=%22Enqvist%2C+K%22&f=author", expected_link_label='Enqvist, K')) def test_nearest_terms_box_in_unsuccessful_partial_phrase_query(self): """ websearch - nearest terms box for unsuccessful partial phrase query """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=author%3A%27Ellis%2C+Z%27', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=author%3A%27Enqvist%2C+K%27", expected_link_label='Enqvist, K')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=%27ellisz%27&f=author', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=%27Enqvist%2C+K%27&f=author", expected_link_label='Enqvist, K')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=%27elliszà%27&f=author', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=%27Enqvist%2C+K%27&f=author", expected_link_label='Enqvist, K')) def test_nearest_terms_box_in_unsuccessful_partial_phrase_advanced_query(self): """ websearch - nearest terms box for unsuccessful partial phrase advanced search query """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p1=aaa&f1=title&m1=p&as=1', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&f1=title&as=1&p1=A+simple+functional+form+for+proton-nucleus+total+reaction+cross+sections&m1=p", expected_link_label='A simple functional form for proton-nucleus total reaction cross sections')) def test_nearest_terms_box_in_unsuccessful_exact_phrase_advanced_query(self): """ websearch - nearest terms box for unsuccessful exact phrase advanced search query """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p1=aaa&f1=title&m1=e&as=1', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&f1=title&as=1&p1=A+simple+functional+form+for+proton-nucleus+total+reaction+cross+sections&m1=e", expected_link_label='A simple functional form for proton-nucleus total reaction cross sections')) def test_nearest_terms_box_in_unsuccessful_boolean_query(self): """ websearch - nearest terms box for unsuccessful boolean query """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=title%3Aellisz+author%3Aellisz', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=title%3Aenergi+author%3Aellisz", expected_link_label='energi')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=title%3Aenergi+author%3Aenergie', expected_text="Nearest terms in any collection are", 
expected_link_target=CFG_SITE_URL+"/search?ln=en&p=title%3Aenergi+author%3Aenqvist", expected_link_label='enqvist')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?ln=en&p=title%3Aellisz+author%3Aellisz&f=keyword', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=title%3Aenergi+author%3Aellisz&f=keyword", expected_link_label='energi')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?ln=en&p=title%3Aenergi+author%3Aenergie&f=keyword', expected_text="Nearest terms in any collection are", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=title%3Aenergi+author%3Aenqvist&f=keyword", expected_link_label='enqvist')) class WebSearchBooleanQueryTest(unittest.TestCase): """Check various boolean queries.""" def test_successful_boolean_query(self): """ websearch - successful boolean query """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis+muon', expected_text="records found", expected_link_label="Detailed record")) def test_unsuccessful_boolean_query_where_all_individual_terms_match(self): """ websearch - unsuccessful boolean query where all individual terms match """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis+muon+letter', expected_text="Boolean query returned no hits. Please combine your search terms differently.")) class WebSearchAuthorQueryTest(unittest.TestCase): """Check various author-related queries.""" def test_propose_similar_author_names_box(self): """ websearch - propose similar author names box """ self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=Ellis%2C+R&f=author', expected_text="See also: similar author names", expected_link_target=CFG_SITE_URL+"/search?ln=en&p=Ellis%2C+R+K&f=author", expected_link_label="Ellis, R K")) def test_do_not_propose_similar_author_names_box(self): """ websearch - do not propose similar author names box """ errmsgs = test_web_page_content(CFG_SITE_URL + '/search?p=author%3A%22Ellis%2C+R%22', expected_link_target=CFG_SITE_URL+"/search?ln=en&p=Ellis%2C+R+K&f=author", expected_link_label="Ellis, R K") if errmsgs[0].find("does not contain link to") > -1: pass else: self.fail("Should not propose similar author names box.") return class WebSearchSearchEnginePythonAPITest(unittest.TestCase): """Check typical search engine Python API calls on the demo data.""" def test_search_engine_python_api_for_failed_query(self): """websearch - search engine Python API for failed query""" self.assertEqual([], perform_request_search(p='aoeuidhtns')) def test_search_engine_python_api_for_successful_query(self): """websearch - search engine Python API for successful query""" self.assertEqual([8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 47], perform_request_search(p='ellis')) def test_search_engine_python_api_for_existing_record(self): """websearch - search engine Python API for existing record""" self.assertEqual([8], perform_request_search(recid=8)) def test_search_engine_python_api_for_nonexisting_record(self): """websearch - search engine Python API for non-existing record""" self.assertEqual([], perform_request_search(recid=16777215)) def test_search_engine_python_api_for_nonexisting_collection(self): """websearch - search engine Python API for non-existing collection""" self.assertEqual([], perform_request_search(c='Foo')) def test_search_engine_python_api_for_range_of_records(self): """websearch - search engine Python API for range of records""" self.assertEqual([1, 2, 3, 4, 5, 6, 7, 8, 9], 
class WebSearchSearchEnginePythonAPITest(unittest.TestCase):
    """Check typical search engine Python API calls on the demo data."""

    def test_search_engine_python_api_for_failed_query(self):
        """websearch - search engine Python API for failed query"""
        self.assertEqual([],
                         perform_request_search(p='aoeuidhtns'))

    def test_search_engine_python_api_for_successful_query(self):
        """websearch - search engine Python API for successful query"""
        self.assertEqual([8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 47],
                         perform_request_search(p='ellis'))

    def test_search_engine_python_api_for_existing_record(self):
        """websearch - search engine Python API for existing record"""
        self.assertEqual([8],
                         perform_request_search(recid=8))

    def test_search_engine_python_api_for_nonexisting_record(self):
        """websearch - search engine Python API for non-existing record"""
        self.assertEqual([],
                         perform_request_search(recid=16777215))

    def test_search_engine_python_api_for_nonexisting_collection(self):
        """websearch - search engine Python API for non-existing collection"""
        self.assertEqual([],
                         perform_request_search(c='Foo'))

    def test_search_engine_python_api_for_range_of_records(self):
        """websearch - search engine Python API for range of records"""
        self.assertEqual([1, 2, 3, 4, 5, 6, 7, 8, 9],
                         perform_request_search(recid=1, recidb=10))

    def test_search_engine_python_api_ranked_by_citation(self):
        """websearch - search engine Python API for citation ranking"""
        self.assertEqual([82, 83, 87, 89],
                         perform_request_search(p='recid:81', rm='citation'))

    def test_search_engine_python_api_textmarc(self):
        """websearch - search engine Python API for Text MARC output"""
        # we are testing example from /help/hacking/search-engine-api
        import cStringIO
        tmp = cStringIO.StringIO()
        perform_request_search(req=tmp, p='higgs', of='tm', ot=['100', '700'])
        out = tmp.getvalue()
        tmp.close()
        self.assertEqual(out, """\
000000085 100__ $$aGirardello, L$$uINFN$$uUniversita di Milano-Bicocca
000000085 700__ $$aPorrati, Massimo
000000085 700__ $$aZaffaroni, A
000000001 100__ $$aPhotolab
""")

class WebSearchSearchEngineWebAPITest(unittest.TestCase):
    """Check typical search engine Web API calls on the demo data."""

    def test_search_engine_web_api_for_failed_query(self):
        """websearch - search engine Web API for failed query"""
        self.assertEqual([],
                         test_web_page_content(CFG_SITE_URL + '/search?p=aoeuidhtns&of=id',
                                               expected_text="[]"))

    def test_search_engine_web_api_for_successful_query(self):
        """websearch - search engine Web API for successful query"""
        self.assertEqual([],
                         test_web_page_content(CFG_SITE_URL + '/search?p=ellis&of=id',
                                               expected_text="[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 47]"))

    def test_search_engine_web_api_for_existing_record(self):
        """websearch - search engine Web API for existing record"""
        self.assertEqual([],
                         test_web_page_content(CFG_SITE_URL + '/search?recid=8&of=id',
                                               expected_text="[8]"))

    def test_search_engine_web_api_for_nonexisting_record(self):
        """websearch - search engine Web API for non-existing record"""
        self.assertEqual([],
                         test_web_page_content(CFG_SITE_URL + '/search?recid=123456789&of=id',
                                               expected_text="[]"))

    def test_search_engine_web_api_for_nonexisting_collection(self):
        """websearch - search engine Web API for non-existing collection"""
        self.assertEqual([],
                         test_web_page_content(CFG_SITE_URL + '/search?c=Foo&of=id',
                                               expected_text="[]"))

    def test_search_engine_web_api_for_range_of_records(self):
        """websearch - search engine Web API for range of records"""
        self.assertEqual([],
                         test_web_page_content(CFG_SITE_URL + '/search?recid=1&recidb=10&of=id',
                                               expected_text="[1, 2, 3, 4, 5, 6, 7, 8, 9]"))

class WebSearchRestrictedCollectionTest(unittest.TestCase):
    """Test of the restricted Theses collection behaviour."""

    def test_restricted_collection_interface_page(self):
        """websearch - restricted collection interface page body"""
        # there should be no Latest additions box for restricted collections
        self.assertNotEqual([],
                            test_web_page_content(CFG_SITE_URL + '/collection/Theses',
                                                  expected_text="Latest additions"))

    def test_restricted_search_as_anonymous_guest(self):
        """websearch - restricted collection not searchable by anonymous guest"""
        browser = Browser()
        browser.open(CFG_SITE_URL + '/search?c=Theses')
        response = browser.response().read()
        if response.find("If you think you have right to access it, please authenticate yourself.") > -1:
            pass
        else:
            self.fail("Oops, searching restricted collection without password should have redirected to login dialog.")
        return

    def test_restricted_search_as_authorized_person(self):
        """websearch - restricted collection searchable by authorized person"""
        browser = Browser()
        browser.open(CFG_SITE_URL + '/search?c=Theses')
        browser.select_form(nr=0)
        browser['p_un'] = 'jekyll'
        browser['p_pw'] = 'j123ekyll'
        browser.submit()
        if browser.response().read().find("records found") > -1:
            pass
        else:
            self.fail("Oops, Dr. Jekyll should be able to search Theses collection.")

    def test_restricted_search_as_unauthorized_person(self):
        """websearch - restricted collection not searchable by unauthorized person"""
        browser = Browser()
        browser.open(CFG_SITE_URL + '/search?c=Theses')
        browser.select_form(nr=0)
        browser['p_un'] = 'hyde'
        browser['p_pw'] = 'h123yde'
        browser.submit()
        # Mr. Hyde should not be able to connect:
        if browser.response().read().find("Authorization failure") <= -1:
            # if we got here, things are broken:
            self.fail("Oops, Mr. Hyde should not be able to search Theses collection.")

    def test_restricted_detailed_record_page_as_anonymous_guest(self):
        """websearch - restricted detailed record page not accessible to guests"""
        browser = Browser()
        browser.open(CFG_SITE_URL + '/%s/35' % CFG_SITE_RECORD)
        if browser.response().read().find("You can use your nickname or your email address to login.") > -1:
            pass
        else:
            self.fail("Oops, accessing restricted detailed record page without password should have redirected to login dialog.")
        return

    def test_restricted_detailed_record_page_as_authorized_person(self):
        """websearch - restricted detailed record page accessible to authorized person"""
        browser = Browser()
        browser.open(CFG_SITE_URL + '/youraccount/login')
        browser.select_form(nr=0)
        browser['p_un'] = 'jekyll'
        browser['p_pw'] = 'j123ekyll'
        browser.submit()
        browser.open(CFG_SITE_URL + '/%s/35' % CFG_SITE_RECORD)
        # Dr. Jekyll should be able to connect
        # (add the pw to the whole CFG_SITE_URL because we shall be
        # redirected to '/reordrestricted/'):
        if browser.response().read().find("A High-performance Video Browsing System") > -1:
            pass
        else:
            self.fail("Oops, Dr. Jekyll should be able to access restricted detailed record page.")

    def test_restricted_detailed_record_page_as_unauthorized_person(self):
        """websearch - restricted detailed record page not accessible to unauthorized person"""
        browser = Browser()
        browser.open(CFG_SITE_URL + '/youraccount/login')
        browser.select_form(nr=0)
        browser['p_un'] = 'hyde'
        browser['p_pw'] = 'h123yde'
        browser.submit()
        browser.open(CFG_SITE_URL + '/%s/35' % CFG_SITE_RECORD)
        # Mr. Hyde should not be able to connect:
        if browser.response().read().find('You are not authorized') <= -1:
            # if we got here, things are broken:
            self.fail("Oops, Mr. Hyde should not be able to access restricted detailed record page.")

    def test_collection_restricted_p(self):
        """websearch - collection_restricted_p"""
        self.failUnless(collection_restricted_p('Theses'), True)
        self.failIf(collection_restricted_p('Books & Reports'))

    def test_get_permitted_restricted_collections(self):
        """websearch - get_permitted_restricted_collections"""
        from invenio.webuser import get_uid_from_email, collect_user_info
        self.assertEqual(get_permitted_restricted_collections(collect_user_info(get_uid_from_email('jekyll@cds.cern.ch'))), ['Theses'])
        self.assertEqual(get_permitted_restricted_collections(collect_user_info(get_uid_from_email('hyde@cds.cern.ch'))), [])

class WebSearchRestrictedPicturesTest(unittest.TestCase):
    """
    Check whether restricted pictures on the demo site can be accessed
    well by people who have rights to access them.
    """

    def test_restricted_pictures_guest(self):
        """websearch - restricted pictures not available to guest"""
        error_messages = test_web_page_content(CFG_SITE_URL + '/%s/1/files/0106015_01.jpg' % CFG_SITE_RECORD,
                                               expected_text=['This file is restricted.  If you think you have right to access it, please authenticate yourself.'])
        if error_messages:
            self.fail(merge_error_messages(error_messages))

    def test_restricted_pictures_romeo(self):
        """websearch - restricted pictures available to Romeo"""
        error_messages = test_web_page_content(CFG_SITE_URL + '/%s/1/files/0106015_01.jpg' % CFG_SITE_RECORD,
                                               username='romeo',
                                               password='r123omeo',
                                               expected_text=[],
                                               unexpected_text=['This file is restricted',
                                                                'You are not authorized'])
        if error_messages:
            self.fail(merge_error_messages(error_messages))

    def test_restricted_pictures_hyde(self):
        """websearch - restricted pictures not available to Mr. Hyde"""
        error_messages = test_web_page_content(CFG_SITE_URL + '/%s/1/files/0106015_01.jpg' % CFG_SITE_RECORD,
                                               username='hyde',
                                               password='h123yde',
                                               expected_text=['This file is restricted',
                                                              'You are not authorized'])
        if error_messages:
            self.failUnless("HTTP Error 401: Unauthorized" in merge_error_messages(error_messages))

class WebSearchRSSFeedServiceTest(unittest.TestCase):
    """Test of the RSS feed service."""

    def test_rss_feed_service(self):
        """websearch - RSS feed service"""
        self.assertEqual([],
                         test_web_page_content(CFG_SITE_URL + '/rss',
                                               expected_text='<rss version="2.0"'))

class WebSearchResultsOverview(unittest.TestCase):
    """Test of the search results page's results overview box and links."""

    def test_results_overview_split_off(self):
        """websearch - results overview box when split by collection is off"""
        browser = Browser()
        browser.open(CFG_SITE_URL + '/search?p=of&sc=0')
        body = browser.response().read()
        if body.find("Results overview") > -1:
            self.fail("Oops, when split by collection is off, "
                      "results overview should not be present.")
        if body.find('<a name="1"></a>') == -1:
            self.fail("Oops, when split by collection is off, "
                      "Atlantis collection should be found.")
        if body.find('<a name="15"></a>') > -1:
            self.fail("Oops, when split by collection is off, "
                      "Multimedia & Arts should not be found.")
        try:
            browser.find_link(url='#15')
            self.fail("Oops, when split by collection is off, "
                      "a link to Multimedia & Arts should not be found.")
        except LinkNotFoundError:
            pass

    def test_results_overview_split_on(self):
        """websearch - results overview box when split by collection is on"""
        browser = Browser()
        browser.open(CFG_SITE_URL + '/search?p=of&sc=1')
        body = browser.response().read()
        if body.find("Results overview") == -1:
            self.fail("Oops, when split by collection is on, "
                      "results overview should be present.")
        if body.find('<a name="1"></a>') > -1:
            self.fail("Oops, when split by collection is on, "
                      "Atlantis collection should not be found.")
        if body.find('<a name="15"></a>') == -1:
            self.fail("Oops, when split by collection is on, "
                      "Multimedia & Arts should be found.")
        try:
            browser.find_link(url='#15')
        except LinkNotFoundError:
            self.fail("Oops, when split by collection is on, "
                      "a link to Multimedia & Arts should be found.")

class WebSearchSortResultsTest(unittest.TestCase):
    """Test of the search results page's sorting capability."""

    def test_sort_results_default(self):
        """websearch - search results sorting, default method"""
        self.assertEqual([],
                         test_web_page_content(CFG_SITE_URL + '/search?p=of&f=title&rg=1',
                                               expected_text="[TESLA-FEL-99-07]"))

    def test_sort_results_ascending(self):
        """websearch - search results sorting, ascending field"""
        self.assertEqual([],
                         test_web_page_content(CFG_SITE_URL + '/search?p=of&f=title&rg=1&sf=reportnumber&so=a',
                                               expected_text="ISOLTRAP"))

    def test_sort_results_descending(self):
        """websearch - search results sorting, descending field"""
        self.assertEqual([],
                         test_web_page_content(CFG_SITE_URL + '/search?p=of&f=title&rg=1&sf=reportnumber&so=d',
                                               expected_text=" [TESLA-FEL-99-07]"))

    def test_sort_results_sort_pattern(self):
        """websearch - search results sorting, preferential sort pattern"""
        self.assertEqual([],
                         test_web_page_content(CFG_SITE_URL + '/search?p=of&f=title&rg=1&sf=reportnumber&so=d&sp=cern',
                                               expected_text="[CERN-TH-2002-069]"))

class WebSearchSearchResultsXML(unittest.TestCase):
    """Test search results in various output"""

    def test_search_results_xm_output_split_on(self):
        """ websearch - check document element of search results in xm output (split by collection on)"""
        browser = Browser()
        browser.open(CFG_SITE_URL + '/search?sc=1&of=xm')
        body = browser.response().read()

        num_doc_element = body.count("<collection "
                                     "xmlns=\"http://www.loc.gov/MARC21/slim\">")
        if num_doc_element == 0:
            self.fail("Oops, no document element <collection> "
                      "found in search results.")
        elif num_doc_element > 1:
            self.fail("Oops, multiple document elements <collection> "
                      "found in search results.")

        num_doc_element = body.count("</collection>")
        if num_doc_element == 0:
            self.fail("Oops, no document element </collection> "
                      "found in search results.")
        elif num_doc_element > 1:
            self.fail("Oops, multiple document elements </collection> "
                      "found in search results.")

    def test_search_results_xm_output_split_off(self):
        """ websearch - check document element of search results in xm output (split by collection off)"""
        browser = Browser()
        browser.open(CFG_SITE_URL + '/search?sc=0&of=xm')
        body = browser.response().read()

        num_doc_element = body.count("<collection "
                                     "xmlns=\"http://www.loc.gov/MARC21/slim\">")
        if num_doc_element == 0:
            self.fail("Oops, no document element <collection> "
                      "found in search results.")
        elif num_doc_element > 1:
            self.fail("Oops, multiple document elements <collection> "
                      "found in search results.")

        num_doc_element = body.count("</collection>")
        if num_doc_element == 0:
            self.fail("Oops, no document element </collection> "
                      "found in search results.")
        elif num_doc_element > 1:
            self.fail("Oops, multiple document elements </collection> "
                      "found in search results.")

    def test_search_results_xd_output_split_on(self):
        """ websearch - check document element of search results in xd output (split by collection on)"""
        browser = Browser()
        browser.open(CFG_SITE_URL + '/search?sc=1&of=xd')
        body = browser.response().read()

        num_doc_element = body.count("<collection>")
        if num_doc_element == 0:
            self.fail("Oops, no document element <collection> "
                      "found in search results.")
        elif num_doc_element > 1:
            self.fail("Oops, multiple document elements <collection> "
                      "found in search results.")

        num_doc_element = body.count("</collection>")
        if num_doc_element == 0:
            self.fail("Oops, no document element </collection> "
                      "found in search results.")
        elif num_doc_element > 1:
            self.fail("Oops, multiple document elements </collection> "
                      "found in search results.")

    def test_search_results_xd_output_split_off(self):
        """ websearch - check document element of search results in xd output (split by collection off)"""
        browser = Browser()
        browser.open(CFG_SITE_URL + '/search?sc=0&of=xd')
        body = browser.response().read()

        num_doc_element = body.count("<collection>")
        if num_doc_element == 0:
            self.fail("Oops, no document element <collection> "
                      "found in search results.")
        elif num_doc_element > 1:
            self.fail("Oops, multiple document elements <collection> "
                      "found in search results.")

        num_doc_element = body.count("</collection>")
        if num_doc_element == 0:
            self.fail("Oops, no document element </collection> "
                      "found in search results.")
        elif num_doc_element > 1:
            self.fail("Oops, multiple document elements </collection> "
                      "found in search results.")
expected_text="[76]")) def test_unicode_partial_phrase_query(self): """websearch - Unicode partial phrase query""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=title%3A%27%CE%B7%27', expected_text="[76]")) def test_unicode_regexp_query(self): """websearch - Unicode regexp query""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=title%3A%2F%CE%B7%2F', expected_text="[76]")) class WebSearchMARCQueryTest(unittest.TestCase): """Test of the search results for queries containing physical MARC tags.""" def test_single_marc_tag_exact_phrase_query(self): """websearch - single MARC tag, exact phrase query (100__a)""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=100__a%3A%22Ellis%2C+J%22', expected_text="[9, 14, 18]")) def test_single_marc_tag_partial_phrase_query(self): """websearch - single MARC tag, partial phrase query (245__b)""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=245__b%3A%27and%27', expected_text="[28]")) def test_many_marc_tags_partial_phrase_query(self): """websearch - many MARC tags, partial phrase query (245)""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=245%3A%27and%27', expected_text="[1, 8, 9, 14, 15, 20, 22, 24, 28, 33, 47, 48, 49, 51, 53, 64, 69, 71, 79, 82, 83, 85, 91, 96]")) def test_single_marc_tag_regexp_query(self): """websearch - single MARC tag, regexp query""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=245%3A%2Fand%2F', expected_text="[1, 8, 9, 14, 15, 20, 22, 24, 28, 33, 47, 48, 49, 51, 53, 64, 69, 71, 79, 82, 83, 85, 91, 96]")) class WebSearchExtSysnoQueryTest(unittest.TestCase): """Test of queries using external system numbers.""" def test_existing_sysno_html_output(self): """websearch - external sysno query, existing sysno, HTML output""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?sysno=000289446CER', expected_text="The wall of the cave")) def test_existing_sysno_id_output(self): """websearch - external sysno query, existing sysno, ID output""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?sysno=000289446CER&of=id', expected_text="[95]")) def test_nonexisting_sysno_html_output(self): """websearch - external sysno query, non-existing sysno, HTML output""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?sysno=000289446CERRRR', expected_text="Requested record does not seem to exist.")) def test_nonexisting_sysno_id_output(self): """websearch - external sysno query, non-existing sysno, ID output""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?sysno=000289446CERRRR&of=id', expected_text="[]")) class WebSearchResultsRecordGroupingTest(unittest.TestCase): """Test search results page record grouping (rg).""" def test_search_results_rg_guest(self): """websearch - search results, records in groups of, guest""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?rg=17', expected_text="1 - 17")) def test_search_results_rg_nonguest(self): """websearch - search results, records in groups of, non-guest""" # This test used to fail due to saved user preference fetching # not overridden by URL rg argument. 
self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?rg=17', username='admin', expected_text="1 - 17")) class WebSearchSpecialTermsQueryTest(unittest.TestCase): """Test of the search results for queries containing special terms.""" def test_special_terms_u1(self): """websearch - query for special terms, U(1)""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=U%281%29', expected_text="[57, 79, 80, 88]")) def test_special_terms_u1_and_sl(self): """websearch - query for special terms, U(1) SL(2,Z)""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=U%281%29+SL%282%2CZ%29', expected_text="[88]")) def test_special_terms_u1_and_sl_or(self): """websearch - query for special terms, U(1) OR SL(2,Z)""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=U%281%29+OR+SL%282%2CZ%29', expected_text="[57, 79, 80, 88]")) def test_special_terms_u1_and_sl_or_parens(self): """websearch - query for special terms, (U(1) OR SL(2,Z))""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=%28U%281%29+OR+SL%282%2CZ%29%29', expected_text="[57, 79, 80, 88]")) class WebSearchJournalQueryTest(unittest.TestCase): """Test of the search results for journal pubinfo queries.""" def test_query_journal_title_only(self): """websearch - journal publication info query, title only""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&f=journal&p=Phys.+Lett.+B', expected_text="[77, 78, 85, 87]")) def test_query_journal_full_pubinfo(self): """websearch - journal publication info query, full reference""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&f=journal&p=Phys.+Lett.+B+531+%282002%29+301', expected_text="[78]")) class WebSearchStemmedIndexQueryTest(unittest.TestCase): """Test of the search results for queries using stemmed indexes.""" def test_query_stemmed_lowercase(self): """websearch - stemmed index query, lowercase""" # note that dasse/Dasse is stemmed into dass/Dass, as expected self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=dasse', expected_text="[25, 26]")) def test_query_stemmed_uppercase(self): """websearch - stemmed index query, uppercase""" # ... but note also that DASSE is stemmed into DASSE(!); so the test would fail if the search engine did not lowercase the query term. (Something that is not necessary for non-stemmed indexes.)
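# (Illustrative sketch, assuming Porter-style stemmer behaviour: stem('dasse') == 'dass' matches the stemmed index entry, while stem('DASSE') == 'DASSE' would not; hence the engine has to lowercase the query term before stemming it.)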
self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=DASSE', expected_text="[25, 26]")) class WebSearchSummarizerTest(unittest.TestCase): """Test of the search results summarizer functions.""" def test_most_popular_field_values_singletag(self): """websearch - most popular field values, simple tag""" from invenio.search_engine import get_most_popular_field_values self.assertEqual((('PREPRINT', 37), ('ARTICLE', 28), ('BOOK', 14), ('THESIS', 8), ('PICTURE', 7), ('POETRY', 2), ('REPORT', 2), ('ATLANTISTIMESNEWS', 1)), get_most_popular_field_values(range(0,100), '980__a')) def test_most_popular_field_values_singletag_multiexclusion(self): """websearch - most popular field values, simple tag, multiple exclusions""" from invenio.search_engine import get_most_popular_field_values self.assertEqual((('PREPRINT', 37), ('ARTICLE', 28), ('BOOK', 14), ('REPORT', 2), ('ATLANTISTIMESNEWS', 1)), get_most_popular_field_values(range(0,100), '980__a', ('THESIS', 'PICTURE', 'POETRY'))) def test_most_popular_field_values_multitag(self): """websearch - most popular field values, multiple tags""" from invenio.search_engine import get_most_popular_field_values self.assertEqual((('Ellis, J', 3), ('Enqvist, K', 1), ('Ibanez, L E', 1), ('Nanopoulos, D V', 1), ('Ross, G G', 1)), get_most_popular_field_values((9, 14, 18), ('100__a', '700__a'))) def test_most_popular_field_values_multitag_singleexclusion(self): """websearch - most popular field values, multiple tags, single exclusion""" from invenio.search_engine import get_most_popular_field_values self.assertEqual((('Enqvist, K', 1), ('Ibanez, L E', 1), ('Nanopoulos, D V', 1), ('Ross, G G', 1)), get_most_popular_field_values((9, 14, 18), ('100__a', '700__a'), ('Ellis, J'))) def test_most_popular_field_values_multitag_countrepetitive(self): """websearch - most popular field values, multiple tags, counting repetitive occurrences""" from invenio.search_engine import get_most_popular_field_values self.assertEqual((('THESIS', 2), ('REPORT', 1)), get_most_popular_field_values((41,), ('690C_a', '980__a'), count_repetitive_values=True)) self.assertEqual((('REPORT', 1), ('THESIS', 1)), get_most_popular_field_values((41,), ('690C_a', '980__a'), count_repetitive_values=False)) def test_ellis_citation_summary(self): """websearch - query ellis, citation summary output format""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis&of=hcs', expected_text="Less known papers (1-9)", expected_link_target=CFG_SITE_URL+"/search?p=ellis%20AND%20cited%3A1-%3E9&rm=citation", expected_link_label='1')) def test_ellis_not_quark_citation_summary_advanced(self): """websearch - ellis and not quark, citation summary format advanced""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?ln=en&as=1&m1=a&p1=ellis&f1=author&op1=n&m2=a&p2=quark&f2=&op2=a&m3=a&p3=&f3=&action_search=Search&sf=&so=a&rm=&rg=10&sc=1&of=hcs', expected_text="Less known papers (1-9)", expected_link_target=CFG_SITE_URL+'/search?p=author%3Aellis%20and%20not%20quark%20AND%20cited%3A1-%3E9&rm=citation', expected_link_label='1')) def test_ellis_not_quark_citation_summary_regular(self): """websearch - ellis and not quark, citation summary format regular""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?ln=en&p=author%3Aellis+and+not+quark&f=&action_search=Search&sf=&so=d&rm=&rg=10&sc=0&of=hcs', expected_text="Less known papers (1-9)", expected_link_target=CFG_SITE_URL+'/search?p=author%3Aellis%20and%20not%20quark%20AND%20cited%3A1-%3E9&rm=citation',
expected_link_label='1')) class WebSearchRecordCollectionGuessTest(unittest.TestCase): """Primary collection guessing tests.""" def test_guess_primary_collection_of_a_record(self): """websearch - guess_primary_collection_of_a_record""" self.assertEqual(guess_primary_collection_of_a_record(96), 'Articles') def test_guess_collection_of_a_record(self): """websearch - guess_collection_of_a_record""" self.assertEqual(guess_collection_of_a_record(96), 'Articles') self.assertEqual(guess_collection_of_a_record(96, '%s/collection/Theoretical Physics (TH)?ln=en' % CFG_SITE_URL), 'Articles') self.assertEqual(guess_collection_of_a_record(12, '%s/collection/Theoretical Physics (TH)?ln=en' % CFG_SITE_URL), 'Theoretical Physics (TH)') self.assertEqual(guess_collection_of_a_record(12, '%s/collection/Theoretical%%20Physics%%20%%28TH%%29?ln=en' % CFG_SITE_URL), 'Theoretical Physics (TH)') class WebSearchGetFieldValuesTest(unittest.TestCase): """Testing get_fieldvalues() function.""" def test_get_fieldvalues_001(self): """websearch - get_fieldvalues() for bibxxx-agnostic tags""" self.assertEqual(get_fieldvalues(10, '001___'), ['10']) def test_get_fieldvalues_980(self): """websearch - get_fieldvalues() for bibxxx-powered tags""" self.assertEqual(get_fieldvalues(18, '700__a'), ['Enqvist, K', 'Nanopoulos, D V']) self.assertEqual(get_fieldvalues(18, '909C1u'), ['CERN']) def test_get_fieldvalues_wildcard(self): """websearch - get_fieldvalues() for tag wildcards""" self.assertEqual(get_fieldvalues(18, '%'), []) self.assertEqual(get_fieldvalues(18, '7%'), []) self.assertEqual(get_fieldvalues(18, '700%'), ['Enqvist, K', 'Nanopoulos, D V']) self.assertEqual(get_fieldvalues(18, '909C0%'), ['1985', '13','TH']) def test_get_fieldvalues_recIDs(self): """websearch - get_fieldvalues() for list of recIDs""" self.assertEqual(get_fieldvalues([], '001___'), []) self.assertEqual(get_fieldvalues([], '700__a'), []) self.assertEqual(get_fieldvalues([10, 13], '001___'), ['10', '13']) self.assertEqual(get_fieldvalues([18, 13], '700__a'), ['Dawson, S', 'Ellis, R K', 'Enqvist, K', 'Nanopoulos, D V']) def test_get_fieldvalues_repetitive(self): """websearch - get_fieldvalues() for repetitive values""" self.assertEqual(get_fieldvalues([17, 18], '909C1u'), ['CERN', 'CERN']) self.assertEqual(get_fieldvalues([17, 18], '909C1u', repetitive_values=True), ['CERN', 'CERN']) self.assertEqual(get_fieldvalues([17, 18], '909C1u', repetitive_values=False), ['CERN']) class WebSearchAddToBasketTest(unittest.TestCase): """Test of the add-to-basket presence depending on user rights.""" def test_add_to_basket_guest(self): """websearch - add-to-basket facility allowed for guests""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=recid%3A10', expected_text='Add to basket')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=recid%3A10', expected_text='')) def test_add_to_basket_jekyll(self): """websearch - add-to-basket facility allowed for Dr. Jekyll""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=recid%3A10', expected_text='Add to basket', username='jekyll', password='j123ekyll')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=recid%3A10', expected_text='', username='jekyll', password='j123ekyll')) def test_add_to_basket_hyde(self): """websearch - add-to-basket facility denied to Mr. 
Hyde""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=recid%3A10', unexpected_text='Add to basket', username='hyde', password='h123yde')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=recid%3A10', unexpected_text='', username='hyde', password='h123yde')) class WebSearchAlertTeaserTest(unittest.TestCase): """Test of the alert teaser presence depending on user rights.""" def test_alert_teaser_guest(self): """websearch - alert teaser allowed for guests""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis', expected_link_label='email alert')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis', expected_text='RSS feed')) def test_alert_teaser_jekyll(self): """websearch - alert teaser allowed for Dr. Jekyll""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis', expected_text='email alert', username='jekyll', password='j123ekyll')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis', expected_text='RSS feed', username='jekyll', password='j123ekyll')) def test_alert_teaser_hyde(self): """websearch - alert teaser allowed for Mr. Hyde""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis', expected_text='email alert', username='hyde', password='h123yde')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=ellis', expected_text='RSS feed', username='hyde', password='h123yde')) class WebSearchSpanQueryTest(unittest.TestCase): """Test of span queries.""" def test_span_in_word_index(self): """websearch - span query in a word index""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=year%3A1992-%3E1996&of=id&ap=0', expected_text='[17, 66, 69, 71]')) def test_span_in_phrase_index(self): """websearch - span query in a phrase index""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=year%3A%221992%22-%3E%221996%22&of=id&ap=0', expected_text='[17, 66, 69, 71]')) def test_span_in_bibxxx(self): """websearch - span query in MARC tables""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=909C0y%3A%221992%22-%3E%221996%22&of=id&ap=0', expected_text='[17, 66, 69, 71]')) def test_span_with_spaces(self): """websearch - no span query when a space is around""" # useful for reaction search self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=title%3A%27mu%20--%3E%20e%27&of=id&ap=0', expected_text='[67]')) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=245%3A%27mu%20--%3E%20e%27&of=id&ap=0', expected_text='[67]')) def test_span_in_author(self): """websearch - span query in special author index""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=author%3A%22Ellis,%20K%22-%3E%22Ellis,%20RZ%22&of=id&ap=0', expected_text='[8, 11, 13, 17, 47]')) class WebSearchReferstoCitedbyTest(unittest.TestCase): """Test of refersto/citedby search operators.""" def test_refersto_recid(self): 'websearch - refersto:recid:84' self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=refersto%3Arecid%3A84&of=id&ap=0', expected_text='[85, 88, 91]')) def test_refersto_repno(self): 'websearch - refersto:reportnumber:hep-th/0205061' self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=refersto%3Areportnumber%3Ahep-th/0205061&of=id&ap=0', expected_text='[91]')) def test_refersto_author_word(self): 'websearch - refersto:author:klebanov' self.assertEqual([], test_web_page_content(CFG_SITE_URL + 
'/search?p=refersto%3Aauthor%3Aklebanov&of=id&ap=0', expected_text='[85, 86, 88, 91]')) def test_refersto_author_phrase(self): 'websearch - refersto:author:"Klebanov, I"' self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=refersto%3Aauthor%3A%22Klebanov,%20I%22&of=id&ap=0', expected_text='[85, 86, 88, 91]')) def test_citedby_recid(self): 'websearch - citedby:recid:92' self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=citedby%3Arecid%3A92&of=id&ap=0', expected_text='[74, 91]')) def test_citedby_repno(self): 'websearch - citedby:reportnumber:hep-th/0205061' self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=citedby%3Areportnumber%3Ahep-th/0205061&of=id&ap=0', expected_text='[78]')) def test_citedby_author_word(self): 'websearch - citedby:author:klebanov' self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=citedby%3Aauthor%3Aklebanov&of=id&ap=0', expected_text='[95]')) def test_citedby_author_phrase(self): 'websearch - citedby:author:"Klebanov, I"' self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=citedby%3Aauthor%3A%22Klebanov,%20I%22&of=id&ap=0', expected_text='[95]')) def test_refersto_bad_query(self): 'websearch - refersto:title:' self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=refersto%3Atitle%3A', expected_text='There are no records referring to title:.')) def test_citedby_bad_query(self): 'websearch - citedby:title:' self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=citedby%3Atitle%3A', expected_text='There are no records cited by title:.')) class WebSearchSPIRESSyntaxTest(unittest.TestCase): """Test of SPIRES syntax issues""" def test_and_not_parens(self): 'websearch - find a ellis, j and not a enqvist' self.assertEqual([], test_web_page_content(CFG_SITE_URL +'/search?p=find+a+ellis%2C+j+and+not+a+enqvist&of=id&ap=0', expected_text='[9, 12, 14, 47]')) def test_dadd_search(self): 'websearch - find da > today - 3650' # XXX: assumes we've reinstalled our site in the last 10 years # should return every document in the system self.assertEqual([], test_web_page_content(CFG_SITE_URL +'/search?ln=en&p=find+da+%3E+today+-+3650&f=&of=id', expected_text='[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104]')) class WebSearchDateQueryTest(unittest.TestCase): """Test various date queries.""" def setUp(self): """Establish variables we plan to re-use""" - from invenio.intbitset import intbitset as HitSet - self.empty = HitSet() + from invenio.intbitset import intbitset + self.empty = intbitset() def test_search_unit_hits_for_datecreated_previous_millenia(self): """websearch - search_unit with datecreated returns >0 hits for docs in the last 1000 years""" self.assertNotEqual(self.empty, search_unit('1000-01-01->9999-12-31', 'datecreated')) def test_search_unit_hits_for_datemodified_previous_millenia(self): """websearch - search_unit with datemodified returns >0 hits for docs in the last 1000 years""" self.assertNotEqual(self.empty, search_unit('1000-01-01->9999-12-31', 'datemodified')) def test_search_unit_in_bibrec_for_datecreated_previous_millenia(self): """websearch - search_unit_in_bibrec with creationdate gets >0 hits for past 1000 years""" 
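# Sketch of the helper being exercised (record counts depend on the demo site): search_unit_in_bibrec(d1, d2, type) consults the bibrec table directly and returns an intbitset of recIDs whose creation (or modification) date lies within [d1, d2], e.g.: # hits = search_unit_in_bibrec("1000-01-01", "9999-12-31", 'creationdate') # self.assertNotEqual(self.empty, hits)  # non-empty on a populated site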
self.assertNotEqual(self.empty, search_unit_in_bibrec("1000-01-01", "9999-12-31", 'creationdate')) def test_search_unit_in_bibrec_for_datecreated_next_millenia(self): """websearch - search_unit_in_bibrec with creationdate gets 0 hits for after year 3000""" self.assertEqual(self.empty, search_unit_in_bibrec("3000-01-01", "9999-12-31", 'creationdate')) class WebSearchSynonymQueryTest(unittest.TestCase): """Test of queries using synonyms.""" def test_journal_phrvd(self): """websearch - search-time synonym search, journal title""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=PHRVD&f=journal&of=id', expected_text="[66, 72]")) def test_journal_phrvd_54_1996_4234(self): """websearch - search-time synonym search, journal article""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=PHRVD%2054%20%281996%29%204234&f=journal&of=id', expected_text="[66]")) def test_journal_beta_decay_title(self): """websearch - index-time synonym search, beta decay in title""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=beta+decay&f=title&of=id', expected_text="[59]")) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=%CE%B2+decay&f=title&of=id', expected_text="[59]")) def test_journal_beta_decay_global(self): """websearch - index-time synonym search, beta decay in any field""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=beta+decay&of=id', expected_text="[52, 59]")) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=%CE%B2+decay&of=id', expected_text="[52, 59]")) def test_journal_beta_title(self): """websearch - index-time synonym search, beta in title""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=beta&f=title&of=id', expected_text="[59]")) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=%CE%B2&f=title&of=id', expected_text="[59]")) def test_journal_beta_global(self): """websearch - index-time synonym search, beta in any field""" self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=beta&of=id', expected_text="[52, 59]")) self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?p=%CE%B2&of=id', expected_text="[52, 59]")) class WebSearchWashCollectionsTest(unittest.TestCase): """Test if the collection argument is washed correctly""" def test_wash_coll_when_coll_restricted(self): """websearch - washing of restricted daughter collections""" self.assertEqual( sorted(wash_colls(cc='', c=['Books & Reports', 'Theses'])[1]), ['Books & Reports', 'Theses']) self.assertEqual( sorted(wash_colls(cc='', c=['Books & Reports', 'Theses'])[2]), ['Books & Reports', 'Theses']) TEST_SUITE = make_test_suite(WebSearchWebPagesAvailabilityTest, WebSearchTestSearch, WebSearchTestBrowse, WebSearchTestOpenURL, WebSearchTestCollections, WebSearchTestRecord, WebSearchTestLegacyURLs, WebSearchNearestTermsTest, WebSearchBooleanQueryTest, WebSearchAuthorQueryTest, WebSearchSearchEnginePythonAPITest, WebSearchSearchEngineWebAPITest, WebSearchRestrictedCollectionTest, WebSearchRestrictedPicturesTest, WebSearchRSSFeedServiceTest, WebSearchXSSVulnerabilityTest, WebSearchResultsOverview, WebSearchSortResultsTest, WebSearchSearchResultsXML, WebSearchUnicodeQueryTest, WebSearchMARCQueryTest, WebSearchExtSysnoQueryTest, WebSearchResultsRecordGroupingTest, WebSearchSpecialTermsQueryTest, WebSearchJournalQueryTest, WebSearchStemmedIndexQueryTest, WebSearchSummarizerTest, WebSearchRecordCollectionGuessTest, WebSearchGetFieldValuesTest, WebSearchAddToBasketTest, 
WebSearchAlertTeaserTest, WebSearchSpanQueryTest, WebSearchReferstoCitedbyTest, WebSearchSPIRESSyntaxTest, WebSearchDateQueryTest, WebSearchTestWildcardLimit, WebSearchSynonymQueryTest, WebSearchWashCollectionsTest) if __name__ == "__main__": run_test_suite(TEST_SUITE, warn_user=True) diff --git a/modules/websearch/lib/websearch_webcoll.py b/modules/websearch/lib/websearch_webcoll.py index dd0d22910..5b42c16c3 100644 --- a/modules/websearch/lib/websearch_webcoll.py +++ b/modules/websearch/lib/websearch_webcoll.py @@ -1,1076 +1,1076 @@ ## This file is part of Invenio. ## Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """Create Invenio collection cache.""" __revision__ = "$Id$" import calendar import copy import sys import cgi import re import os import string import time from invenio.config import \ CFG_CERN_SITE, \ CFG_WEBSEARCH_INSTANT_BROWSE, \ CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS, \ CFG_WEBSEARCH_I18N_LATEST_ADDITIONS, \ CFG_CACHEDIR, \ CFG_SITE_LANG, \ CFG_SITE_NAME, \ CFG_SITE_LANGS, \ CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES, \ CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE from invenio.messages import gettext_set_language, language_list_long -from invenio.search_engine import HitSet, search_pattern_parenthesised, get_creation_date, get_field_i18nname, collection_restricted_p, sort_records +from invenio.search_engine import search_pattern_parenthesised, get_creation_date, get_field_i18nname, collection_restricted_p, sort_records from invenio.dbquery import run_sql, Error, get_table_update_time from invenio.bibrank_record_sorter import get_bibrank_methods from invenio.dateutils import convert_datestruct_to_dategui from invenio.bibformat import format_record from invenio.shellutils import mymkdir from invenio.intbitset import intbitset from invenio.websearch_external_collections import \ external_collection_load_states, \ dico_collection_external_searches, \ external_collection_sort_engine_by_name from invenio.bibtask import task_init, task_get_option, task_set_option, \ write_message, task_has_option, task_update_progress, \ task_sleep_now_if_required import invenio.template websearch_templates = invenio.template.load('websearch') from invenio.websearch_external_collections_searcher import external_collections_dictionary from invenio.websearch_external_collections_config import CFG_EXTERNAL_COLLECTION_TIMEOUT from invenio.websearch_external_collections_config import CFG_HOSTED_COLLECTION_TIMEOUT_NBRECS ## global vars COLLECTION_HOUSE = {} # will hold collections we treat in this run of the program; a dict of {collname2, collobject1}, ... 
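# Example of the intended memoization (a sketch; get_collection() is defined below, and 'Articles' is just a demo collection name): COLLECTION_HOUSE maps a collection name to its Collection object, so repeated lookups share one instance: # c1 = get_collection('Articles') # c2 = get_collection('Articles') # assert c1 is c2  # second call served from COLLECTION_HOUSE, no re-query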
# CFG_CACHE_LAST_UPDATED_TIMESTAMP_TOLERANCE -- cache timestamp # tolerance (in seconds), to account for the fact that an admin might # accidentally happen to edit the collection definitions at exactly # the same second when some webcoll process was about to be started. # In order to be safe, let's put an exaggerated timestamp tolerance # value such as 20 seconds: CFG_CACHE_LAST_UPDATED_TIMESTAMP_TOLERANCE = 20 # CFG_CACHE_LAST_UPDATED_TIMESTAMP_FILE -- location of the cache # timestamp file: CFG_CACHE_LAST_UPDATED_TIMESTAMP_FILE = "%s/collections/last_updated" % CFG_CACHEDIR # CFG_CACHE_LAST_FAST_UPDATED_TIMESTAMP_FILE -- location of the cache # timestamp file used when running webcoll in fast mode. CFG_CACHE_LAST_FAST_UPDATED_TIMESTAMP_FILE = "%s/collections/last_fast_updated" % CFG_CACHEDIR def get_collection(colname): """Return collection object from the collection house for given colname. If it does not exist, then create it.""" if not COLLECTION_HOUSE.has_key(colname): colobject = Collection(colname) COLLECTION_HOUSE[colname] = colobject return COLLECTION_HOUSE[colname] ## auxiliary functions: def is_selected(var, fld): "Checks if the two are equal, and if yes, returns ' selected'. Useful for select boxes." if var == fld: return ' selected="selected"' else: return "" def get_field(recID, tag): "Gets list of field 'tag' for the record with 'recID' system number." out = [] digit = tag[0:2] bx = "bib%sx" % digit bibx = "bibrec_bib%sx" % digit query = "SELECT bx.value FROM %s AS bx, %s AS bibx WHERE bibx.id_bibrec='%s' AND bx.id=bibx.id_bibxxx AND bx.tag='%s'" \ % (bx, bibx, recID, tag) res = run_sql(query) for row in res: out.append(row[0]) return out def check_nbrecs_for_all_external_collections(): """Check if any of the external collections have changed their total number of records, aka nbrecs. Return True if any of the total numbers of records have changed and False if they're all the same.""" res = run_sql("SELECT name FROM collection WHERE dbquery LIKE 'hostedcollection:%';") for row in res: coll_name = row[0] if (get_collection(coll_name)).check_nbrecs_for_external_collection(): write_message("External collection %s found updated." % coll_name, verbose=6) return True write_message("All external collections are up to date.", verbose=6) return False class Collection: "Holds the information on collections (id,name,dbquery)." def __init__(self, name=""): "Creates collection instance by querying the DB configuration database about 'name'." self.calculate_reclist_run_already = 0 # to speed things up without much refactoring self.update_reclist_run_already = 0 # to speed things up without much refactoring - self.reclist_with_nonpublic_subcolls = HitSet() + self.reclist_with_nonpublic_subcolls = intbitset() # used to store the temporary result of the calculation of nbrecs of an external collection self.nbrecs_tmp = None if not name: self.name = CFG_SITE_NAME # by default we are working on the home page self.id = 1 self.dbquery = None self.nbrecs = None - self.reclist = HitSet() + self.reclist = intbitset() else: self.name = name try: res = run_sql("""SELECT id,name,dbquery,nbrecs,reclist FROM collection WHERE name=%s""", (name,)) if res: self.id = res[0][0] self.name = res[0][1] self.dbquery = res[0][2] self.nbrecs = res[0][3] try: - self.reclist = HitSet(res[0][4]) + self.reclist = intbitset(res[0][4]) except: - self.reclist = HitSet() + self.reclist = intbitset() else: # collection does not exist!
self.id = None self.dbquery = None self.nbrecs = None - self.reclist = HitSet() + self.reclist = intbitset() except Error, e: print "Error %d: %s" % (e.args[0], e.args[1]) sys.exit(1) def get_example_search_queries(self): """Returns list of sample search queries for this collection. """ res = run_sql("""SELECT example.body FROM example LEFT JOIN collection_example on example.id=collection_example.id_example WHERE collection_example.id_collection=%s ORDER BY collection_example.score""", (self.id,)) return [query[0] for query in res] def get_name(self, ln=CFG_SITE_LANG, name_type="ln", prolog="", epilog="", prolog_suffix=" ", epilog_suffix=""): """Return nicely formatted collection name for language LN. The NAME_TYPE may be 'ln' (=long name), 'sn' (=short name), etc.""" out = prolog i18name = "" res = run_sql("SELECT value FROM collectionname WHERE id_collection=%s AND ln=%s AND type=%s", (self.id, ln, name_type)) try: i18name += res[0][0] except IndexError: pass if i18name: out += i18name else: out += self.name out += epilog return out def get_ancestors(self): "Returns list of ancestors of the current collection." ancestors = [] ancestors_ids = intbitset() id_son = self.id while 1: query = "SELECT cc.id_dad,c.name FROM collection_collection AS cc, collection AS c "\ "WHERE cc.id_son=%d AND c.id=cc.id_dad" % int(id_son) res = run_sql(query, None, 1) if res: col_ancestor = get_collection(res[0][1]) # looking for loops if self.id in ancestors_ids: write_message("Loop found in collection %s" % self.name, stream=sys.stderr) raise OverflowError("Loop found in collection %s" % self.name) else: ancestors.append(col_ancestor) ancestors_ids.add(col_ancestor.id) id_son = res[0][0] else: break ancestors.reverse() return ancestors def restricted_p(self): """Predicate to test if the collection is restricted or not. Return the content of the `restricted' column of the collection table (typically an Apache group). Otherwise return None if the collection is public.""" if collection_restricted_p(self.name): return 1 return None def get_sons(self, type='r'): "Returns list of direct sons of type 'type' for the current collection." sons = [] id_dad = self.id query = "SELECT cc.id_son,c.name FROM collection_collection AS cc, collection AS c "\ "WHERE cc.id_dad=%d AND cc.type='%s' AND c.id=cc.id_son ORDER BY score DESC, c.name ASC" % (int(id_dad), type) res = run_sql(query) for row in res: sons.append(get_collection(row[1])) return sons def get_descendants(self, type='r'): "Returns list of all descendants of type 'type' for the current collection." descendants = [] descendant_ids = intbitset() id_dad = self.id query = "SELECT cc.id_son,c.name FROM collection_collection AS cc, collection AS c "\ "WHERE cc.id_dad=%d AND cc.type='%s' AND c.id=cc.id_son ORDER BY score DESC" % (int(id_dad), type) res = run_sql(query) for row in res: col_desc = get_collection(row[1]) # looking for loops if self.id in descendant_ids: write_message("Loop found in collection %s" % self.name, stream=sys.stderr) raise OverflowError("Loop found in collection %s" % self.name) else: descendants.append(col_desc) descendant_ids.add(col_desc.id) tmp_descendants = col_desc.get_descendants() for descendant in tmp_descendants: descendant_ids.add(descendant.id) descendants += tmp_descendants return descendants def write_cache_file(self, filename='', filebody=''): "Write a file inside collection cache."
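# For instance (a sketch with names taken from update_webpage_cache() below): a call such as write_cache_file('body-as=0-ln=en', body_html) ends up writing CFG_CACHEDIR/collections/<self.id>/body-as=0-ln=en.html, with the directory created on demand via mymkdir().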
# open file: dirname = "%s/collections/%d" % (CFG_CACHEDIR, self.id) mymkdir(dirname) fullfilename = dirname + "/%s.html" % filename try: os.umask(022) f = open(fullfilename, "w") except IOError, v: try: (code, message) = v except: code = 0 message = v print "I/O Error: " + str(message) + " (" + str(code) + ")" sys.exit(1) # print user info: write_message("... creating %s" % fullfilename, verbose=6) sys.stdout.flush() # print page body: f.write(filebody) # close file: f.close() def update_webpage_cache(self): """Create collection page header, navtrail, body (including left and right stripes) and footer, and call write_cache_file() afterwards to update the collection webpage cache.""" ## precalculate latest additions for non-aggregate ## collections (the info is independent of ln and aas) if self.dbquery and not CFG_WEBSEARCH_I18N_LATEST_ADDITIONS: self.create_latest_additions_info() ## do this for each language: for lang, lang_fullname in language_list_long(): # but only if no specific language was requested, or this is one of the requested languages: if lang in task_get_option("language", [lang]): if self.dbquery and CFG_WEBSEARCH_I18N_LATEST_ADDITIONS: self.create_latest_additions_info(ln=lang) # load the right message language _ = gettext_set_language(lang) ## first, update navtrail: for aas in CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES: self.write_cache_file("navtrail-as=%s-ln=%s" % (aas, lang), self.create_navtrail_links(aas, lang)) ## second, update page body: for aas in CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES: # do light, simple and advanced search pages: body = websearch_templates.tmpl_webcoll_body( ln=lang, collection=self.name, te_portalbox = self.create_portalbox(lang, 'te'), searchfor = self.create_searchfor(aas, lang), np_portalbox = self.create_portalbox(lang, 'np'), narrowsearch = self.create_narrowsearch(aas, lang, 'r'), focuson = self.create_narrowsearch(aas, lang, "v") + \ self.create_external_collections_box(lang), instantbrowse = self.create_instant_browse(aas=aas, ln=lang), ne_portalbox = self.create_portalbox(lang, 'ne') ) self.write_cache_file("body-as=%s-ln=%s" % (aas, lang), body) ## third, write portalboxes: self.write_cache_file("portalbox-tp-ln=%s" % lang, self.create_portalbox(lang, "tp")) self.write_cache_file("portalbox-te-ln=%s" % lang, self.create_portalbox(lang, "te")) self.write_cache_file("portalbox-lt-ln=%s" % lang, self.create_portalbox(lang, "lt")) self.write_cache_file("portalbox-rt-ln=%s" % lang, self.create_portalbox(lang, "rt")) ## fourth, write 'last updated' information: self.write_cache_file("last-updated-ln=%s" % lang, convert_datestruct_to_dategui(time.localtime(), ln=lang)) return def create_navtrail_links(self, aas=CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, ln=CFG_SITE_LANG): """Creates navigation trail links, i.e. links to collection ancestors (except Home collection). If aas==1, then links to Advanced Search interfaces; otherwise Simple Search. """ dads = [] for dad in self.get_ancestors(): if dad.name != CFG_SITE_NAME: # exclude Home collection dads.append((dad.name, dad.get_name(ln))) return websearch_templates.tmpl_navtrail_links( aas=aas, ln=ln, dads=dads) def create_portalbox(self, lang=CFG_SITE_LANG, position="rt"): """Creates portalboxes of language CFG_SITE_LANG of the position POSITION by consulting DB configuration database.
The position may be: 'lt'='left top', 'rt'='right top', etc.""" out = "" query = "SELECT p.title,p.body FROM portalbox AS p, collection_portalbox AS cp "\ " WHERE cp.id_collection=%d AND p.id=cp.id_portalbox AND cp.ln='%s' AND cp.position='%s' "\ " ORDER BY cp.score DESC" % (self.id, lang, position) res = run_sql(query) for row in res: title, body = row[0], row[1] if title: out += websearch_templates.tmpl_portalbox(title = title, body = body) else: # no title specified, so print body ``as is'' only: out += body return out def create_narrowsearch(self, aas=CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, ln=CFG_SITE_LANG, type="r"): """Creates list of collection descendants of type 'type' under title 'title'. If aas==1, then links to Advanced Search interfaces; otherwise Simple Search. Suitable for 'Narrow search' and 'Focus on' boxes.""" # get list of sons and analyse it sons = self.get_sons(type) if not sons: return '' # get descendants descendants = self.get_descendants(type) grandsons = [] if CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS: # load grandsons for each son for son in sons: grandsons.append(son.get_sons()) # return "" return websearch_templates.tmpl_narrowsearch( aas = aas, ln = ln, type = type, father = self, has_grandchildren = len(descendants)>len(sons), sons = sons, display_grandsons = CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS, grandsons = grandsons ) def create_external_collections_box(self, ln=CFG_SITE_LANG): external_collection_load_states() if not dico_collection_external_searches.has_key(self.id): return "" engines_list = external_collection_sort_engine_by_name(dico_collection_external_searches[self.id]) return websearch_templates.tmpl_searchalso(ln, engines_list, self.id) def create_latest_additions_info(self, rg=CFG_WEBSEARCH_INSTANT_BROWSE, ln=CFG_SITE_LANG): """ Create info about latest additions that will be used for create_instant_browse() later. """ self.latest_additions_info = [] if self.nbrecs and self.reclist: # firstly, get last 'rg' records: recIDs = list(self.reclist) of = 'hb' # CERN hack begins: tweak latest additions for selected collections: if CFG_CERN_SITE: # alter recIDs list for some CERN collections: this_year = time.strftime("%Y", time.localtime()) if self.name in ['CERN Yellow Reports','Videos']: last_year = str(int(this_year) - 1) # detect recIDs only from this and past year: recIDs = list(self.reclist & \ search_pattern_parenthesised(p='year:%s or year:%s' % \ (this_year, last_year))) elif self.name in ['VideosXXX']: # detect recIDs only from this year: recIDs = list(self.reclist & \ search_pattern_parenthesised(p='year:%s' % this_year)) elif self.name == 'CMS Physics Analysis Summaries' and \ 1281585 in self.reclist: # REALLY, REALLY temporary hack recIDs = list(self.reclist) recIDs.remove(1281585) # apply special filters: if self.name in ['Videos']: # select only videos with movies: recIDs = list(intbitset(recIDs) & \ search_pattern_parenthesised(p='collection:"PUBLVIDEOMOVIE"')) of = 'hvp' # sort some CERN collections specially: if self.name in ['Videos', 'Video Clips', 'Video Movies', 'Video News', 'Video Rushes', 'Webcast', 'ATLAS Videos', 'Restricted Video Movies', 'Restricted Video Rushes', 'LHC First Beam Videos', 'CERN openlab Videos']: recIDs = sort_records(None, recIDs, '269__c') # CERN hack ends.
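# Worked example of the window below: with 25 matching records and rg=10, total=25 and to_display=10, so range(total-1, total-to_display-1, -1) yields indices 24, 23, ..., 15, i.e. the last ten recIDs newest-first (on the assumption that higher recIDs correspond to more recent additions).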
total = len(recIDs) to_display = min(rg, total) for idx in range(total-1, total-to_display-1, -1): recid = recIDs[idx] self.latest_additions_info.append({'id': recid, 'format': format_record(recid, of, ln=ln), 'date': get_creation_date(recid, fmt="%Y-%m-%d<br />%H:%i")}) return def create_instant_browse(self, rg=CFG_WEBSEARCH_INSTANT_BROWSE, aas=CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, ln=CFG_SITE_LANG): "Searches database and produces list of last 'rg' records." if self.restricted_p(): return websearch_templates.tmpl_box_restricted_content(ln = ln) if str(self.dbquery).startswith("hostedcollection:"): return websearch_templates.tmpl_box_hosted_collection(ln = ln) if rg == 0: # do not show latest additions box return "" # CERN hack: do not display latest additions for some CERN collections: if CFG_CERN_SITE and self.name in ['Periodicals', 'Electronic Journals', 'Press Office Photo Selection', 'Press Office Video Selection']: return "" try: self.latest_additions_info latest_additions_info_p = True except: latest_additions_info_p = False if latest_additions_info_p: passIDs = [] for idx in range(0, min(len(self.latest_additions_info), rg)): # CERN hack: display the records in a grid layout, so do not show the related links if CFG_CERN_SITE and self.name in ['Videos']: passIDs.append({'id': self.latest_additions_info[idx]['id'], 'body': self.latest_additions_info[idx]['format'], 'date': self.latest_additions_info[idx]['date']}) else: passIDs.append({'id': self.latest_additions_info[idx]['id'], 'body': self.latest_additions_info[idx]['format'] + \ websearch_templates.tmpl_record_links(recid=self.latest_additions_info[idx]['id'], rm='citation', ln=ln), 'date': self.latest_additions_info[idx]['date']}) if self.nbrecs > rg: url = websearch_templates.build_search_url( cc=self.name, jrec=rg+1, ln=ln, aas=aas) else: url = "" # CERN hack: display the records in a grid layout if CFG_CERN_SITE and self.name in ['Videos']: return websearch_templates.tmpl_instant_browse( aas=aas, ln=ln, recids=passIDs, more_link=url, grid_layout=True) return websearch_templates.tmpl_instant_browse( aas=aas, ln=ln, recids=passIDs, more_link=url) return websearch_templates.tmpl_box_no_records(ln=ln) def create_searchoptions(self): "Produces 'Search options' portal box."
box = "" query = """SELECT DISTINCT(cff.id_field),f.code,f.name FROM collection_field_fieldvalue AS cff, field AS f WHERE cff.id_collection=%d AND cff.id_fieldvalue IS NOT NULL AND cff.id_field=f.id ORDER BY cff.score DESC""" % self.id res = run_sql(query) if res: for row in res: field_id = row[0] field_code = row[1] field_name = row[2] query_bis = """SELECT fv.value,fv.name FROM fieldvalue AS fv, collection_field_fieldvalue AS cff WHERE cff.id_collection=%d AND cff.type='seo' AND cff.id_field=%d AND fv.id=cff.id_fieldvalue ORDER BY cff.score_fieldvalue DESC, cff.score DESC, fv.name ASC""" % (self.id, field_id) res_bis = run_sql(query_bis) if res_bis: values = [{'value' : '', 'text' : 'any' + ' ' + field_name}] # FIXME: internationalisation of "any" for row_bis in res_bis: values.append({'value' : cgi.escape(row_bis[0], 1), 'text' : row_bis[1]}) box += websearch_templates.tmpl_select( fieldname = field_code, values = values ) return box def create_sortoptions(self, ln=CFG_SITE_LANG): """Produces 'Sort options' portal box.""" # load the right message language _ = gettext_set_language(ln) box = "" query = """SELECT f.code,f.name FROM field AS f, collection_field_fieldvalue AS cff WHERE id_collection=%d AND cff.type='soo' AND cff.id_field=f.id ORDER BY cff.score DESC, f.name ASC""" % self.id values = [{'value' : '', 'text': "- %s -" % _("latest first")}] res = run_sql(query) if res: for row in res: values.append({'value' : row[0], 'text': row[1]}) else: for tmp in ('title', 'author', 'report number', 'year'): values.append({'value' : tmp.replace(' ', ''), 'text' : get_field_i18nname(tmp, ln)}) box = websearch_templates.tmpl_select( fieldname = 'sf', css_class = 'address', values = values ) box += websearch_templates.tmpl_select( fieldname = 'so', css_class = 'address', values = [ {'value' : 'a' , 'text' : _("asc.")}, {'value' : 'd' , 'text' : _("desc.")} ] ) return box def create_rankoptions(self, ln=CFG_SITE_LANG): "Produces 'Rank options' portal box." # load the right message language _ = gettext_set_language(ln) values = [{'value' : '', 'text': "- %s %s -" % (string.lower(_("OR")), _("rank by"))}] for (code, name) in get_bibrank_methods(self.id, ln): values.append({'value' : code, 'text': name}) box = websearch_templates.tmpl_select( fieldname = 'rm', css_class = 'address', values = values ) return box def create_displayoptions(self, ln=CFG_SITE_LANG): "Produces 'Display options' portal box." # load the right message language _ = gettext_set_language(ln) values = [] for i in ['10', '25', '50', '100', '250', '500']: values.append({'value' : i, 'text' : i + ' ' + _("results")}) box = websearch_templates.tmpl_select( fieldname = 'rg', css_class = 'address', values = values ) if self.get_sons(): box += websearch_templates.tmpl_select( fieldname = 'sc', css_class = 'address', values = [ {'value' : '1' , 'text' : _("split by collection")}, {'value' : '0' , 'text' : _("single list")} ] ) return box def create_formatoptions(self, ln=CFG_SITE_LANG): "Produces 'Output format options' portal box." 
# load the right message language _ = gettext_set_language(ln) box = "" values = [] query = """SELECT f.code,f.name FROM format AS f, collection_format AS cf WHERE cf.id_collection=%d AND cf.id_format=f.id AND f.visibility='1' ORDER BY cf.score DESC, f.name ASC""" % self.id res = run_sql(query) if res: for row in res: values.append({'value' : row[0], 'text': row[1]}) else: values.append({'value' : 'hb', 'text' : "HTML %s" % _("brief")}) box = websearch_templates.tmpl_select( fieldname = 'of', css_class = 'address', values = values ) return box def create_searchwithin_selection_box(self, fieldname='f', value='', ln='en'): """Produces 'search within' selection box for the current collection.""" # get values query = """SELECT f.code,f.name FROM field AS f, collection_field_fieldvalue AS cff WHERE cff.type='sew' AND cff.id_collection=%d AND cff.id_field=f.id ORDER BY cff.score DESC, f.name ASC""" % self.id res = run_sql(query) values = [{'value' : '', 'text' : get_field_i18nname("any field", ln)}] if res: for row in res: values.append({'value' : row[0], 'text' : get_field_i18nname(row[1], ln)}) else: if CFG_CERN_SITE: for tmp in ['title', 'author', 'abstract', 'report number', 'year']: values.append({'value' : tmp.replace(' ', ''), 'text' : get_field_i18nname(tmp, ln)}) else: for tmp in ['title', 'author', 'abstract', 'keyword', 'report number', 'journal', 'year', 'fulltext', 'reference']: values.append({'value' : tmp.replace(' ', ''), 'text' : get_field_i18nname(tmp, ln)}) return websearch_templates.tmpl_searchwithin_select( fieldname = fieldname, ln = ln, selected = value, values = values ) def create_searchexample(self): "Produces search example(s) for the current collection." out = "$collSearchExamples = getSearchExample(%d, $se);" % self.id return out def create_searchfor(self, aas=CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, ln=CFG_SITE_LANG): "Produces either Simple or Advanced 'Search for' box for the current collection." if aas == 1: return self.create_searchfor_advanced(ln) elif aas == 0: return self.create_searchfor_simple(ln) else: return self.create_searchfor_light(ln) def create_searchfor_light(self, ln=CFG_SITE_LANG): "Produces light 'Search for' box for the current collection." return websearch_templates.tmpl_searchfor_light( ln=ln, collection_id = self.name, collection_name=self.get_name(ln=ln), record_count=self.nbrecs, example_search_queries=self.get_example_search_queries(), ) def create_searchfor_simple(self, ln=CFG_SITE_LANG): "Produces simple 'Search for' box for the current collection." return websearch_templates.tmpl_searchfor_simple( ln=ln, collection_id = self.name, collection_name=self.get_name(ln=ln), record_count=self.nbrecs, middle_option = self.create_searchwithin_selection_box(ln=ln), ) def create_searchfor_advanced(self, ln=CFG_SITE_LANG): "Produces advanced 'Search for' box for the current collection." 
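# The advanced box aggregates three 'search within' selectors (f1, f2, f3) plus the search/sort/rank/display/format option boxes built by the helper methods above; each helper returns a ready HTML fragment that the template merely places.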
return websearch_templates.tmpl_searchfor_advanced( ln = ln, collection_id = self.name, collection_name=self.get_name(ln=ln), record_count=self.nbrecs, middle_option_1 = self.create_searchwithin_selection_box('f1', ln=ln), middle_option_2 = self.create_searchwithin_selection_box('f2', ln=ln), middle_option_3 = self.create_searchwithin_selection_box('f3', ln=ln), searchoptions = self.create_searchoptions(), sortoptions = self.create_sortoptions(ln), rankoptions = self.create_rankoptions(ln), displayoptions = self.create_displayoptions(ln), formatoptions = self.create_formatoptions(ln) ) def calculate_reclist(self): """Calculate, set and return the (reclist, reclist_with_nonpublic_subcolls) tuple for given collection.""" if self.calculate_reclist_run_already or str(self.dbquery).startswith("hostedcollection:"): # do we have to recalculate? return (self.reclist, self.reclist_with_nonpublic_subcolls) write_message("... calculating reclist of %s" % self.name, verbose=6) - reclist = HitSet() # will hold results for public sons only; good for storing into DB - reclist_with_nonpublic_subcolls = HitSet() # will hold results for both public and nonpublic sons; good for deducing total + reclist = intbitset() # will hold results for public sons only; good for storing into DB + reclist_with_nonpublic_subcolls = intbitset() # will hold results for both public and nonpublic sons; good for deducing total # number of documents if not self.dbquery: # A - collection does not have dbquery, so query recursively all its sons # that are either non-restricted or that have the same restriction rules for coll in self.get_sons(): coll_reclist, coll_reclist_with_nonpublic_subcolls = coll.calculate_reclist() if ((coll.restricted_p() is None) or (coll.restricted_p() == self.restricted_p())): # add this reclist ``for real'' only if it is public reclist.union_update(coll_reclist) reclist_with_nonpublic_subcolls.union_update(coll_reclist_with_nonpublic_subcolls) else: # B - collection does have dbquery, so compute it: # (note: explicitly remove DELETED records) if CFG_CERN_SITE: reclist = search_pattern_parenthesised(None, self.dbquery + \ ' -980__:"DELETED" -980__:"DUMMY"') else: reclist = search_pattern_parenthesised(None, self.dbquery + ' -980__:"DELETED"') reclist_with_nonpublic_subcolls = copy.deepcopy(reclist) # store the results: self.nbrecs = len(reclist_with_nonpublic_subcolls) self.reclist = reclist self.reclist_with_nonpublic_subcolls = reclist_with_nonpublic_subcolls # last but not least, update the speed-up flag: self.calculate_reclist_run_already = 1 # return the two sets: return (self.reclist, self.reclist_with_nonpublic_subcolls) def calculate_nbrecs_for_external_collection(self, timeout=CFG_EXTERNAL_COLLECTION_TIMEOUT): """Calculate the total number of records, aka nbrecs, for given external collection.""" #if self.calculate_reclist_run_already: # do we have to recalculate? #return self.nbrecs #write_message("... calculating nbrecs of external collection %s" % self.name, verbose=6) if external_collections_dictionary.has_key(self.name): engine = external_collections_dictionary[self.name] if engine.parser: self.nbrecs_tmp = engine.parser.parse_nbrecs(timeout) if self.nbrecs_tmp >= 0: return self.nbrecs_tmp # the parse_nbrecs() function returns negative values for some specific cases # maybe we can handle these specific cases, some warnings or something # for now the total number of records remains silently the same else: return self.nbrecs else: write_message("External collection %s does not have a parser!" 
% self.name, verbose=6) else: write_message("External collection %s not found!" % self.name, verbose=6) return 0 # last but not least, update the speed-up flag: #self.calculate_reclist_run_already = 1 def check_nbrecs_for_external_collection(self): """Check if the external collection has changed its total number of records, aka nbrecs. Returns True if the total number of records has changed and False if it's the same.""" write_message("*** self.nbrecs = %s / self.cal...ion = %s ***" % (str(self.nbrecs), str(self.calculate_nbrecs_for_external_collection())), verbose=6) write_message("*** self.nbrecs != self.cal...ion = %s ***" % (str(self.nbrecs != self.calculate_nbrecs_for_external_collection()),), verbose=6) return self.nbrecs != self.calculate_nbrecs_for_external_collection(CFG_HOSTED_COLLECTION_TIMEOUT_NBRECS) def set_nbrecs_for_external_collection(self): """Set this external collection's total number of records, aka nbrecs.""" if self.calculate_reclist_run_already: # do we have to recalculate? return write_message("... calculating nbrecs of external collection %s" % self.name, verbose=6) if self.nbrecs_tmp: self.nbrecs = self.nbrecs_tmp else: self.nbrecs = self.calculate_nbrecs_for_external_collection(CFG_HOSTED_COLLECTION_TIMEOUT_NBRECS) # last but not least, update the speed-up flag: self.calculate_reclist_run_already = 1 def update_reclist(self): "Update the record universe for given collection; nbrecs, reclist of the collection table." if self.update_reclist_run_already: # do we have to reupdate? return 0 write_message("... updating reclist of %s (%s recs)" % (self.name, self.nbrecs), verbose=6) sys.stdout.flush() try: run_sql("UPDATE collection SET nbrecs=%s, reclist=%s WHERE id=%s", (self.nbrecs, self.reclist.fastdump(), self.id)) self.reclist_updated_since_start = 1 except Error, e: print "Database Query Error %d: %s." % (e.args[0], e.args[1]) sys.exit(1) # last but not least, update the speed-up flag: self.update_reclist_run_already = 1 return 0 def get_datetime(var, format_string="%Y-%m-%d %H:%M:%S"): """Returns a date string according to the format string. It can handle normal date strings and shifts with respect to now.""" date = time.time() shift_re = re.compile("([-\+]{0,1})([\d]+)([dhms])") factors = {"d":24*3600, "h":3600, "m":60, "s":1} m = shift_re.match(var) if m: sign = m.groups()[0] == "-" and -1 or 1 factor = factors[m.groups()[2]] value = float(m.groups()[1]) date = time.localtime(date + sign * factor * value) date = time.strftime(format_string, date) else: date = time.strptime(var, format_string) date = time.strftime(format_string, date) return date def get_current_time_timestamp(): """Return timestamp corresponding to the current time.""" return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) def compare_timestamps_with_tolerance(timestamp1, timestamp2, tolerance=0): """Compare two timestamps TIMESTAMP1 and TIMESTAMP2, of the form '2005-03-31 17:37:26'. Optionally receives a TOLERANCE argument (in seconds). Return -1 if TIMESTAMP1 is less than TIMESTAMP2 minus TOLERANCE, 0 if they are equal within TOLERANCE limit, and 1 if TIMESTAMP1 is greater than TIMESTAMP2 plus TOLERANCE.
""" # remove any trailing .00 in timestamps: timestamp1 = re.sub(r'\.[0-9]+$', '', timestamp1) timestamp2 = re.sub(r'\.[0-9]+$', '', timestamp2) # first convert timestamps to Unix epoch seconds: timestamp1_seconds = calendar.timegm(time.strptime(timestamp1, "%Y-%m-%d %H:%M:%S")) timestamp2_seconds = calendar.timegm(time.strptime(timestamp2, "%Y-%m-%d %H:%M:%S")) # now compare them: if timestamp1_seconds < timestamp2_seconds - tolerance: return -1 elif timestamp1_seconds > timestamp2_seconds + tolerance: return 1 else: return 0 def get_database_last_updated_timestamp(): """Return last updated timestamp for collection-related and record-related database tables. """ database_tables_timestamps = [] database_tables_timestamps.append(get_table_update_time('bibrec')) database_tables_timestamps.append(get_table_update_time('bibfmt')) try: database_tables_timestamps.append(get_table_update_time('idxWORD%')) except ValueError: # There are no indexes in the database. That's OK. pass database_tables_timestamps.append(get_table_update_time('collection%')) database_tables_timestamps.append(get_table_update_time('portalbox')) database_tables_timestamps.append(get_table_update_time('field%')) database_tables_timestamps.append(get_table_update_time('format%')) database_tables_timestamps.append(get_table_update_time('rnkMETHODNAME')) return max(database_tables_timestamps) def get_cache_last_updated_timestamp(): """Return last updated cache timestamp.""" try: f = open(CFG_CACHE_LAST_UPDATED_TIMESTAMP_FILE, "r") except: return "1970-01-01 00:00:00" timestamp = f.read() f.close() return timestamp def set_cache_last_updated_timestamp(timestamp): """Set last updated cache timestamp to TIMESTAMP.""" try: f = open(CFG_CACHE_LAST_UPDATED_TIMESTAMP_FILE, "w") except: pass f.write(timestamp) f.close() return timestamp def main(): """Main that construct all the bibtask.""" task_init(authorization_action="runwebcoll", authorization_msg="WebColl Task Submission", description="""Description: webcoll updates the collection cache (record universe for a given collection plus web page elements) based on invenio.conf and DB configuration parameters. If the collection name is passed as an argument, only this collection's cache will be updated. If the recursive option is set as well, the collection's descendants will also be updated.\n""", help_specific_usage=" -c, --collection\t Update cache for the given " "collection only. [all]\n" " -r, --recursive\t Update cache for the given collection and all its\n" "\t\t\t descendants (to be used in combination with -c). [no]\n" " -f, --force\t\t Force update even if cache is up to date. [no]\n" " -p, --part\t\t Update only certain cache parts (1=reclist," " 2=webpage). [both]\n" " -l, --language\t Update pages in only certain language" " (e.g. fr,it,...). [all]\n", version=__revision__, specific_params=("c:rfp:l:", [ "collection=", "recursive", "force", "part=", "language=" ]), task_submit_elaborate_specific_parameter_fnc=task_submit_elaborate_specific_parameter, task_submit_check_options_fnc=task_submit_check_options, task_run_fnc=task_run_core) def task_submit_elaborate_specific_parameter(key, value, opts, args): """ Given the string key it checks it's meaning, eventually using the value. Usually it fills some key in the options dict. It must return True if it has elaborated the key, False, if it doesn't know that key. 
eg: if key in ['-n', '--number']: self.options['number'] = value return True return False """ if key in ("-c", "--collection"): task_set_option("collection", value) elif key in ("-r", "--recursive"): task_set_option("recursive", 1) elif key in ("-f", "--force"): task_set_option("force", 1) elif key in ("-p", "--part"): task_set_option("part", int(value)) elif key in ("-l", "--language"): languages = task_get_option("language", []) languages += value.split(',') for ln in languages: if ln not in CFG_SITE_LANGS: print 'ERROR: "%s" is not a recognized language code' % ln return False task_set_option("language", languages) else: return False return True def task_submit_check_options(): if task_has_option('collection'): coll = get_collection(task_get_option("collection")) if coll.id is None: print 'ERROR: Collection "%s" does not exist' % coll.name return False return True def task_run_core(): """ Reimplement to add the body of the task.""" ## ## ------->--->time--->------> ## (-1) | ( 0) | ( 1) ## | | | ## [T.db] | [T.fc] | [T.db] ## | | | ## |<-tol|tol->| ## ## the above is the compare_timestamps_with_tolerance result "diagram" ## [T.db] stands for the database timestamp and [T.fc] for the file cache timestamp ## ( -1, 0, 1) stand for the returned values ## tol stands for the tolerance in seconds ## ## When a record has been added to or deleted from one of the collections, T.db becomes greater than T.fc, ## and when webcoll runs it runs in full. It recalculates the reclists and nbrecs, and since it updates the ## collections db table it also updates T.db. T.fc is set to the moment the task started running, thus ## slightly before T.db (practically the time distance between the start of the task and the last call of ## update_reclist). Therefore when webcoll runs again, even if no database changes have taken place in the ## meanwhile, it runs in full (because compare_timestamps_with_tolerance returns 0). This time though, if ## no database changes have taken place, T.db remains the same while T.fc is updated, and as a result the ## next webcoll run will not be a full one. ## task_run_start_timestamp = get_current_time_timestamp() colls = [] # decide whether we need to run or not, by comparing last updated timestamps: write_message("Database timestamp is %s." % get_database_last_updated_timestamp(), verbose=3) write_message("Collection cache timestamp is %s." % get_cache_last_updated_timestamp(), verbose=3) if task_has_option("part"): write_message("Running cache update part %s only."
    if task_has_option("part"):
        write_message("Running cache update part %s only."
                % task_get_option("part"), verbose=3)
    if check_nbrecs_for_all_external_collections() or task_has_option("force") or \
    compare_timestamps_with_tolerance(get_database_last_updated_timestamp(),
                                      get_cache_last_updated_timestamp(),
                                      CFG_CACHE_LAST_UPDATED_TIMESTAMP_TOLERANCE) >= 0:
        ## either forced update was requested or cache is not up to date, so recreate it:
        # firstly, decide which collections to do:
        if task_has_option("collection"):
            coll = get_collection(task_get_option("collection"))
            colls.append(coll)
            if task_has_option("recursive"):
                r_type_descendants = coll.get_descendants(type='r')
                colls += r_type_descendants
                v_type_descendants = coll.get_descendants(type='v')
                colls += v_type_descendants
        else:
            res = run_sql("SELECT name FROM collection ORDER BY id")
            for row in res:
                colls.append(get_collection(row[0]))
        # secondly, update collection reclist cache:
        if task_get_option('part', 1) == 1:
            i = 0
            for coll in colls:
                i += 1
                write_message("%s / reclist cache update" % coll.name)
                if str(coll.dbquery).startswith("hostedcollection:"):
                    coll.set_nbrecs_for_external_collection()
                else:
                    coll.calculate_reclist()
                task_sleep_now_if_required()
                coll.update_reclist()
                task_update_progress("Part 1/2: done %d/%d" % (i, len(colls)))
                task_sleep_now_if_required(can_stop_too=True)
        # thirdly, update collection webpage cache:
        if task_get_option("part", 2) == 2:
            i = 0
            for coll in colls:
                i += 1
                write_message("%s / webpage cache update" % coll.name)
                coll.update_webpage_cache()
                task_update_progress("Part 2/2: done %d/%d" % (i, len(colls)))
                task_sleep_now_if_required(can_stop_too=True)
        # finally update the cache last updated timestamp:
        # (but only when all collections were updated, not when only
        # some of them were forced-updated as per admin's demand)
        if not task_has_option("collection"):
            set_cache_last_updated_timestamp(task_run_start_timestamp)
            write_message("Collection cache timestamp is set to %s." % get_cache_last_updated_timestamp(), verbose=3)
    else:
        ## cache up to date, we don't have to run
        write_message("Collection cache is up to date, no need to run.")
    ## we are done:
    return True

### okay, here we go:
if __name__ == '__main__':
    main()
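## (Note on the -p option above, illustrative: task_get_option('part', 1)
## returns the default 1 when -p was not given, and likewise for part 2,
## so with no -p option both the reclist and the webpage parts run; with
## "-p 1" only the reclist part runs, with "-p 2" only the webpage part.)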
diff --git a/modules/webstyle/lib/webinterface_handler_wsgi.py b/modules/webstyle/lib/webinterface_handler_wsgi.py
index 5c535d22c..67b3ca78a 100644
--- a/modules/webstyle/lib/webinterface_handler_wsgi.py
+++ b/modules/webstyle/lib/webinterface_handler_wsgi.py
@@ -1,638 +1,648 @@
# -*- coding: utf-8 -*-
## This file is part of Invenio.
## Copyright (C) 2009, 2010, 2011 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

"""mod_python->WSGI Framework"""

import sys
import os
import re
import cgi
import inspect
from fnmatch import fnmatch
from urlparse import urlparse, urlunparse
from wsgiref.validate import validator
from wsgiref.util import FileWrapper, guess_scheme

if __name__ != "__main__":
    # Chances are that we are inside mod_wsgi.
    ## You can't write to stdout in mod_wsgi, but some of our
    ## dependencies do this! (e.g. 4Suite)
    sys.stdout = sys.stderr

from invenio.session import get_session
from invenio.webinterface_handler import CFG_HAS_HTTPS_SUPPORT, CFG_FULL_HTTPS
from invenio.webinterface_layout import invenio_handler
from invenio.webinterface_handler_wsgi_utils import table, FieldStorage
from invenio.webinterface_handler_config import \
    HTTP_STATUS_MAP, SERVER_RETURN, OK, DONE, \
    HTTP_NOT_FOUND, HTTP_INTERNAL_SERVER_ERROR
from invenio.config import CFG_WEBDIR, CFG_SITE_LANG, \
    CFG_WEBSTYLE_HTTP_STATUS_ALERT_LIST, CFG_DEVEL_SITE, CFG_SITE_URL, \
    CFG_SITE_SECURE_URL
from invenio.errorlib import register_exception, get_pretty_traceback

## Static files are usually handled directly by the webserver (e.g. Apache)
## However in case WSGI is required to handle static files too (such
## as when running wsgiref simple server), then this flag can be
## turned on (it is done automatically by wsgi_handler_test).
CFG_WSGI_SERVE_STATIC_FILES = False

## Magic regexp to search for usage of CFG_SITE_URL within src/href or
## any src usage of an external website
_RE_HTTPS_REPLACES = re.compile(r"\b((?:src\s*=|url\s*\()\s*[\"']?)http\://", re.I)

def _http_replace_func(match):
    ## src external_site -> CFG_SITE_SECURE_URL/sslredirect/external_site
    return match.group(1) + CFG_SITE_SECURE_URL + '/sslredirect/'

_ESCAPED_CFG_SITE_URL = cgi.escape(CFG_SITE_URL, True)
_ESCAPED_CFG_SITE_SECURE_URL = cgi.escape(CFG_SITE_SECURE_URL, True)

def https_replace(html):
    html = html.replace(_ESCAPED_CFG_SITE_URL, _ESCAPED_CFG_SITE_SECURE_URL)
    return _RE_HTTPS_REPLACES.sub(_http_replace_func, html)
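## For instance (hypothetical values), with CFG_SITE_URL = "http://example.org"
## and CFG_SITE_SECURE_URL = "https://example.org", https_replace() turns
##   <img src="http://other.site/logo.png">
## into
##   <img src="https://example.org/sslredirect/other.site/logo.png">
## so that HTTPS pages do not trigger mixed-content warnings.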
class InputProcessed(object):
    """
    Auxiliary class used when reading input.
    @see: .
    """
    def read(self, *args):
        raise EOFError('The wsgi.input stream has already been consumed')
    readline = readlines = __iter__ = read

class SimulatedModPythonRequest(object):
    """
    mod_python-like request object.
    Minimal and cleaned-up implementation to make moving out of
    mod_python easy.
    @see:
    """
    def __init__(self, environ, start_response):
        self.__environ = environ
        self.__start_response = start_response
        self.__response_sent_p = False
        self.__buffer = ''
        self.__low_level_headers = []
        self.__headers = table(self.__low_level_headers)
        self.__headers.add = self.__headers.add_header
        self.__status = "200 OK"
        self.__filename = None
        self.__disposition_type = None
        self.__bytes_sent = 0
        self.__allowed_methods = []
        self.__cleanups = []
        self.headers_out = self.__headers
        ## See:
        self.__write = None
        self.__write_error = False
        self.__errors = environ['wsgi.errors']
        self.__headers_in = table([])
        self.__tainted = False
        self.__is_https = int(guess_scheme(self.__environ) == 'https')
        self.__replace_https = False
-
+        self.track_writings = False
+        self.__what_was_written = ""
        for key, value in environ.iteritems():
            if key.startswith('HTTP_'):
                self.__headers_in[key[len('HTTP_'):].replace('_', '-')] = value
        if environ.get('CONTENT_LENGTH'):
            self.__headers_in['content-length'] = environ['CONTENT_LENGTH']
        if environ.get('CONTENT_TYPE'):
            self.__headers_in['content-type'] = environ['CONTENT_TYPE']

    def get_wsgi_environ(self):
        return self.__environ

    def get_post_form(self):
        self.__tainted = True
        post_form = self.__environ.get('wsgi.post_form')
        input = self.__environ['wsgi.input']
        if (post_form is not None and post_form[0] is input):
            return post_form[2]
        # This must be done to avoid a bug in cgi.FieldStorage
        self.__environ.setdefault('QUERY_STRING', '')
        ## Video handler hack:
        uri = self.__environ['PATH_INFO']
        if uri.endswith("upload_video"):
            tmp_shared = True
        else:
            tmp_shared = False
        fs = FieldStorage(self, keep_blank_values=1, to_tmp_shared=tmp_shared)
        if fs.wsgi_input_consumed:
            new_input = InputProcessed()
            post_form = (new_input, input, fs)
            self.__environ['wsgi.post_form'] = post_form
            self.__environ['wsgi.input'] = new_input
        else:
            post_form = (input, None, fs)
            self.__environ['wsgi.post_form'] = post_form
        return fs

    def get_response_sent_p(self):
        return self.__response_sent_p

    def get_low_level_headers(self):
        return self.__low_level_headers

    def get_buffer(self):
        return self.__buffer

    def write(self, string, flush=1):
        if isinstance(string, unicode):
            self.__buffer += string.encode('utf8')
        else:
            self.__buffer += string
        if flush:
            self.flush()
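    # (Note, added for clarity: write() only appends to the internal buffer;
    # the data reaches the WSGI layer when flush() below runs, which also
    # sends the HTTP headers on first use.  When track_writings is enabled,
    # flush() additionally keeps a copy of everything written in
    # __what_was_written, exposed via the what_was_written property.)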
    def flush(self):
        self.send_http_header()
        if self.__buffer:
            self.__bytes_sent += len(self.__buffer)
            try:
                if not self.__write_error:
                    if self.__replace_https:
                        self.__write(https_replace(self.__buffer))
                    else:
                        self.__write(self.__buffer)
+                    if self.track_writings:
+                        if self.__replace_https:
+                            self.__what_was_written += https_replace(self.__buffer)
+                        else:
+                            self.__what_was_written += self.__buffer
            except IOError, err:
                if "failed to write data" in str(err) or "client connection closed" in str(err):
                    ## Let's just log this exception without alerting the admin:
                    register_exception(req=self)
                    self.__write_error = True ## This flag is there just
                    ## to not report later other errors to the admin.
                else:
                    raise
            self.__buffer = ''

    def set_content_type(self, content_type):
        self.__headers['content-type'] = content_type
        if self.__is_https:
            if content_type.startswith("text/html") or content_type.startswith("application/rss+xml"):
                self.__replace_https = True

    def get_content_type(self):
        return self.__headers['content-type']

    def send_http_header(self):
        if not self.__response_sent_p:
            self.__tainted = True
            if self.__allowed_methods and self.__status.startswith('405 ') or self.__status.startswith('501 '):
                self.__headers['Allow'] = ', '.join(self.__allowed_methods)
            ## See:
            #print self.__low_level_headers
            self.__write = self.__start_response(self.__status, self.__low_level_headers)
            self.__response_sent_p = True
            #print "Response sent: %s" % self.__headers

    def get_unparsed_uri(self):
        return '?'.join([self.__environ['PATH_INFO'], self.__environ['QUERY_STRING']])

    def get_uri(self):
        return self.__environ['PATH_INFO']

    def get_headers_in(self):
        return self.__headers_in

    def get_subprocess_env(self):
        return self.__environ

    def add_common_vars(self):
        pass

    def get_args(self):
        return self.__environ['QUERY_STRING']

    def get_remote_ip(self):
        if 'X-FORWARDED-FOR' in self.__headers_in and \
           self.__headers_in.get('X-FORWARDED-SERVER', '') == \
           self.__headers_in.get('X-FORWARDED-HOST', '') == \
           urlparse(CFG_SITE_URL)[1]:
            ip = self.__headers_in['X-FORWARDED-FOR'].split(',')[0]
            if ip:
                return ip
        return self.__environ.get('REMOTE_ADDR')

    def get_remote_host(self):
        return self.__environ.get('REMOTE_HOST')

    def get_header_only(self):
        return self.__environ['REQUEST_METHOD'] == 'HEAD'

    def set_status(self, status):
        self.__status = '%s %s' % (status, HTTP_STATUS_MAP.get(int(status), 'Explanation not available'))

    def get_status(self):
        return int(self.__status.split(' ')[0])

    def get_wsgi_status(self):
        return self.__status

    def sendfile(self, path, offset=0, the_len=-1):
        try:
            self.send_http_header()
            file_to_send = open(path)
            file_to_send.seek(offset)
            file_wrapper = FileWrapper(file_to_send)
            count = 0
            if the_len < 0:
                for chunk in file_wrapper:
                    count += len(chunk)
                    self.__bytes_sent += len(chunk)
                    self.__write(chunk)
            else:
                for chunk in file_wrapper:
                    if the_len >= len(chunk):
                        the_len -= len(chunk)
                        count += len(chunk)
                        self.__bytes_sent += len(chunk)
                        self.__write(chunk)
                    else:
                        count += the_len
                        self.__bytes_sent += the_len
                        self.__write(chunk[:the_len])
                        break
        except IOError, err:
            if "failed to write data" in str(err) or "client connection closed" in str(err):
                ## Let's just log this exception without alerting the admin:
                register_exception(req=self)
            else:
                raise
        return self.__bytes_sent

    def set_content_length(self, content_length):
        if content_length is not None:
            self.__headers['content-length'] = str(content_length)
        else:
            del self.__headers['content-length']

    def is_https(self):
        return self.__is_https

    def get_method(self):
        return self.__environ['REQUEST_METHOD']

    def get_hostname(self):
        return self.__environ.get('HTTP_HOST', '')

    def set_filename(self, filename):
        self.__filename = filename
        if self.__disposition_type is None:
            self.__disposition_type = 'inline'
        self.__headers['content-disposition'] = '%s; filename=%s' % (self.__disposition_type, self.__filename)

    def set_encoding(self, encoding):
        if encoding:
            self.__headers['content-encoding'] = str(encoding)
        else:
            del self.__headers['content-encoding']

    def get_bytes_sent(self):
        return self.__bytes_sent

    def log_error(self, message):
        self.__errors.write(message.strip() + '\n')

    def get_content_type_set_p(self):
        return bool(self.__headers['content-type'])

    def allow_methods(self, methods, reset=0):
        if reset:
            self.__allowed_methods = []
        self.__allowed_methods += [method.upper().strip() for method in methods]
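    # (Added note: allow_methods() feeds the Allow response header; when the
    # status is later set to 405 or 501, send_http_header() above emits e.g.
    # "Allow: GET, POST" from the methods registered here.)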
    def get_allowed_methods(self):
        return self.__allowed_methods

    def readline(self, hint=None):
        try:
            return self.__environ['wsgi.input'].readline(hint)
        except TypeError:
            ## the hint param is not part of the WSGI PEP, although
            ## it's great to exploit it when reading FORMs
            ## with large files, in order to avoid filling up the memory.
            ## Too bad it's not there :-(
            return self.__environ['wsgi.input'].readline()

    def readlines(self, hint=None):
        return self.__environ['wsgi.input'].readlines(hint)

    def read(self, hint=None):
        return self.__environ['wsgi.input'].read(hint)

    def register_cleanup(self, callback, data=None):
        self.__cleanups.append((callback, data))

    def get_cleanups(self):
        return self.__cleanups

    def get_referer(self):
        return self.headers_in.get('referer')

+    def get_what_was_written(self):
+        return self.__what_was_written
+
    def __str__(self):
        from pprint import pformat
        out = ""
        for key in dir(self):
            try:
                if not callable(getattr(self, key)) and not key.startswith("_SimulatedModPythonRequest") and not key.startswith('__'):
                    out += 'req.%s: %s\n' % (key, pformat(getattr(self, key)))
            except:
                pass
        return out

    def get_original_wsgi_environment(self):
        """
        Return the original WSGI environment used to initialize this request
        object.
        @return: environ, start_response
        @raise AssertionError: in case the environment has been altered, i.e.
            either the input has been consumed or something has already been
            written to the output.
        """
        assert not self.__tainted, "The original WSGI environment is tainted since at least req.write or req.form has been used."
        return self.__environ, self.__start_response

    content_type = property(get_content_type, set_content_type)
    unparsed_uri = property(get_unparsed_uri)
    uri = property(get_uri)
    headers_in = property(get_headers_in)
    subprocess_env = property(get_subprocess_env)
    args = property(get_args)
    header_only = property(get_header_only)
    status = property(get_status, set_status)
    method = property(get_method)
    hostname = property(get_hostname)
    filename = property(fset=set_filename)
    encoding = property(fset=set_encoding)
    bytes_sent = property(get_bytes_sent)
    content_type_set_p = property(get_content_type_set_p)
    allowed_methods = property(get_allowed_methods)
    response_sent_p = property(get_response_sent_p)
    form = property(get_post_form)
    remote_ip = property(get_remote_ip)
    remote_host = property(get_remote_host)
    referer = property(get_referer)
+    what_was_written = property(get_what_was_written)

def alert_admin_for_server_status_p(status, referer):
    """
    Check the configuration variable
    CFG_WEBSTYLE_HTTP_STATUS_ALERT_LIST to see if the exception should
    be registered and the admin should be alerted.
    """
    status = str(status)
    for pattern in CFG_WEBSTYLE_HTTP_STATUS_ALERT_LIST:
        pattern = pattern.lower()
        must_have_referer = False
        if pattern.endswith('r'):
            ## e.g. "404 r"
            must_have_referer = True
            pattern = pattern[:-1].strip() ## -> "404"
        if fnmatch(status, pattern) and (not must_have_referer or referer):
            return True
    return False
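## (Illustrative: with CFG_WEBSTYLE_HTTP_STATUS_ALERT_LIST set to, say,
## ['404 r', '5*'], a 404 alerts the admin only when the request carries a
## referer, while any 5xx status always does.)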
""" ## Needed for mod_wsgi, see: req = SimulatedModPythonRequest(environ, start_response) #print 'Starting mod_python simulation' try: try: possible_module, possible_handler = is_mp_legacy_publisher_path(environ['PATH_INFO']) if possible_module is not None: mp_legacy_publisher(req, possible_module, possible_handler) elif CFG_WSGI_SERVE_STATIC_FILES: possible_static_path = is_static_path(environ['PATH_INFO']) if possible_static_path is not None: from invenio.bibdocfile import stream_file stream_file(req, possible_static_path) else: ret = invenio_handler(req) else: ret = invenio_handler(req) req.flush() except SERVER_RETURN, status: status = int(str(status)) if status not in (OK, DONE): req.status = status req.headers_out['content-type'] = 'text/html' admin_to_be_alerted = alert_admin_for_server_status_p(status, req.headers_in.get('referer')) if admin_to_be_alerted: register_exception(req=req, alert_admin=True) if not req.response_sent_p: start_response(req.get_wsgi_status(), req.get_low_level_headers(), sys.exc_info()) return generate_error_page(req, admin_to_be_alerted) else: req.flush() except: register_exception(req=req, alert_admin=True) if not req.response_sent_p: req.status = HTTP_INTERNAL_SERVER_ERROR req.headers_out['content-type'] = 'text/html' start_response(req.get_wsgi_status(), req.get_low_level_headers(), sys.exc_info()) if CFG_DEVEL_SITE: return ["

    %s
    " % cgi.escape(get_pretty_traceback(req=req, exc_info=sys.exc_info()))] from cgitb import html return [html(sys.exc_info())] return generate_error_page(req) else: return generate_error_page(req, page_already_started=True) finally: for (callback, data) in req.get_cleanups(): callback(data) return [] def generate_error_page(req, admin_was_alerted=True, page_already_started=False): """ Returns an iterable with the error page to be sent to the user browser. """ from invenio.webpage import page from invenio import template webstyle_templates = template.load('webstyle') ln = req.form.get('ln', CFG_SITE_LANG) if page_already_started: return [webstyle_templates.tmpl_error_page(status=req.get_wsgi_status(), ln=ln, admin_was_alerted=admin_was_alerted)] else: return [page(title=req.get_wsgi_status(), body=webstyle_templates.tmpl_error_page(status=req.get_wsgi_status(), ln=ln, admin_was_alerted=admin_was_alerted), language=ln, req=req)] def is_static_path(path): """ Returns True if path corresponds to an exsting file under CFG_WEBDIR. @param path: the path. @type path: string @return: True if path corresponds to an exsting file under CFG_WEBDIR. @rtype: bool """ path = os.path.abspath(CFG_WEBDIR + path) if path.startswith(CFG_WEBDIR) and os.path.isfile(path): return path return None def is_mp_legacy_publisher_path(path): """ Checks path corresponds to an exsting Python file under CFG_WEBDIR. @param path: the path. @type path: string @return: the path of the module to load and the function to call there. @rtype: tuple """ path = path.split('/') for index, component in enumerate(path): if component.endswith('.py'): possible_module = os.path.abspath(CFG_WEBDIR + os.path.sep + os.path.sep.join(path[:index + 1])) possible_handler = '/'.join(path[index + 1:]).strip() if possible_handler.startswith('_'): return None, None if not possible_handler: possible_handler = 'index' if os.path.exists(possible_module) and possible_module.startswith(CFG_WEBDIR): return (possible_module, possible_handler) else: return None, None def mp_legacy_publisher(req, possible_module, possible_handler): """ mod_python legacy publisher minimum implementation. """ the_module = open(possible_module).read() module_globals = {} exec(the_module, module_globals) if possible_handler in module_globals and callable(module_globals[possible_handler]): from invenio.webinterface_handler import _check_result ## req is the required first parameter of any handler expected_args = list(inspect.getargspec(module_globals[possible_handler])[0]) if not expected_args or 'req' != expected_args[0]: ## req was not the first argument. Too bad! raise SERVER_RETURN, HTTP_NOT_FOUND ## the req.form must be casted to dict because of Python 2.4 and earlier ## otherwise any object exposing the mapping interface can be ## used with the magic ** form = dict(req.form) for key, value in form.items(): ## FIXME: this is a backward compatibility workaround ## because most of the old administration web handler ## expect parameters to be of type str. ## When legacy publisher will be removed all this ## pain will go away anyway :-) if isinstance(value, str): form[key] = str(value) else: ## NOTE: this is a workaround for e.g. legacy webupload ## that is still using legacy publisher and expect to ## have a file (Field) instance instead of a string. 
def mp_legacy_publisher(req, possible_module, possible_handler):
    """
    mod_python legacy publisher minimum implementation.
    """
    the_module = open(possible_module).read()
    module_globals = {}
    exec(the_module, module_globals)
    if possible_handler in module_globals and callable(module_globals[possible_handler]):
        from invenio.webinterface_handler import _check_result
        ## req is the required first parameter of any handler
        expected_args = list(inspect.getargspec(module_globals[possible_handler])[0])
        if not expected_args or 'req' != expected_args[0]:
            ## req was not the first argument. Too bad!
            raise SERVER_RETURN, HTTP_NOT_FOUND
        ## the req.form must be casted to dict because of Python 2.4 and earlier,
        ## otherwise any object exposing the mapping interface could be
        ## used with the magic **
        form = dict(req.form)
        for key, value in form.items():
            ## FIXME: this is a backward compatibility workaround
            ## because most of the old administration web handlers
            ## expect parameters to be of type str.
            ## When the legacy publisher is removed all this
            ## pain will go away anyway :-)
            if isinstance(value, str):
                form[key] = str(value)
            else:
                ## NOTE: this is a workaround for e.g. legacy webupload
                ## that is still using the legacy publisher and expects to
                ## have a file (Field) instance instead of a string.
                form[key] = value

        if (CFG_FULL_HTTPS or CFG_HAS_HTTPS_SUPPORT and get_session(req).need_https) and not req.is_https():
            from invenio.urlutils import redirect_to_url
            # We need to isolate the part of the URI that is after
            # CFG_SITE_URL, and append that to our CFG_SITE_SECURE_URL.
            original_parts = urlparse(req.unparsed_uri)
            plain_prefix_parts = urlparse(CFG_SITE_URL)
            secure_prefix_parts = urlparse(CFG_SITE_SECURE_URL)

            # Compute the new path
            plain_path = original_parts[2]
            plain_path = secure_prefix_parts[2] + \
                         plain_path[len(plain_prefix_parts[2]):]

            # ...and recompose the complete URL
            final_parts = list(secure_prefix_parts)
            final_parts[2] = plain_path
            final_parts[-3:] = original_parts[-3:]

            target = urlunparse(final_parts)
            redirect_to_url(req, target)

        try:
            return _check_result(req, module_globals[possible_handler](req, **form))
        except TypeError, err:
            if ("%s() got an unexpected keyword argument" % possible_handler) in str(err) or ('%s() takes at least' % possible_handler) in str(err):
                inspected_args = inspect.getargspec(module_globals[possible_handler])
                expected_args = list(inspected_args[0])
                expected_defaults = list(inspected_args[3])
                expected_args.reverse()
                expected_defaults.reverse()
                register_exception(req=req, prefix="Wrong GET parameter set in calling a legacy publisher handler for %s: expected_args=%s, found_args=%s" % (possible_handler, repr(expected_args), repr(req.form.keys())), alert_admin=CFG_DEVEL_SITE)
                cleaned_form = {}
                for index, arg in enumerate(expected_args):
                    if arg == 'req':
                        continue
                    if index < len(expected_defaults):
                        cleaned_form[arg] = form.get(arg, expected_defaults[index])
                    else:
                        cleaned_form[arg] = form.get(arg, None)
                return _check_result(req, module_globals[possible_handler](req, **cleaned_form))
            else:
                raise
    else:
        raise SERVER_RETURN, HTTP_NOT_FOUND

def check_wsgiref_testing_feasability():
    """
    In order to use wsgiref for running Invenio, CFG_SITE_URL and
    CFG_SITE_SECURE_URL must not use HTTPS because SSL is not supported.
    """
    if CFG_SITE_URL.lower().startswith('https'):
        print >> sys.stderr, """
ERROR: SSL is not supported by the wsgiref simple server implementation.
Please set CFG_SITE_URL not to start with "https".
Currently CFG_SITE_URL is set to: "%s".""" % CFG_SITE_URL
        sys.exit(1)
    if CFG_SITE_SECURE_URL.lower().startswith('https'):
        print >> sys.stderr, """
ERROR: SSL is not supported by the wsgiref simple server implementation.
Please set CFG_SITE_SECURE_URL not to start with "https".
Currently CFG_SITE_SECURE_URL is set to: "%s".""" % CFG_SITE_SECURE_URL
        sys.exit(1)

def wsgi_handler_test(port=80):
    """
    Simple WSGI testing environment based on wsgiref.
    """
    from wsgiref.simple_server import make_server
    global CFG_WSGI_SERVE_STATIC_FILES
    CFG_WSGI_SERVE_STATIC_FILES = True
    check_wsgiref_testing_feasability()
    validator_app = validator(application)
    httpd = make_server('', port, validator_app)
    print "Serving on port %s..." % port
    httpd.serve_forever()

def main():
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option('-t', '--test', action='store_true',
                      dest='test', default=False,
                      help="Run a WSGI test server via wsgiref (not using Apache).")
    parser.add_option('-p', '--port', type='int', dest='port', default='80',
                      help="The port where the WSGI test server will listen. [80]")
    (options, args) = parser.parse_args()
    if options.test:
        wsgi_handler_test(options.port)
    else:
        parser.print_help()

if __name__ == "__main__":
    main()
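## (Example, illustrative: to try Invenio without Apache one can run
##   $ python webinterface_handler_wsgi.py --test --port 8080
## which serves the application, static files included, via wsgiref;
## check_wsgiref_testing_feasability() refuses HTTPS site URLs first.)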
