bibclassify_daemon.py
# -*- coding: utf-8 -*-
##
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
BibClassify daemon.
FIXME: the code below requires collection table to be updated to add column:
clsMETHOD_fk mediumint(9) unsigned NOT NULL,
This is not clean and should be fixed.
"""
__revision__ = "$Id$"
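
# The FIXME in the module docstring refers to a schema change that has to be
# applied by hand.  A minimal sketch of the corresponding statement (assumption:
# the exact table layout may differ between installations):
#   ALTER TABLE collection
#       ADD COLUMN clsMETHOD_fk mediumint(9) unsigned NOT NULL;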

import os
import sys
import time
from os import remove, listdir

from invenio.dbquery import run_sql
from invenio.bibtask import task_init, write_message, get_datetime, \
     task_set_option, task_get_option, task_get_task_param, \
     task_update_status, task_update_progress
from invenio.bibclassifylib import generate_keywords_rdf
from invenio.config import *
from invenio.intbitset import intbitset
from invenio.search_engine import get_collection_reclist
from invenio.bibdocfile import BibRecDocs

def get_recids_foreach_ontology():
    """Return a list of dictionaries, each one holding a collection, its
    corresponding ontology and the set of records belonging to that collection
    that were modified since the last run."""
    rec_onts = []
    # Map each ontology (clsMETHOD) to the collections it is attached to.
    res = run_sql("""SELECT clsMETHOD.name, last_updated, collection.name
        FROM clsMETHOD JOIN collection_clsMETHOD ON clsMETHOD.id=id_clsMETHOD
        JOIN collection ON id_collection=collection.id""")
    for ontology, date_last_run, collection in res:
        recs = get_collection_reclist(collection)
        if recs:
            if not date_last_run:
                date_last_run = '0000-00-00'
            # Keep only the records modified since the last run of the daemon.
            modified_records = intbitset(run_sql(
                "SELECT id FROM bibrec WHERE modification_date >= %s",
                (date_last_run, )))
            recs &= modified_records
            if recs:
                rec_onts.append({
                    'ontology': ontology,
                    'collection': collection,
                    'recIDs': recs
                })
    return rec_onts
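
# Shape of the structure returned by get_recids_foreach_ontology()
# (illustrative values only; 'recIDs' is an intbitset):
#   [{'ontology': 'HEP', 'collection': 'Articles', 'recIDs': intbitset([1, 2, 3])}]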

def update_date_of_last_run():
    """Store in the clsMETHOD table the time of the last run of the daemon."""
    run_sql("UPDATE clsMETHOD SET last_updated=NOW()")

def task_run_core():
    """Run analyse_documents for each (ontology, collection, record IDs) set."""
    outfilename = CFG_TMPDIR + "/bibclassifyd_%s.xml" % time.strftime("%Y%m%d%H%M%S", time.localtime())
    outfiledesc = open(outfilename, "w")
    coll_counter = 0
    # The output file is a MARCXML collection that is later handed to bibupload.
    print >> outfiledesc, """<?xml version="1.0" encoding="UTF-8"?>"""
    print >> outfiledesc, """<collection xmlns="http://www.loc.gov/MARC21/slim">"""
    for onto_rec in get_recids_foreach_ontology():
        write_message('Applying taxonomy %s to collection %s (%s records)' %
            (onto_rec['ontology'], onto_rec['collection'], len(onto_rec['recIDs'])))
        if onto_rec['recIDs']:
            coll_counter += analyse_documents(onto_rec['recIDs'],
                onto_rec['ontology'], onto_rec['collection'],
                outfilename, outfiledesc)
    print >> outfiledesc, '</collection>'
    outfiledesc.close()
    if coll_counter:
        # Upload the generated keyword fields in correct (-c) mode.
        cmd = "%s/bibupload -n -c '%s' " % (CFG_BINDIR, outfilename)
        errcode = 0
        try:
            errcode = os.system(cmd)
        except OSError, e:
            print 'command ' + cmd + ' failed ', e
        if errcode != 0:
            write_message("WARNING, %s failed, error code is %s" % (cmd, errcode))
            return 0
    update_date_of_last_run()
    return 1
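
# The MARCXML handed to bibupload looks roughly as follows (the keyword
# datafields come from generate_keywords_rdf, so their exact tags depend on
# the BibClassify configuration):
#   <?xml version="1.0" encoding="UTF-8"?>
#   <collection xmlns="http://www.loc.gov/MARC21/slim">
#     <record>
#       <controlfield tag="001">123</controlfield>
#       ... keyword datafields ...
#     </record>
#   </collection>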

def analyse_documents(recs, ontology, collection, outfilename, outfiledesc):
    """Parse the documents attached to the given records of a collection with
    the corresponding ontology, and write the extracted keywords to the output
    file."""
    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    did_something = False
    counter = 1
    total = len(recs)
    # store running time:
    # see which records we'll have to process:
    #recIDs = get_recIDs_of_modified_records_since_last_run()
    temp_text = None
    if recs:
        # process records:
        cmd = None
        path = None
        temp_text = CFG_TMPDIR + '/bibclassify.pdftotext.' + str(os.getpid())
        for rec in recs:
            bibdocfiles = BibRecDocs(rec).list_latest_files()
            # Only records with at least one attached PDF are processed.
            found_one_pdf = False
            for bibdocfile in bibdocfiles:
                if bibdocfile.get_format() == '.pdf':
                    found_one_pdf = True
            if found_one_pdf:
                did_something = True
                print >> outfiledesc, '<record>'
                print >> outfiledesc, """<controlfield tag="001">%(recID)s</controlfield>""" % ({'recID': rec})
                for f in bibdocfiles:
                    if f.get_format() == '.pdf':
                        # Convert the PDF to plain text before keyword extraction.
                        cmd = "%s '%s' '%s'" % (CFG_PATH_PDFTOTEXT, f.get_full_path(), temp_text)
                    else:
                        write_message("Can't parse file %s." % f.get_full_path(), verbose=3)
                        continue
                    errcode = os.system(cmd)
                    if errcode != 0 or not os.path.exists("%s" % temp_text):
                        write_message("Error while executing command %s. Error code was: %s" % (cmd, errcode))
                    write_message('Generating keywords for %s' % f.get_full_path())
                    print >> outfiledesc, generate_keywords_rdf(temp_text,
                        CFG_ETCDIR + '/bibclassify/' + ontology + '.rdf',
                        2, 70, 25, 0, False, verbose=0, ontology=ontology)
                print >> outfiledesc, '</record>'
            task_update_progress("Done %s of %s for collection %s." % (counter, total, collection))
            counter += 1
    else:
        write_message("Nothing to be done, move along")
    return did_something

def cleanup_tmp():
    """Remove old temporary files created by this module."""
    for f in listdir(CFG_TMPDIR):
        if 'bibclassify' in f:
            remove(CFG_TMPDIR + '/' + f)

def main():
    """Constructs the bibclassifyd bibtask."""
    cleanup_tmp()
    task_init(authorization_action='runbibclassify',
              authorization_msg="BibClassify Task Submission",
              description="""Examples:
    %s -u admin
""" % (sys.argv[0],),
              version=__revision__,
              task_run_fnc=task_run_core)

if __name__ == '__main__':
    main()
# FIXME: one can have more than one ontology in clsMETHOD.
# bibclassifyd -w HEP,Pizza
# FIXME: add more CLI options like bibindex ones, e.g.
# bibclassifyd -a -i 10-20
# FIXME: outfiledesc can be multiple files, e.g. when processing
# 100000 records it is good to store results by 1000 records
# (see oaiharvest)
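
# Typical invocation, as shown in the task description above (bibtasks are
# normally scheduled and run through bibsched):
#   $ bibclassifyd -u admin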
