## $Id$
## BibWords bibliographic data, reference and fulltext indexing utility.
## This file is part of the CERN Document Server Software (CDSware).
## Copyright (C) 2002 CERN.
##
## The CDSware is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## The CDSware is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDSware; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
## read config variables:
#include "config.wml"
#include "configbis.wml"
## start Python:
<protect>#!</protect><PYTHON>
<protect>## $Id$</protect>
<protect>## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES.</protect>
"""
BibWords bibliographic data, reference and fulltext indexing utility.
Usage: bibwords [options]
Examples:
bibwords -a -i 234-250,293,300-500 -u admin@cdsware
bibwords -a -w author,fulltext -M 8192 -d
bibwords -x -m +4d -A on --flush=10000
Command options:
-a, --add add or update selected records
-x, --del delete selected records
-c, --check check consistency for all records in the table(s)
-r, --repair try to repair all records in the table(s)
-i, --id=low[-high] select according to record recID.
-m, --modified=from[,to] select according to modification date
-w, --wordsindex=w1[,w2] words indexes to consider (all)
Specific options:
-M, --maxmem=XXX maximum memory usage in kB (no limit)
-f, --flush=NNN full consistent table flush after NNN records (5000)
Scheduler options:
-u, --user=USER user name to store task, password needed
-p, --sleeptime=SLEEP time after which to repeat tasks (no)
e.g.: 1s, 30m, 24h, 7d
-t, --time=DATE moment for the task to be active (now)
e.g.: +15s, 5m, 3h , 2002-10-27 13:57:26
General options:
-h, --help print this help
-v, --version print version
-d, --debug log debugging information
-V, --verbose log extra (verbose) debugging information
"""
## fill config variables:
pylibdir = "<LIBDIR>/python"
## programs used to convert fulltext files to text:
conv_programs = {#"ps": ["<PSTOTEXT>","<PSTOASCII>"], # switched off at the moment, since PDF is faster
#"ps.gz": ["<PSTOTEXT>","<PSTOASCII>"],
"pdf": ["<PDFTOTEXT>","<PSTOTEXT>","<PSTOASCII>"],
"doc": ["<ANTIWORD>","<CATDOC>","<WVTEXT>"],
"ppt": ["<PPTHTML>"],
"xls": ["<XLHTML>"]
}
## helper programs used if the above programs convert only to html or other intermediate file formats:
conv_programs_helpers = {"html": "<HTMLTOTEXT>",
"gz": "<GZIP>" }
## okay, rest of the Python code goes below
#######
## version number:
__version__ = "$Id$"
## import interesting modules:
try:
from marshal import loads,dumps
from zlib import compress,decompress
from string import split,translate,lower,upper
import getopt
import string
import os
import re
import sys
import time
import MySQLdb
import Numeric
import urllib
import signal
import threading
import unicodedata
import traceback
import cStringIO
except ImportError, e:
print "Error: %s" % e
import sys
sys.exit(1)
try:
sys.path.append('%s' % pylibdir)
from cdsware.config import *
from cdsware.search_engine_config import cfg_max_recID
from cdsware.dbquery import run_sql
except ImportError, e:
print "Error: %s" % e
import sys
sys.exit(1)
## import optional modules:
try:
import psyco
psyco.bind(get_words_from_phrase)
psyco.bind(merge_with_old_recIDs)
psyco.bind(serialize_via_numeric_array)
psyco.bind(serialize_via_marshal)
psyco.bind(deserialize_via_numeric_array)
psyco.bind(deserialize_via_marshal)
except:
pass
## override urllib's default password-asking behaviour:
class MyFancyURLopener(urllib.FancyURLopener):
def prompt_user_passwd(self, host, realm):
# supply some dummy credentials by default
return ("mysuperuser", "mysuperpass")
def http_error_401(self, url, fp, errcode, errmsg, headers):
# do not bother with protected pages
raise IOError, (999, 'unauthorized access')
return None
#urllib._urlopener = MyFancyURLopener()
def Log(msg, type=""):
"Logs a message to the database."
if type == "Debug" and not opts_dict["debug"]:
return
if type == "Verbose" and not (opts_dict["debug"] or opts_dict["verbose"]):
return
if type == "Error":
out = sys.stderr
else:
out = sys.stdout
out.write(time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime()))
out.write(str(msg))
if type:
out.write(" (" + type + ")")
out.write("\n")
out.flush()
## precompile some often-used regexp for speed reasons:
re_subfields = re.compile('\$\$\w')
nb_char_in_line = 50 # for verbose pretty printing
chunksize = 1000 # default size of chunks that the records will be treated by
wordTables = []
taskid = -1
base_process_size = 4500 # process base size
opts_dict = {} # will hold task options
## build iso-latin-1 to "undotted" ascii translation table
def build_table_latin1_to_ascii():
"""Builds translation table from ISO Latin-1 into ASCII.
For example, 'félin' gets translated into 'felin', etc.
Suitable for search string pattern replacement. Much faster than regexps."""
table = range(256)
for i in table:
x = unicodedata.decomposition(unichr(i))
if x and x[0] == "0":
table[i] = int(x.split()[0], 16)
return string.join(map(chr, table), "")
# build conversion table:
table_latin1_to_ascii = build_table_latin1_to_ascii()
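## Illustrative example (added sketch): with the table built above, the
## asciify_accented_letters() helper defined further below maps ISO-8859-1
## accented letters onto their unaccented ASCII base letters, e.g. 'félin'
## becomes 'felin' and 'Peña' becomes 'Pena'.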
## Dictionary merging functions
def intersection(dict1, dict2):
"Returns intersection of the two dictionaries."
int_dict = {}
if len(dict1) > len(dict2):
for e in dict2:
if dict1.has_key(e):
int_dict[e] = 1
else:
for e in dict1:
if dict2.has_key(e):
int_dict[e] = 1
return int_dict
def union(dict1, dict2):
"Returns union of the two dictionaries."
union_dict = {}
for e in dict1.keys():
union_dict[e] = 1
for e in dict2.keys():
union_dict[e] = 1
return union_dict
def diff(dict1, dict2):
"Returns dict1 - dict2."
diff_dict = {}
for e in dict1.keys():
if not dict2.has_key(e):
diff_dict[e] = 1
return diff_dict
def list_union(list1, list2):
"Returns union of the two lists."
union_dict = {}
for e in list1:
union_dict[e] = 1
for e in list2:
union_dict[e] = 1
return union_dict.keys()
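## Illustrative examples (sketch) of the dictionary merging helpers above:
##   union({1: 1, 2: 1}, {2: 1, 3: 1})        -> {1: 1, 2: 1, 3: 1}
##   intersection({1: 1, 2: 1}, {2: 1, 3: 1}) -> {2: 1}
##   diff({1: 1, 2: 1}, {2: 1})               -> {1: 1}
##   list_union([1, 2], [2, 3])               -> [1, 2, 3] (key order not guaranteed)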
## MARC-21 tag/field access functions
def get_fieldvalues(recID, tag):
"""Returns list of values of the MARC-21 'tag' fields for the record
'recID'."""
out = []
bibXXx = "bib" + tag[0] + tag[1] + "x"
bibrec_bibXXx = "bibrec_" + bibXXx
query = "SELECT value FROM %s AS b, %s AS bb WHERE bb.id_bibrec=%s AND bb.id_bibxxx=b.id AND tag LIKE '%s'" \
% (bibXXx, bibrec_bibXXx, recID, tag)
res = run_sql(query)
for row in res:
out.append(row[0])
return out
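## For example, get_fieldvalues(123, '700__a') looks up the tables bib70x and
## bibrec_bib70x and returns the values of all '700__a' fields attached to the
## (hypothetical) record 123.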
def get_field_tags(field):
"""Returns a list of MARC tags for the field code 'field'.
Returns empty list in case of error.
Example: field='author', output=['100__%','700__%']."""
out = []
query = """SELECT t.value FROM tag AS t, field_tag AS ft, field AS f
WHERE f.code='%s' AND ft.id_field=f.id AND t.id=ft.id_tag
ORDER BY ft.score DESC""" % field
res = run_sql(query)
for row in res:
out.append(row[0])
return out
## Fulltext word extraction functions
def separator_table(separators):
"""Returns a translation table to convert the characters according to
the separators regex into spaces."""
expr = re.compile(separators)
table = range(256)
for i in table:
x = unichr(i)
if expr.match(x):
table[i] = ord(' ')
return string.join(map(chr, table), "")
# Standard separator table
word_separator_table = separator_table("[\s]")
def get_fulltext_urls_from_html_page(htmlpagebody):
"""Parses htmlpagebody data looking for url_directs referring to
probable fulltexts.
Returns an array of (ext,url_direct) to fulltexts.
Note: it looks for file format extensions as defined by global
'conv_programs' structure.
"""
out = []
for ext in conv_programs.keys():
expr = re.compile( r"(\"http://[\w]+\.+[\w]+[^\"'><]*\.)(" + \
ext + r")\"")
match = expr.search(htmlpagebody)
if match:
out.append([ext,match.group()])
return out
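## Illustrative example (sketch): an HTML body containing the quoted link
## "http://doc.cern.ch/archive/report.pdf" (a hypothetical file) would yield
## [["pdf", '"http://doc.cern.ch/archive/report.pdf"']]; the surrounding double
## quotes are part of the match and are stripped later in get_words_from_fulltext().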
def get_words_from_fulltext(url_indirect,separators="[^\w]",split=string.split):
"""Returns all the words contained in the fulltext whose url
is contained in the document pointed to in phrase.
Please note the double indirection. url_indirect
returns a document that has to be parsed to get the actual
urls."""
url_direct = None
fulltext_urls = None
# check for direct link in url
url_indirect_ext = lower(split(url_indirect,".")[-1])
if url_indirect_ext in conv_programs.keys():
fulltext_urls = [(url_indirect_ext,url_indirect)]
# Indirect url. Try to fetch the real fulltext(s)
if not fulltext_urls:
# read "setlink" data
try:
htmlpagebody = urllib.urlopen(url_indirect).read()
except:
sys.stderr.write("Error: Cannot read %s.\n" % url_indirect)
return []
fulltext_urls = get_fulltext_urls_from_html_page(htmlpagebody)
words = {}
table = separator_table(separators)
# process as many urls as they were found:
for (ext,url_direct) in fulltext_urls:
if opts_dict["debug"]:
print "Debug: processing %s from %s" % (ext,url_direct)
# sanity check:
if not url_direct:
break;
# read fulltext file:
try:
url = urllib.urlopen(url_direct[1:-1])
except:
sys.stderr.write("Error: Cannot read %s.\n" % url_direct[1:-1])
break # try other fulltext files...
tmp_name = os.tempnam()
tmp_fd = open(tmp_name, "w")
data_chunk = url.read(8*1024)
while data_chunk:
tmp_fd.write(data_chunk)
data_chunk = url.read(8*1024)
tmp_fd.close()
# try all available conversion programs according to their order:
bingo = 0
for conv_program in conv_programs[ext]:
if os.path.exists(conv_program):
# intelligence on how to run various conversion programs:
cmd = "" # wil keep command to run
bingo = 0 # had we success?
if os.path.basename(conv_program) == "pdftotext":
cmd = "%s %s %s.txt" % (conv_program, tmp_name, tmp_name)
elif os.path.basename(conv_program) == "pstotext":
if ext == "ps.gz":
# is there gzip available?
if os.path.exists(conv_programs_helpers["gz"]):
cmd = "%s -cd %s | %s > %s.txt" \
% (conv_programs_helpers["gz"], tmp_name, conv_program, tmp_name)
else:
cmd = "%s %s > %s.txt" \
% (conv_program, tmp_name, tmp_name)
elif os.path.basename(conv_program) == "ps2ascii":
if ext == "ps.gz":
# is there gzip available?
if os.path.exists(conv_programs_helpers["gz"]):
cmd = "%s -cd %s | %s > %s.txt"\
% (conv_programs_helpers["gz"], tmp_name,
conv_program, tmp_name)
else:
cmd = "%s %s %s.txt" \
% (conv_program, tmp_name, tmp_name)
elif os.path.basename(conv_program) == "antiword":
cmd = "%s %s > %s.txt" % (conv_program, tmp_name, tmp_name)
elif os.path.basename(conv_program) == "catdoc":
cmd = "%s %s > %s.txt" % (conv_program, tmp_name, tmp_name)
elif os.path.basename(conv_program) == "wvText":
cmd = "%s %s %s.txt" % (conv_program, tmp_name, tmp_name)
elif os.path.basename(conv_program) == "ppthtml":
# is there html2text available?
if os.path.exists(conv_programs_helpers["html"]):
cmd = "%s %s | %s > %s.txt"\
% (conv_program, tmp_name,
conv_programs_helpers["html"], tmp_name)
else:
cmd = "%s %s > %s.txt" \
% (conv_program, tmp_name, tmp_name)
elif os.path.basename(conv_program) == "xlhtml":
# is there html2text available?
if os.path.exists(conv_programs_helpers["html"]):
cmd = "%s %s | %s > %s.txt" % \
(conv_program, tmp_name,
conv_programs_helpers["html"], tmp_name)
else:
cmd = "%s %s > %s.txt" % \
(conv_program, tmp_name, tmp_name)
else:
sys.stderr.write("Error: Do not know how to handle %s conversion program.\n" % conv_program)
# try to run it:
try:
if opts_dict["debug"]:
print "Debug: launching %s" % cmd
errcode = os.system(cmd)
if errcode == 0 and os.path.exists("%s.txt" % tmp_name):
bingo = 1
break # bingo!
else:
sys.stderr.write("Error: while running %s for %s.\n" % (cmd, url_direct))
except:
if opts_dict["debug"]:
sys.stderr.write("Debug: Error running %s for %s.\n" % (cmd, url_direct))
# were we successful?
if bingo:
tmp_name_txt_file = open("%s.txt" % tmp_name)
for phrase in tmp_name_txt_file.xreadlines():
phrase = asciify_accented_letters(phrase)
phrase = translate(phrase, table)
for word in split(phrase):
if not words.has_key(word):
words[word] = 1;
tmp_name_txt_file.close()
else:
if opts_dict["debug"]:
sys.stderr.write("Debug: No conversion success for %s.\n" % (url_direct))
# delete temp files (they might not exist):
try:
os.unlink(tmp_name)
os.unlink(tmp_name + ".txt")
except StandardError:
Log("Could not delete file. It didn't exist","Error")
# print interesting info:
if opts_dict["debug"]:
print "Debug: words table size is %d" % len(words)
return words.keys()
# tagToFunctions mapping. It offers an indirection level necessary for
# indexing fulltext. The default is get_words_from_phrase
tagToWordsFunctions = {'8564_u':get_words_from_fulltext}
def get_words_from_phrase(phrase,
chars_punctuation=r"[\.\,\:\;\?\!\"]",
chars_alphanumericseparators=r"[^\w]",
split=string.split):
"Returns list of words from phrase 'phrase'."
words = {}
# 1st split phrase into blocks according to whitespace
for block in split(asciify_accented_letters(phrase)):
# 2nd remove leading/trailing punctuation and add block:
block = re.sub(r"^"+chars_punctuation+"+", "", block)
block = re.sub(chars_punctuation+"+$", "", block)
if block:
words[block] = 1
# 3rd break each block into subblocks according to punctuation and add subblocks:
for subblock in re.split(chars_punctuation, block):
if subblock:
words[subblock] = 1
# 4th break each subblock into alphanumeric groups and add groups:
for alphanumeric_group in re.split(chars_alphanumericseparators, subblock):
if alphanumeric_group:
words[alphanumeric_group] = 1
return words.keys()
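## Illustrative example (sketch): get_words_from_phrase("O'Neil and pi- in real.business")
## returns, in no particular order, the words "O'Neil", "O", "Neil", "and",
## "pi-", "pi", "in", "real.business", "real" and "business", i.e. whole blocks
## plus their punctuation-split and alphanumeric subparts.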
def remove_subfields(s):
"Removes subfields from string, e.g. 'foo $$c bar' becomes 'foo bar'."
return re_subfields.sub(' ', s)
def asciify_accented_letters(s):
"Translates ISO-8859-1 accented letters into their ASCII equivalents."
return translate(s, table_latin1_to_ascii)
def get_field_tags(field):
"""Returns a list of MARC tags for the field code 'field'.
Returns empty list in case of error.
Example: field='author', output=['100__%','700__%']."""
out = []
query = """SELECT t.value FROM tag AS t, field_tag AS ft, field AS f
WHERE f.code='%s' AND ft.id_field=f.id AND t.id=ft.id_tag
ORDER BY ft.score DESC""" % field
res = run_sql(query)
for row in res:
out.append(row[0])
return out
def get_wordsindex_id(wordsindexname):
"""Returns the words index id for 'wordsindexname' name.
Returns empty string in case there is no words table for this index.
Example: field='author', output=4."""
out = 0
query = """SELECT w.id FROM wordsindex AS w
WHERE w.name='%s' LIMIT 1""" % wordsindexname
res = run_sql(query, None, 1)
if res:
out = res[0][0]
return out
def get_wordsindex_tags(wordsindexname):
"""Returns the list of tags that are indexed inside 'wordsindexname' words
index.
Returns empty list in case there are no tags indexed in this index.
Note: uses get_field_tags() defined before.
Example: field='author', output=['100__%', '700__%']."""
out = []
query = """SELECT f.code FROM wordsindex AS w, wordsindex_field AS wf,
field AS f WHERE w.name='%s' AND w.id=wf.id_wordsindex
AND f.id=wf.id_field""" % wordsindexname
res = run_sql(query)
for row in res:
out.extend(get_field_tags(row[0]))
return out
def get_all_wordsindexes():
"""Returns the list of the names of all defined words indexes.
Returns empty list in case there are no tags indexed in this index.
Example: output=['global', 'author']."""
out = []
query = """SELECT name FROM wordsindex"""
res = run_sql(query)
for row in res:
out.append(row[0])
return out
def usage(code, msg=''):
"Prints usage for this module."
if msg:
sys.stderr.write("Error: %s.\n" % msg)
<protect>
print >> sys.stderr, \
""" Usage: %s [options]
Examples:
%s -a -i 234-250,293,300-500 -u admin@cdsware
%s -a -w author,fulltext -M 8192 -d
%s -x -m +4d -A on --flush=10000
Command options:
-a, --add add or update selected records
-x, --del delete selected records
-c, --check check consistency for all records in the table(s)
-r, --repair try to repair all records in the table(s)
-i, --id=low[-high] select according to doc recID.
-m, --modified=from[,to] select according to modification date
-w, --wordsindex=w1[,w2] words indexes to consider (all)
Specific options:
-M, --maxmem=XXX maximum memory usage in kB (no limit)
-f, --flush=NNN full consistent table flush after NNN records (5000)
Scheduler options:
-u, --user=USER user name to store task, password needed
-p, --sleeptime=SLEEP time after which to repeat tasks (no)
e.g.: 1s, 30m, 24h, 7d
-t, --time=DATE moment for the task to be active (now)
e.g.: +15s, 5m, 3h , 2002-10-27 13:57:26
General options:
-h, --help print this help
-v, --version print version
-d, --debug log debugging information
-V, --verbose log extra (verbose) debugging information
""" % ((sys.argv[0],) * 4)
</protect>
sys.exit(code)
def getpass(prompt = "Password: "):
"""Prompts for a password without echoing it back to the screen"""
import termios, sys
fd = sys.stdin.fileno()
old = termios.tcgetattr(fd)
new = termios.tcgetattr(fd)
new[3] = new[3] & ~termios.ECHO # lflags
passwd = ""
try:
termios.tcsetattr(fd, termios.TCSADRAIN, new)
passwd = raw_input(prompt)
print
finally:
termios.tcsetattr(fd, termios.TCSADRAIN, old)
return passwd
def authenticate(user):
"""Authenticates a user against the user database.
NOTE: Access might be more complex in the future"""
print "BibWords Task Submission"
print "========================"
if user == "":
print >> sys.stdout, "\rUsername: ",
user = string.strip(string.lower(sys.stdin.readline()))
else:
print >> sys.stdout, "\rUsername:", user
query = "select password from user where email='%s'" % MySQLdb.escape_string(user)
res = run_sql(query, None, 1)
if res:
row = res[0]
password_db = row[0]
if password_db != '':
# authentication needed
password_entered = getpass()
if password_db == password_entered:
return user
else:
print "Sorry, you seem to be unauthorized user. Exiting."
sys.exit(1)
else:
# no authentication needed
return user
else:
print "Sorry, %s seems to be unauthorized user. Exiting." % user
sys.exit(1)
def split_ranges(parse_string):
recIDs = []
ranges = string.split(parse_string, ",")
for range in ranges:
tmp_recIDs = string.split(range, "-")
if len(tmp_recIDs)==1:
recIDs.append([int(tmp_recIDs[0]), int(tmp_recIDs[0])])
else:
if int(tmp_recIDs[0]) > int(tmp_recIDs[1]): # sanity check
tmp = tmp_recIDs[0]
tmp_recIDs[0] = tmp_recIDs[1]
tmp_recIDs[1] = tmp
recIDs.append([int(tmp_recIDs[0]), int(tmp_recIDs[1])])
return recIDs
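## For example, split_ranges("234-250,293,300-500") returns
## [[234, 250], [293, 293], [300, 500]].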
def get_word_tables(tables):
wordTables = []
if tables:
wordsindexes = string.split(tables, ",")
for wordsindex in wordsindexes:
wordsindex_id = get_wordsindex_id(wordsindex)
if wordsindex_id:
wordTables.append({"bibwords%d" % wordsindex_id: \
get_wordsindex_tags(wordsindex)})
else:
Log("There is no %s words table." % wordsindex, "Error")
else:
for wordsindex in get_all_wordsindexes():
wordsindex_id = get_wordsindex_id(wordsindex)
wordTables.append({"bibwords%d" % wordsindex_id: \
get_wordsindex_tags(wordsindex)})
return wordTables
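## Illustrative example (sketch): assuming a words index named 'author' with
## id 4 whose fields map to the tags ['100__%', '700__%'] (the sample values
## used in the docstrings above), get_word_tables("author") would return
## [{"bibwords4": ['100__%', '700__%']}].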
def get_date_range(var):
"Returns the two dates contained as a low,high tuple"
limits = string.split(var, ",")
if len(limits)==1:
low = get_date(limits[0])
return low,None
if len(limits)==2:
low = get_date(limits[0])
high = get_date(limits[1])
return low,high
def get_date(var, format_string = "%Y-%m-%d %H:%M:%S"):
"""Returns a date string according to the format string.
It can handle normal date strings and shifts with respect
to now."""
date = time.time()
shift_re=re.compile("([-\+]{0,1})([\d]+)([dhms])")
factors = {"d":24*3600, "h":3600, "m":60, "s":1}
m = shift_re.match(var)
if m:
sign = m.groups()[0] == "-" and -1 or 1
factor = factors[m.groups()[2]]
value = float(m.groups()[1])
date = time.localtime(date + sign * factor * value)
date = time.strftime(format_string, date)
else:
date = time.strptime(var, format_string)
date = time.strftime(format_string, date)
return date
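## For example, get_date("+4d") returns a "%Y-%m-%d %H:%M:%S" string four days
## from now, get_date("-30m") one of thirty minutes ago, while a full date such
## as get_date("2002-10-27 13:57:26") is validated and returned unchanged.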
def create_range_list(res):
"""Creates a range list from a recID select query result contained
in res. The result is expected to have ascending numerical order."""
if not res:
return []
row = res[0]
if not row:
return []
else:
range_list = [[row[0],row[0]]]
for row in res[1:]:
id = row[0]
if id == range_list[-1][1] + 1:
range_list[-1][1] = id
else:
range_list.append([id,id])
return range_list
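## For example, a result set with the recIDs 1, 2, 3 and 7 yields
## create_range_list(((1,), (2,), (3,), (7,))) == [[1, 3], [7, 7]].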
def beautify_range_list(range_list):
"""Returns a non overlapping, maximal range list"""
ret_list = []
for new in range_list:
found = 0
for old in ret_list:
if new[0] <= old[0] <= new[1] + 1 or new[0] - 1 <= old[1] <= new[1]:
old[0] = min(old[0], new[0])
old[1] = max(old[1], new[1])
found = 1
break
if not found:
ret_list.append(new)
return ret_list
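## For example, beautify_range_list([[1, 5], [3, 10], [20, 30]]) merges the
## overlapping ranges and returns [[1, 10], [20, 30]].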
def serialize_via_numeric_array_dumps(arr):
return Numeric.dumps(arr)
def serialize_via_numeric_array_compr(str):
return compress(str)
def serialize_via_numeric_array_escape(str):
return MySQLdb.escape_string(str)
def serialize_via_numeric_array(arr):
"""Serialize Numeric array into a compressed string."""
#return MySQLdb.escape_string(compress(Numeric.dumps(arr)))
return serialize_via_numeric_array_escape(serialize_via_numeric_array_compr(serialize_via_numeric_array_dumps(arr)))
def deserialize_via_numeric_array(string):
"""Decompress and deserialize string into a Numeric array."""
return Numeric.loads(decompress(string))
def serialize_via_marshal(obj):
"""Serialize Python object via marshal into a compressed string."""
return MySQLdb.escape_string(compress(dumps(obj)))
def deserialize_via_marshal(string):
"""Decompress and deserialize string into a Python object via marshal."""
return loads(decompress(string))
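## Note on the serialization helpers above: serialize_via_numeric_array() and
## serialize_via_marshal() escape the compressed value for direct inclusion in
## an SQL statement; the value later read back from MySQL is the unescaped
## compressed string, which is what the deserialize_* counterparts expect.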
class WordTable:
"A class to hold the words table."
def __init__(self, tablename, fields_to_index, separators="[^\s]"):
"Creates words table instance."
self.tablename = tablename
self.recIDs_in_mem = []
self.fields_to_index = fields_to_index
self.separators = separators
self.value = {}
def get_field(self, recID, tag):
"""Returns list of values of the MARC-21 'tag' fields for the
record 'recID'."""
out = []
bibXXx = "bib" + tag[0] + tag[1] + "x"
bibrec_bibXXx = "bibrec_" + bibXXx
query = """SELECT value FROM %s AS b, %s AS bb
WHERE bb.id_bibrec=%s AND bb.id_bibxxx=b.id
AND tag LIKE '%s'""" % (bibXXx, bibrec_bibXXx, recID, tag);
res = run_sql(query)
for row in res:
out.append(row[0])
return out
def clean(self):
"Cleans the words table."
self.value={}
def put_into_db(self, mode="normal", split=string.split):
"""Updates the current words table in the corresponding MySQL's
bibwordsX table. Mode 'normal' means normal execution,
mode 'emergency' means words index reverting to old state.
"""
Log("%s %s wordtable flush started" % (self.tablename,mode) )
Log('...updating %d words into %s started' % \
(len(self.value), self.tablename), "Verbose")
self.recIDs_in_mem = beautify_range_list(self.recIDs_in_mem)
if mode == "normal":
for group in self.recIDs_in_mem:
query = """UPDATE %sr SET type='TEMPORARY' WHERE id_bibrec
BETWEEN '%d' AND '%d' AND type='CURRENT'""" % \
(self.tablename, group[0], group[1])
Log(query, "Debug")
run_sql(query)
nb_words_total = len(self.value)
nb_words_report = int(nb_words_total/10)
nb_words_done = 0
for word in self.value.keys():
self.put_word_into_db(word)
nb_words_done += 1
if nb_words_report!=0 and ((nb_words_done % nb_words_report) == 0):
Log('......updated %d/%d words' % (nb_words_done, nb_words_total))
Log('...updating %d words into %s ended' % \
(nb_words_total, self.tablename))
Log('...updating reverse table %sr started' % self.tablename)
if mode == "normal":
for group in self.recIDs_in_mem:
query = """UPDATE %sr SET type='CURRENT' WHERE id_bibrec
BETWEEN '%d' AND '%d' AND type='FUTURE'""" % \
(self.tablename, group[0], group[1])
Log(query, "Debug")
run_sql(query)
query = """DELETE FROM %sr WHERE id_bibrec
BETWEEN '%d' AND '%d' AND type='TEMPORARY'""" % \
(self.tablename, group[0], group[1])
Log(query, "Debug")
run_sql(query)
Log('End of updating wordTable into %s' % self.tablename, "Debug")
elif mode == "emergency":
for group in self.recIDs_in_mem:
query = """UPDATE %sr SET type='CURRENT' WHERE id_bibrec
BETWEEN '%d' AND '%d' AND type='TEMPORARY'""" % \
(self.tablename, group[0], group[1])
Log(query, "Debug")
run_sql(query)
query = """DELETE FROM %sr WHERE id_bibrec
BETWEEN '%d' AND '%d' AND type='FUTURE'""" % \
(self.tablename, group[0], group[1])
Log(query, "Debug")
run_sql(query)
Log('End of emergency flushing wordTable into %s' % self.tablename,
"Debug")
Log('...updating reverse table %sr ended' % self.tablename)
self.clean()
self.recIDs_in_mem = []
Log("%s %s wordtable flush ended" % (self.tablename, mode) )
def load_old_recIDs(self,word):
query = "SELECT hitlist FROM %s WHERE word='%s'" % \
(self.tablename, MySQLdb.escape_string(word))
res = run_sql(query)
if res:
return deserialize_via_numeric_array(res[0][0])
else:
return None
def merge_with_old_recIDs(self,word,set):
"""Merge the system numbers stored in memory (hash of recIDs with value +1 or -1
according to where to add/delete them) with those stored in the database index
and received in set universe of recIDs for the given word. """
for recID,sign in self.value[word].items():
if sign == -1:
set[recID] = 0
else:
set[recID] = 1
def put_word_into_db(self, word, split=string.split):
"""Flush a single word to the database and delete it from memory"""
set = self.load_old_recIDs(word)
if set: # merge the word recIDs found in memory:
self.merge_with_old_recIDs(word,set)
query = "UPDATE %s SET hitlist='%s' WHERE word='%s'" % \
(self.tablename, serialize_via_numeric_array(set),
MySQLdb.escape_string(word))
else: # the word is new, will create new set:
set = Numeric.zeros(cfg_max_recID+1, Numeric.Int0)
Numeric.put(set, self.value[word].keys(), 1)
query = "INSERT INTO %s (word, hitlist) VALUES ('%s', '%s')" %\
(self.tablename, MySQLdb.escape_string(word),
serialize_via_numeric_array(set))
if not set: # never store empty words
query = "DELETE from %s WHERE word='%s'" % \
(self.tablename, MySQLdb.escape_string(word))
run_sql(query)
del self.value[word]
def display(self):
"Displays the word table."
keys = self.value.keys()
keys.sort()
for k in keys:
Log("%s: %s" % (k, self.value[k]))
def count(self):
"Returns the number of words in the table."
return len(self.value)
def info(self):
"Prints some information on the words table."
Log("The words table contains %d words." % self.count())
def lookup_words(self, word=""):
"Lookup word from the words table."
if not word:
done = 0
while not done:
try:
word = raw_input("Enter word: ")
done = 1
except (EOFError, KeyboardInterrupt):
return
if self.value.has_key(word):
Log("The word '%s' is found %d times." \
% (word, len(self.value[word])))
else:
Log("The word '%s' does not exist in the word file."\
% word)
def add_recIDs(self, recIDs):
"""Fetches records which id in the recIDs range list and adds
them to the wordTable."""
global chunksize
flush_count = 0
records_done = 0
records_to_go = 0
for range in recIDs:
records_to_go = records_to_go + range[1] - range[0] + 1
time_started = time.time() # will measure profile time
for range in recIDs:
i_low = range[0]
chunksize_count = 0
while i_low <= range[1]:
# calculate chunk group of recIDs and treat it:
i_high = min(i_low+opts_dict["flush"]-flush_count-1,range[1])
i_high = min(i_low+chunksize-chunksize_count-1, i_high)
self.chk_recID_range(i_low, i_high)
Log("%s adding records #%d-#%d started" % \
(self.tablename, i_low, i_high))
self.del_recID_range(i_low, i_high)
just_processed = self.add_recID_range(i_low, i_high)
flush_count = flush_count + i_high - i_low + 1
chunksize_count = chunksize_count + i_high - i_low + 1
records_done = records_done + just_processed
Log("%s adding records #%d-#%d ended " % \
(self.tablename, i_low, i_high))
if chunksize_count >= chunksize:
chunksize_count = 0
# flush if necessary:
if flush_count >= opts_dict["flush"]:
self.put_into_db()
self.clean()
Log("%s backing up" % (self.tablename))
flush_count = 0
self.log_progress(time_started,records_done,records_to_go)
# iterate:
i_low = i_high + 1
if flush_count > 0:
self.put_into_db()
self.log_progress(time_started,records_done,records_to_go)
def add_date(self, date):
# If date is not set, then retrieve it from the database.
# Reindex all formats newer than the modification date
global Log
if not date:
id = self.tablename[len("bibwords"):]
query = """SELECT last_updated FROM wordsindex WHERE id='%s'
""" % id
res = run_sql(query)
if not res:
return
if not res[0][0]:
date = "0000-00-00"
else:
date = res[0][0]
query = """SELECT b.id FROM bibrec AS b WHERE b.modification_date >=
'%s' ORDER BY b.id ASC""" % date
res = run_sql(query)
list = create_range_list(res)
if not list:
Log( "No new records added. %s is up to date" % self.tablename )
else:
self.add_recIDs(list)
def add_recID_range(self, recID1, recID2):
empty_list_string = serialize_via_marshal([])
wlist = {}
self.recIDs_in_mem.append([recID1,recID2])
# secondly fetch all needed tags:
for tag in self.fields_to_index:
if tag in tagToWordsFunctions.keys():
get_words_function = tagToWordsFunctions[ tag ]
else: get_words_function = get_words_from_phrase
bibXXx = "bib" + tag[0] + tag[1] + "x"
bibrec_bibXXx = "bibrec_" + bibXXx
query = """SELECT bb.id_bibrec,b.value FROM %s AS b, %s AS bb
WHERE bb.id_bibrec BETWEEN %d AND %d
AND bb.id_bibxxx=b.id AND tag LIKE '%s'""" % (bibXXx, bibrec_bibXXx, recID1, recID2, tag)
res = run_sql(query)
nb_total_to_read = len(res)
verbose_idx = 0 # for verbose pretty printing
for row in res:
recID,phrase = row
if not wlist.has_key(recID): wlist[recID] = []
new_words = get_words_function(phrase) # ,self.separators
wlist[recID] = list_union(new_words,wlist[recID])
if len(wlist) == 0: return 0
recIDs = wlist.keys()
# Using cStringIO for speed.
query_factory = cStringIO.StringIO()
qwrite = query_factory.write
qwrite( "INSERT INTO %sr (id_bibrec,wordlist,type) VALUES" % self.tablename)
qwrite( "('" )
qwrite( str(recIDs[0]) )
qwrite( "','" )
qwrite( serialize_via_marshal(wlist[recIDs[0]]) )
qwrite( "','FUTURE')" )
for recID in recIDs[1:]:
qwrite(",('")
qwrite(str(recID))
qwrite("','")
qwrite(serialize_via_marshal(wlist[recID]))
qwrite("','FUTURE')")
query = query_factory.getvalue()
query_factory.close()
run_sql(query)
query_factory = cStringIO.StringIO()
qwrite = query_factory.write
qwrite("INSERT INTO %sr (id_bibrec,wordlist,type) VALUES" % self.tablename)
qwrite("('")
qwrite(str(recIDs[0]))
qwrite("','")
qwrite(serialize_via_marshal(wlist[recIDs[0]]))
qwrite("','CURRENT')")
for recID in recIDs[1:]:
qwrite( ",('" )
qwrite( str(recID) )
qwrite( "','" )
qwrite( empty_list_string )
qwrite( "','CURRENT')" )
query = query_factory.getvalue()
query_factory.close()
try:
run_sql(query)
except MySQLdb.DatabaseError:
# ok, we tried to add an existent record. No problem
pass
put = self.put
for recID in recIDs:
for w in wlist[recID]:
put(recID, w, 1)
return len(recIDs)
def log_progress(self, start, done, todo):
"""Calculate progress and store it.
start: start time,
done: records processed,
todo: total number of records"""
time_elapsed = time.time() - start
# consistency check
if time_elapsed == 0 or done > todo:
return
time_recs_per_min = done/(time_elapsed/60.0)
Log("%d records took %.1f seconds to complete.(%1.f recs/min)"\
% (done, time_elapsed, time_recs_per_min))
if time_recs_per_min:
Log("Estimated runtime: %.1f minutes" % \
((todo-done)/time_recs_per_min))
def put(self, recID, word, sign):
"Adds/deletes a word to the word list."
try:
word = lower(word[:50])
if self.value.has_key(word):
# the word 'word' exist already: update sign
self.value[word][recID] = sign
else:
self.value[word] = {recID: sign}
except:
Log("Cannot put word %s with sign %d for recID %s." % (word, sign, recID))
def del_recIDs(self, recIDs):
"""Fetches records which id in the recIDs range list and adds
them to the wordTable."""
count = 0
for range in recIDs:
self.del_recID_range(range[0],range[1])
count = count + range[1] - range[0]
self.put_into_db()
def del_recID_range(self, low, high):
"""Deletes records with 'recID' system number between low
and high from memory words index table."""
self.recIDs_in_mem.append([low,high])
query = """SELECT id_bibrec,wordlist FROM %sr as bb WHERE bb.id_bibrec
BETWEEN '%d' AND '%d'""" % (self.tablename, low, high)
recID_rows = run_sql(query)
for recID_row in recID_rows:
recID = recID_row[0]
wlist = deserialize_via_marshal(recID_row[1])
for word in wlist:
self.put(recID, word, -1)
def report_on_table_consistency(self):
"""Check reverse words index tables (e.g. bibwords1r) for
interesting states such as 'TEMPORARY' state.
Prints small report (no of words, no of bad words).
"""
# find number of words:
query = """SELECT COUNT(*) FROM %s""" % (self.tablename)
res = run_sql(query, None, 1)
if res:
nb_words = res[0][0]
else:
nb_words = 0
# find number of records:
query = """SELECT COUNT(DISTINCT(id_bibrec)) FROM %sr""" % (self.tablename)
res = run_sql(query, None, 1)
if res:
nb_records = res[0][0]
else:
nb_records = 0
# report stats:
Log("%s contains %d words from %d records" % (self.tablename, nb_words, nb_records))
# find possible bad states in reverse tables:
query = """SELECT COUNT(DISTINCT(id_bibrec)) FROM %sr WHERE type <> 'CURRENT'""" % (self.tablename)
res = run_sql(query)
if res:
nb_bad_records = res[0][0]
else:
nb_bad_records = 999999999
if nb_bad_records:
Log("%s needs to repair %d of %d records" % \
(self.tablename, nb_bad_records, nb_records), "EMERGENCY")
else:
Log("%s is in consistent state" % (self.tablename))
return nb_bad_records
def repair(self):
"""Repair the whole table"""
# find possible bad states in reverse tables:
query = """SELECT COUNT(DISTINCT(id_bibrec)) FROM %sr WHERE type <> 'CURRENT'""" % (self.tablename)
res = run_sql(query, None, 1)
if res:
nb_bad_records = res[0][0]
else:
nb_bad_records = 0
# find number of records:
query = """SELECT COUNT(DISTINCT(id_bibrec)) FROM %sr""" % (self.tablename)
res = run_sql(query)
if res:
nb_records = res[0][0]
else:
nb_records = 0
if nb_bad_records == 0:
return
query = """SELECT id_bibrec FROM %sr WHERE type <> 'CURRENT' ORDER BY id_bibrec""" \
% (self.tablename)
res = run_sql(query)
recIDs = create_range_list(res)
flush_count = 0
records_done = 0
records_to_go = 0
for range in recIDs:
records_to_go = records_to_go + range[1] - range[0] + 1
time_started = time.time() # will measure profile time
for range in recIDs:
i_low = range[0]
chunksize_count = 0
while i_low <= range[1]:
# calculate chunk group of recIDs and treat it:
i_high = min(i_low+opts_dict["flush"]-flush_count-1,range[1])
i_high = min(i_low+chunksize-chunksize_count-1, i_high)
self.fix_recID_range(i_low, i_high)
flush_count = flush_count + i_high - i_low + 1
chunksize_count = chunksize_count + i_high - i_low + 1
records_done = records_done + i_high - i_low + 1
if chunksize_count >= chunksize:
chunksize_count = 0
# flush if necessary:
if flush_count >= opts_dict["flush"]:
self.put_into_db("emergency")
self.clean()
flush_count = 0
self.log_progress(time_started,records_done,records_to_go)
# iterate:
i_low = i_high + 1
if flush_count > 0:
self.put_into_db("emergency")
self.log_progress(time_started,records_done,records_to_go)
Log("%s inconsistencies repaired." % self.tablename, "EMERGENCY")
def chk_recID_range(self, low, high):
"""Check if the reverse wordsindex table is in proper state"""
## check db
query = """SELECT COUNT(*) FROM %sr WHERE type <> 'CURRENT'
AND id_bibrec BETWEEN '%d' AND '%d'""" % (self.tablename, low, high)
res = run_sql(query, None, 1)
if res[0][0]==0:
Log("%s for %d-%d is in consistent state"%(self.tablename,low,high))
return # okay, words table is consistent
## inconsistency detected!
Log("%s inconsistencies detected..." % self.tablename, "EMERGENCY")
Log("""Errors found. You should check consistency of the %s - %sr tables.\nRunning 'bibwords --repair' is recommended.""" \
% (self.tablename, self.tablename),
"EMERGENCY!")
raise StandardError
def fix_recID_range(self, low, high):
"""Try to fix reverse wordsindex database consistency (table bibwords1r) in the low,high doc-id range.
Possible states for a recID follow:
CUR TMP FUT: very bad things have happened: warn!
CUR TMP : very bad things have happened: warn!
CUR FUT: delete FUT (crash before flushing)
CUR : database is ok
TMP FUT: add TMP to memory and del FUT from memory
flush (revert to old state)
TMP : very bad things have happened: warn!
FUT: very bad things have happened: warn!
"""
state = {}
query = "SELECT id_bibrec,type FROM %sr WHERE id_bibrec BETWEEN '%d' AND '%d'"\
% (self.tablename, low, high)
res = run_sql(query)
for row in res:
if not state.has_key(row[0]):
state[row[0]]=[]
state[row[0]].append(row[1])
ok = 1 # will hold info on whether we will be able to repair
for recID in state.keys():
if not 'TEMPORARY' in state[recID]:
if 'FUTURE' in state[recID]:
if 'CURRENT' not in state[recID]:
Log("Record %d is in inconsistent state. Can't repair it",
"EMERGENCY")
ok = 0
else:
Log("Inconsistency in record %d detected" % recID,
"EMERGENCY")
query = """DELETE FROM %sr
WHERE id_bibrec='%d'""" % (self.tablename, recID)
run_sql(query)
Log("Inconsistency in record %d repaired." % recID)
else:
if 'FUTURE' in state[recID] and not 'CURRENT' in state[recID]:
self.recIDs_in_mem.append([recID,recID])
# Get the words file
query = """SELECT type,wordlist FROM %sr
WHERE id_bibrec='%d'""" % (self.tablename, recID)
Log(query, "Debug")
res = run_sql(query)
for row in res:
wlist = deserialize_via_marshal(row[1])
Log("Words are %s " % wlist, "Debug")
if row[0] == 'TEMPORARY':
sign = 1
else:
sign = -1
for word in wlist:
self.put(recID, word, sign)
else:
Log("%s for %d is in inconsistent state. Couldn't repair it." % (self.tablename, recID),
"EMERGENCY")
ok = 0
if not ok:
Log("""Unrepairable errors found. You should check consistency
of the %s - %sr tables. Deleting affected records is
recommended.""" % (self.tablename, self.tablename),
"EMERGENCY")
raise StandardError
def task_run(row):
"""Run the indexing task. The row argument is the BibSched task
queue row, containing id, arguments, etc.
"""
global opts_dict, taskid, wordTables
taskid = row[0]
opts_dict = loads(row[5])
# sanity check:
if row[1] != "bibwords":
Log("The task #%d does not seem to be a BibWords task." % taskid, "Error")
return
if row[6] != "WAITING":
Log("The task #%d is %s. I expected WAITING." % (taskid, row[6]), "Error")
return
task_update_state("RUNNING")
# install signal handlers
signal.signal(signal.SIGUSR1, task_sig_sleep)
signal.signal(signal.SIGTERM, task_sig_stop)
signal.signal(signal.SIGABRT, task_sig_suicide)
signal.signal(signal.SIGCONT, task_sig_wakeup)
signal.signal(signal.SIGINT, task_sig_unknown)
## go ahead and treat each table :
for table in opts_dict["wordsindex"]:
wordTable = WordTable(table.keys()[0], table.values()[0])
wordTable.report_on_table_consistency()
try:
if opts_dict["cmd"] == "del":
if opts_dict["id"]:
wordTable.del_recIDs(opts_dict["id"])
else:
wordTable.del_date(opts_dict["modified"])
elif opts_dict["cmd"] == "add":
if opts_dict["id"]:
wordTable.add_recIDs(opts_dict["id"])
else:
wordTable.add_date(opts_dict["modified"])
elif opts_dict["cmd"] == "repair":
wordTable.repair()
else:
Log("Invalid command found processing %s" % \
wordTable.tablename, "Error")
raise StandardError
id = wordTable.tablename[len("bibwords"):]
query = """UPDATE wordsindex SET last_updated=NOW()
WHERE id='%s'""" % id
Log(query, "Debug")
run_sql(query)
except StandardError, e:
Log("Exception caught: %s" % e, "Error")
traceback.print_tb(sys.exc_info()[2])
task_update_state("ERROR")
task_sig_stop_commands()
sys.exit(1)
wordTable.report_on_table_consistency()
# We are done. State it in the database, close and quit
task_update_state("DONE")
Log("Done. Finished processing.")
sys.exit(0)
def command_line():
global opts_dict
long_flags =["add","del","check","repair","id=","modified=","wordsindex=",
"maxmem=", "flush=","user=","sleeptime=",
"time=","help", "version", "debug", "verbose"]
short_flags ="axcri:m:w:M:f:u:p:t:hvdD"
format_string = "%Y-%m-%d %H:%M:%S"
tables = None
sleeptime = ""
tmp_name = os.tempnam()
try:
opts, args = getopt.getopt(sys.argv[1:], short_flags, long_flags)
except getopt.GetoptError, err:
Log(err, "Error")
usage(1)
if args:
usage(1)
opts_dict={"cmd":"add", "id":[], "modified":[], "maxmem":0,
"flush":5000, "sleeptime":0, "debug":0, "verbose":0 }
sched_time = time.strftime(format_string)
user = ""
# Check for key options
try:
for opt in opts:
if opt == ("-h","") or opt == ("--help",""):
usage(1)
elif opt == ("-v","") or opt == ("--version",""):
print __version__
sys.exit(1)
elif opt == ("-d","") or opt == ("--debug",""):
opts_dict["debug"] = 1
elif opt == ("--verbose","") or opt == ("-V",""):
opts_dict["verbose"] = 1
elif opt == ("-a","") or opt == ("--add",""):
opts_dict["cmd"] = "add"
if ("-x","") in opts or ("--del","") in opts:
usage(1)
elif opt == ("-c","") or opt == ("--check",""):
opts_dict["cmd"] = "check"
elif opt == ("-r","") or opt == ("--repair",""):
opts_dict["cmd"] = "repair"
elif opt == ("-x","") or opt == ("--del",""):
opts_dict["cmd"]="del"
elif opt[0] in [ "-i", "--id" ]:
opts_dict["id"] = opts_dict["id"] + split_ranges(opt[1])
elif opt[0] in [ "-m", "--modified" ]:
opts_dict["modified"] = get_date_range(opt[1])
elif opt[0] in [ "-w", "--wordsindex" ]:
tables = opt[1]
elif opt[0] in [ "-M", "--maxmem"]:
opts_dict["maxmem"]=int(opt[1])
if opts_dict["maxmem"] < base_process_size + 1000:
raise StandardError, "Memory usage should be higher than %d kB" % (base_process_size + 1000)
elif opt[0] in [ "-f", "--flush"]:
opts_dict["flush"]=int(opt[1])
elif opt[0] in [ "-u", "--user"]:
user = opt[1]
elif opt[0] in [ "-p", "--sleeptime" ]:
get_date(opt[1]) # see if it is a valid shift
sleeptime= opt[1]
elif opt[0] in [ "-t", "--time" ]:
sched_time= get_date(opt[1])
else: usage(1)
except StandardError, e:
Log(e, "Error")
sys.exit(1)
opts_dict["wordsindex"]=get_word_tables(tables)
if opts_dict["cmd"] == "check":
for table in opts_dict["wordsindex"]:
wordTable = WordTable(table.keys()[0], table.values()[0])
wordTable.report_on_table_consistency()
return
user = authenticate(user)
query = """INSERT INTO schTASKS(
proc,user,date,sleeptime,arguments,output, error,state,score)
VALUES ('bibwords','%s',CAST('%s' AS DATE),'%s','%s','%s','%s',
'WAITING','%d')""" % \
(MySQLdb.escape_string(user),MySQLdb.escape_string(sched_time), sleeptime, \
MySQLdb.escape_string(dumps(opts_dict)), \
tmp_name + ".out", tmp_name + ".err", 100)
new_task_id = run_sql(query)
if opts_dict["debug"]:
print opts_dict
print "Task #%d was successfully scheduled for execution." % new_task_id
return
def write_message(msg, stream=sys.stdout):
"""Prints message and flush output stream (may be sys.stdout or sys.stderr). Useful for debugging stuff."""
if stream == sys.stdout or stream == sys.stderr:
stream.write(time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime()))
stream.write("%s\n" % msg)
stream.flush()
else:
sys.stderr.write("Unknown stream %s. [must be sys.stdout or sys.stderr]\n" % stream)
def task_sig_sleep(sig, frame):
"""Signal handler for the 'sleep' signal sent by BibSched."""
if opts_dict["debug"]:
write_message("got signal %d" % sig)
write_message("sleeping...")
task_update_state("SLEEPING")
signal.pause() # wait for wake-up signal
def task_sig_wakeup(sig, frame):
"""Signal handler for the 'wakeup' signal sent by BibSched."""
if opts_dict["debug"]:
write_message("got signal %d" % sig)
write_message("continuing...")
task_update_state("CONTINUING")
def task_sig_stop(sig, frame):
"""Signal handler for the 'stop' signal sent by BibSched."""
if opts_dict["debug"]:
write_message("got signal %d" % sig)
write_message("stopping...")
task_update_state("STOPPING")
errcode = 0
try:
task_sig_stop_commands()
write_message("stopped")
task_update_state("STOPPED")
except StandardError, err:
write_message("Error during stopping! %e" % err)
task_update_state("STOPPINGFAILED")
errcode = 1
sys.exit(errcode)
def task_sig_stop_commands():
"""Do all the commands necessary to stop the task before quitting.
Useful for task_sig_stop() handler.
"""
write_message("stopping commands started")
for table in wordTables:
table.put_into_db()
write_message("stopping commands ended")
def task_sig_suicide(sig, frame):
"""Signal handler for the 'suicide' signal sent by BibSched."""
if opts_dict["debug"]:
write_message("got signal %d" % sig)
write_message("suiciding myself now...")
task_update_state("SUICIDING")
write_message("suicided")
task_update_state("SUICIDED")
sys.exit(0)
def task_sig_unknown(sig, frame):
"""Signal handler for the other unknown signals sent by shell or user."""
if opts_dict["debug"]:
write_message("got signal %d" % sig)
write_message("unknown signal %d ignored" % sig) # do nothing for other signals
def task_update_progress(msg):
"""Updates progress information in the BibSched task table."""
query = "UPDATE schTASKS SET progress='%s' where id=%d" % (MySQLdb.escape_string(msg), taskid)
if opts_dict["debug"]:
write_message(query)
run_sql(query)
return
def task_update_state(val):
"""Updates state information in the BibSched task table."""
query = "UPDATE schTASKS SET state='%s' where id=%d" % (MySQLdb.escape_string(val), taskid)
if opts_dict["debug"]:
write_message(query)
run_sql(query)
return
def test_fulltext_indexing():
"""Tests fulltext indexing programs on PDF, PS, DOC, PPT,
XLS. Prints list of words and word table on the screen. Does not
integrate anything into the database. Useful when debugging
problems with fulltext indexing: call this function instead of main().
"""
global opts_dict
opts_dict = {}
opts_dict["debug"] = 1
print get_words_from_fulltext("http://doc.cern.ch/cgi-bin/setlink?base=atlnot&categ=Communication&id=com-indet-2002-012") # protected URL
print get_words_from_fulltext("http://doc.cern.ch/cgi-bin/setlink?base=agenda&categ=a00388&id=a00388s2t7") # XLS
print get_words_from_fulltext("http://doc.cern.ch/cgi-bin/setlink?base=agenda&categ=a02883&id=a02883s1t6/transparencies") # PPT
print get_words_from_fulltext("http://doc.cern.ch/cgi-bin/setlink?base=agenda&categ=a99149&id=a99149s1t10/transparencies") # DOC
print get_words_from_fulltext("http://doc.cern.ch/cgi-bin/setlink?base=preprint&categ=cern&id=lhc-project-report-601") # PDF
sys.exit(0)
def test_word_separators(phrase="hep-th/0101001"):
"""Tests word separating policy on various input."""
print "%s:" % phrase
for word in get_words_from_phrase(phrase):
print "\t-> %s" % word
def test_really_run():
"""Test by really running words indexing not via task scheduler but directly. Useful for debugging/profiling."""
global opts_dict
opts_dict = { "wordsindex": get_word_tables("author"),
"cmd" : "add",
"id" : [[1,1000]],
"debug" : 0,
"verbose" : 0,
"flush" : 50000,
}
for table in opts_dict["wordsindex"]:
wordTable = WordTable(table.keys()[0], table.values()[0])
wordTable.report_on_table_consistency()
try:
if opts_dict["cmd"] == "del":
if opts_dict["id"]:
wordTable.del_recIDs(opts_dict["id"])
else:
wordTable.del_date(opts_dict["modified"])
elif opts_dict["cmd"] == "add":
if opts_dict["id"]:
wordTable.add_recIDs(opts_dict["id"])
else:
wordTable.add_date(opts_dict["modified"])
elif opts_dict["cmd"] == "repair":
wordTable.repair()
else:
Log("Invalid command found processing %s" % \
wordTable.tablename, "Error")
raise StandardError
id = wordTable.tablename[len("bibwords"):]
query = """UPDATE wordsindex SET last_updated=NOW()
WHERE id='%s'""" % id
Log(query, "Debug")
run_sql(query)
except StandardError, e:
Log("Exception caught: %s" % e, "Error")
traceback.print_tb(sys.exc_info()[2])
task_update_state("ERROR")
task_sig_stop_commands()
sys.exit(1)
wordTable.report_on_table_consistency()
def main():
"""Reads arguments and either runs the task, or starts user-interface (command line)."""
if len(sys.argv) == 2:
try:
id = int(sys.argv[1])
except StandardError, err:
command_line()
sys.exit()
res = run_sql("SELECT * FROM schTASKS WHERE id='%d'" % (id), None, 1)
if not res:
Log("Selected task not found.", "Error")
sys.exit(1)
task_run(res[0])
else:
command_line()
if __name__ == "__main__":
#print get_fieldvalues(600336,"700__a")
#print get_all_wordsindexes()
#test_fulltext_indexing()
#test_word_separators()
#test_word_separators("?C++.NET")
#test_word_separators("Ellis, J")
#test_word_separators("O'Neil and pi- in real.business")
#print get_words_from_phrase("O'Neil and pi- in real.business", "[\s\*\+\=\!\@\#\$\%\^\&\*\(\)\|\-\_\\\'\"\<\>\,\.\/\?\[\]\{\}\<\|\`\~]")
#test_really_run()
main()