diff --git a/modules/webalert/lib/Makefile.am b/modules/webalert/lib/Makefile.am index 1cdbea3cb..7f43e3fa6 100644 --- a/modules/webalert/lib/Makefile.am +++ b/modules/webalert/lib/Makefile.am @@ -1,28 +1,28 @@ ## $Id$ ## This file is part of the CERN Document Server Software (CDSware). ## Copyright (C) 2002 CERN. ## ## The CDSware is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## The CDSware is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDSware; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. pylibdir=$(libdir)/python/cdsware -pylib_DATA=webalert.py alert_engine.py htmlparser.py +pylib_DATA=webalert.py alert_engine.py htmlparser.py textwrap.py EXTRA_DIST = $(wildcard *.wml) CLEANFILES = $(pylib_DATA) *~ *.tmp *.pyc %.py: %.py.wml ../../../config/config.wml ../../../config/configbis.wml $(WML) -o $@ $< \ No newline at end of file diff --git a/modules/webalert/lib/alert_engine.py b/modules/webalert/lib/alert_engine.py index f9dc07837..9d37f1521 100644 --- a/modules/webalert/lib/alert_engine.py +++ b/modules/webalert/lib/alert_engine.py @@ -1,430 +1,432 @@ ## $Id$ ## Alert engine implementation. ## This file is part of the CERN Document Server Software (CDSware). ## Copyright (C) 2002 CERN. ## ## The CDSware is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. 
## ## The CDSware is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDSware; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. ##read config variables #include "config.wml" #include "configbis.wml" ## $Id$ ## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. """Alert engine implementation.""" ## rest of the Python code goes below __version__ = "$Id$" try: from cgi import parse_qs from sre import search, sub from time import localtime, strftime, mktime, sleep import smtplib from config import * from search_engine import perform_request_search from dbquery import run_sql from htmlparser import * + from string import split except ImportError, e: print "Error: %s" % e import sys sys.exit(1) MAXIDS = 50 FROMADDR = 'CDS Alert Engine <%s>' % alertengineemail ALERTURL = weburl + '/youralerts.py/list' DEVELOPERADDR = ['erik.simon@cern.ch', 'tibor.simko@cern.ch'] # Debug levels: # 0 = production, nothing on the console, email sent # 1 = messages on the console, email sent # 2 = messages on the console, but no email sent # 3 = many messages on the console, no email sent # 4 = many messages on the console, email sent to DEVELOPERADDR DEBUGLEVEL = 4 def update_date_lastrun(alert): return run_sql('update user_query_basket set date_lastrun=%s where id_user=%s and id_query=%s and id_basket=%s;', (strftime("%Y-%m-%d"), alert[0], alert[1], alert[2],)) def get_alert_queries(frequency): return run_sql('select distinct id, urlargs from query q, user_query_basket uqb where q.id=uqb.id_query and uqb.frequency=%s and uqb.date_lastrun <= now();', (frequency,)) def get_alert_queries_for_user(uid): return run_sql('select distinct id, 
urlargs, uqb.frequency from query q, user_query_basket uqb where q.id=uqb.id_query and uqb.id_user=%s and uqb.date_lastrun <= now();', (uid,)) def get_alerts(query, frequency): r = run_sql('select id_user, id_query, id_basket, frequency, date_lastrun, alert_name, notification from user_query_basket where id_query=%s and frequency=%s;', (query['id_query'], frequency,)) return {'alerts': r, 'records': query['records'], 'argstr': query['argstr'], 'date_from': query['date_from'], 'date_until': query['date_until']} # def add_record_to_basket(record_id, basket_id): # if DEBUGLEVEL > 0: # print "-> adding record %s into basket %s" % (record_id, basket_id) # try: # return run_sql('insert into basket_record (id_basket, id_record) values(%s, %s);', (basket_id, record_id,)) # except: # return 0 # def add_records_to_basket(record_ids, basket_id): # # TBD: generate the list and all all records in one step (see below) # for i in record_ids: # add_record_to_basket(i, basket_id) # Optimized version: def add_records_to_basket(record_ids, basket_id): global DEBUGLEVEL nrec = len(record_ids) if nrec > 0: vals = '(%s,%s)' % (basket_id, record_ids[0]) if nrec > 1: for i in record_ids[1:]: vals += ',(%s, %s)' % (basket_id, i) if DEBUGLEVEL > 0: print "-> adding %s records into basket %s: %s" % (nrec, basket_id, vals) try: if DEBUGLEVEL < 4: return run_sql('insert into basket_record (id_basket, id_record) values %s;' % vals) # Cannot use the run_sql(, (,)) form for some reason else: print ' NOT ADDED, DEBUG LEVEL == 4' return 0 except: return 0 else: return 0 def get_email(uid): r = run_sql('select email from user where id=%s', (uid,)) return r[0][0] def get_query(alert_id): r = run_sql('select urlargs from query where id=%s', (alert_id,)) return r[0][0] def send_email(fromaddr, toaddr, body): global DEBUGLEVEL try: server = smtplib.SMTP('localhost') if DEBUGLEVEL > 2: server.set_debuglevel(1) else: server.set_debuglevel(0) server.sendmail(fromaddr, toaddr, body) server.quit() except: 
print 'Error connecting to SMTP server, retrying in 10 seconds.' sleep(10) send_email(fromaddr, toaddr, body) def forge_email(fromaddr, toaddr, subject, content): body = 'From: %s\nTo: %s\nContent-Type: text/plain; charset=utf-8\nSubject: %s\n%s' % (fromaddr, toaddr, subject, content) return body def format_frequency(freq): frequency = freq if frequency == "day": return 'daily' else: return frequency + 'ly' def print_records(record_ids): global MAXIDS msg = '' c = 1 for i in record_ids: if c > MAXIDS: break msg += '\n\n%s) %s' % (c, get_as_text(i)) c += 1 if c > MAXIDS: - msg += '\n\n' + 'Only the first %s records are displayed above. Please consult the URL below to see all the results.' % MAXIDS + msg += '\n\n' + wrap('Only the first %s records are displayed above. Please consult the URL below to see all the results.' % MAXIDS) return msg + def email_notify(alert, records, argstr): global FROMADDR global ALERTURL global DEBUGLEVEL global DEVELOPERADDR if len(records) == 0: return msg = "" if DEBUGLEVEL > 0: msg = "*** THIS MESSAGE WAS SENT IN DEBUG MODE, DON'T TAKE IT INTO ACCOUNT ***\n\n" - msg += "Hello\n\nBelow are the results of the email alert that you set up with the CERN Document Server.\n" - msg += "This is an automatic message, please don't reply to its address. For any question, use <%s> instead.\n" % supportemail + msg += "Hello\n\n" + msg += wrap("Below are the results of the email alert that you set up with the CERN Document Server. This is an automatic message, please don't reply to its address. For any question, use <%s> instead." % supportemail) email = get_email(alert[0]) url = weburl + "/search.py?" 
+ argstr pattern = get_pattern(argstr) catalogue = get_catalogue(argstr) catword = 'catalogue' if get_catalogue_num(argstr) > 1: catword += 's' time = strftime("%d-%m-%Y") - msg += '\nalert name: %s' % alert[5] - msg += '\npattern: \'%s\'' % pattern + msg += '\n' + wrap('alert name: %s' % alert[5]) + msg += wrap('pattern: \'%s\'' % pattern) if catalogue: - msg += '\n%s: %s' % (catword, catalogue) - msg += '\nfrequency: %s ' % format_frequency(alert[3]) - msg += '\nrun time: %s ' % time - msg += '\nfound: %s record' % len(records) + msg += wrap('%s: %s' % (catword, catalogue)) + msg += wrap('frequency: %s ' % format_frequency(alert[3])) + msg += wrap('run time: %s ' % time) + msg += wrap('found: %s record' % len(records)) if len(records) > 1: msg += 's' msg += "\nurl: <%s/search.py?%s>\n" % (weburl, argstr) - msg += print_records(records) + msg += wrap_records(print_records(records)) - msg += "\n\n-- \nCERN Document Server Alert Service <%s>\nUnsubscribe at <%s>\nNeed human intervention? Contact <%s>" % (weburl, ALERTURL, supportemail) + msg += "-- \nCERN Document Server Alert Service <%s>\nUnsubscribe at <%s>\nNeed human intervention? 
Contact <%s>" % (weburl, ALERTURL, supportemail) subject = 'Alert %s run on %s' % (alert[5], time) body = forge_email(FROMADDR, email, subject, msg) if DEBUGLEVEL > 0: print "********************************************************************************" print body print "********************************************************************************" if DEBUGLEVEL < 2: send_email(FROMADDR, email, body) if DEBUGLEVEL == 4: for a in DEVELOPERADDR: send_email(FROMADDR, a, body) def get_argument(args, argname): if args.has_key(argname): return args[argname] else: return [] def get_record_ids(argstr, date_from, date_until): args = parse_qs(argstr) p = get_argument(args, 'p') c = get_argument(args, 'c') cc = get_argument(args, 'cc') as = get_argument(args, 'as') f = get_argument(args, 'f') rg = get_argument(args, 'rg') so = get_argument(args, 'so') sp = get_argument(args, 'sp') ot = get_argument(args, 'ot') as = get_argument(args, 'as') p1 = get_argument(args, 'p1') f1 = get_argument(args, 'f1') m1 = get_argument(args, 'm1') op1 = get_argument(args, 'op1') p2 = get_argument(args, 'p2') f2 = get_argument(args, 'f2') m2 = get_argument(args, 'm2') op2 = get_argument(args, 'op2') p3 = get_argument(args, 'p3') f3 = get_argument(args, 'f3') m3 = get_argument(args, 'm3') sc = get_argument(args, 'sc') # search = get_argument(args, 'search') d1y, d1m, d1d = date_from d2y, d2m, d2d = date_until return perform_request_search(of='id', p=p, c=c, cc=cc, f=f, so=so, sp=sp, ot=ot, as=as, p1=p1, f1=f1, m1=m1, op1=op1, p2=p2, f2=f2, m2=m2, op2=op2, p3=p3, f3=f3, m3=m3, sc=sc, d1y=d1y, d1m=d1m, d1d=d1d, d2y=d2y, d2m=d2m, d2d=d2d) def get_argument_as_string(argstr, argname): args = parse_qs(argstr) a = get_argument(args, argname) r = '' if len(a): r = a[0] for i in a[1:len(a)]: r += ", %s" % i return r def get_pattern(argstr): return get_argument_as_string(argstr, 'p') def get_catalogue(argstr): return get_argument_as_string(argstr, 'c') def get_catalogue_num(argstr): args = 
parse_qs(argstr) a = get_argument(args, 'c') return len(a) def get_date_from(time, freq): t = mktime(time) if freq == 'day': time2 = localtime(t - 86400) elif freq == 'month': m = time[1] - 1 y = time[0] if m == 0: m = 12 y -= 1 time2 = (y, m, time[2], time[3], time[4], time[5], time[6], time[7], time[8]) elif freq == 'week': time2 = localtime(t - 604800) ystr = strftime("%Y", time2) mstr = strftime("%m", time2) dstr = strftime("%d", time2) return (ystr, mstr, dstr) def run_query(query, frequency): """Return a dictionary containing the information of the performed query. The information contains the id of the query, the arguments as a string, and the list of found records.""" time = localtime() # Override time here for testing purposes (beware of localtime offset): #time = (2002, 12, 21, 2, 0, 0, 2, 120, 1) # Override frequency here for testing #frequency = 'week' ystr = strftime("%Y", time) mstr = strftime("%m", time) dstr = strftime("%d", time) date_until = (ystr, mstr, dstr) date_from = get_date_from(time, frequency) recs = get_record_ids(query[1], date_from, date_until) if DEBUGLEVEL > 2: print "[%s] run query: %s with dates: from=%s, until=%s\n found rec ids: %s" % (strftime("%c"), query, date_from, date_until, recs) return {'id_query': query[0], 'argstr': query[1], 'records': recs, 'date_from': date_from, 'date_until': date_until} def process_alert_queries(frequency): """Run the alerts according to the frequency. Retrieves the queries for which an alert exists, performs it, and processes the corresponding alerts.""" alert_queries = get_alert_queries(frequency) for aq in alert_queries: q = run_query(aq, frequency) alerts = get_alerts(q, frequency) process_alerts(alerts) def replace_argument(argstr, argname, argval): """Replace the given date argument value with the new one. 
If the argument is missing, it is added.""" if search('%s=\d+' % argname, argstr): r = sub('%s=\d+' % argname, '%s=%s' % (argname, argval), argstr) else: r = argstr + '&%s=%s' % (argname, argval) return r def update_arguments(argstr, date_from, date_until): """Replace date arguments in argstr with the ones specified by date_from and date_until. Absent arguments are added.""" d1y, d1m, d1d = date_from d2y, d2m, d2d = date_until r = replace_argument(argstr, 'd1y', d1y) r = replace_argument(r, 'd1m', d1m) r = replace_argument(r, 'd1d', d1d) r = replace_argument(r, 'd2y', d2y) r = replace_argument(r, 'd2m', d2m) r = replace_argument(r, 'd2d', d2d) return r def process_alerts(alerts): # TBD: do not generate the email each time, forge it once and then # send it to all appropriate people for a in alerts['alerts']: if alert_use_basket_p(a): add_records_to_basket(alerts['records'], a[2]) if alert_use_notification_p(a): argstr = update_arguments(alerts['argstr'], alerts['date_from'], alerts['date_until']) email_notify(a, alerts['records'], argstr) update_date_lastrun(a) def alert_use_basket_p(alert): return alert[2] != 0 def alert_use_notification_p(alert): return alert[6] == 'y' def run_alerts(): """Run the alerts. First decide which alerts to run according to the current local time, and runs them.""" t = localtime() if t[2] == 1: # first of the month process_alert_queries('month') t = strftime("%A") if t == 'Monday': # first day of the week process_alert_queries('week') process_alert_queries('day') def process_alert_queries_for_user(uid): """Process the alerts for the given user id. 
All alerts are with reference date set as the current local time.""" alert_queries = get_alert_queries_for_user(uid) print alert_queries for aq in alert_queries: frequency = aq[2] q = run_query(aq, frequency) alerts = get_alerts(q, frequency) process_alerts(alerts) if __name__ == '__main__': process_alert_queries_for_user(2571422) # erik process_alert_queries_for_user(109) # tibor # process_alert_queries_for_user(11040) # jean-yves diff --git a/modules/webalert/lib/alert_engine.py.wml b/modules/webalert/lib/alert_engine.py.wml index f9dc07837..9d37f1521 100644 --- a/modules/webalert/lib/alert_engine.py.wml +++ b/modules/webalert/lib/alert_engine.py.wml @@ -1,430 +1,432 @@ ## $Id$ ## Alert engine implementation. ## This file is part of the CERN Document Server Software (CDSware). ## Copyright (C) 2002 CERN. ## ## The CDSware is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## The CDSware is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDSware; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. ##read config variables #include "config.wml" #include "configbis.wml" ## $Id$ ## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. 
"""Alert engine implementation.""" ## rest of the Python code goes below __version__ = "$Id$" try: from cgi import parse_qs from sre import search, sub from time import localtime, strftime, mktime, sleep import smtplib from config import * from search_engine import perform_request_search from dbquery import run_sql from htmlparser import * + from string import split except ImportError, e: print "Error: %s" % e import sys sys.exit(1) MAXIDS = 50 FROMADDR = 'CDS Alert Engine <%s>' % alertengineemail ALERTURL = weburl + '/youralerts.py/list' DEVELOPERADDR = ['erik.simon@cern.ch', 'tibor.simko@cern.ch'] # Debug levels: # 0 = production, nothing on the console, email sent # 1 = messages on the console, email sent # 2 = messages on the console, but no email sent # 3 = many messages on the console, no email sent # 4 = many messages on the console, email sent to DEVELOPERADDR DEBUGLEVEL = 4 def update_date_lastrun(alert): return run_sql('update user_query_basket set date_lastrun=%s where id_user=%s and id_query=%s and id_basket=%s;', (strftime("%Y-%m-%d"), alert[0], alert[1], alert[2],)) def get_alert_queries(frequency): return run_sql('select distinct id, urlargs from query q, user_query_basket uqb where q.id=uqb.id_query and uqb.frequency=%s and uqb.date_lastrun <= now();', (frequency,)) def get_alert_queries_for_user(uid): return run_sql('select distinct id, urlargs, uqb.frequency from query q, user_query_basket uqb where q.id=uqb.id_query and uqb.id_user=%s and uqb.date_lastrun <= now();', (uid,)) def get_alerts(query, frequency): r = run_sql('select id_user, id_query, id_basket, frequency, date_lastrun, alert_name, notification from user_query_basket where id_query=%s and frequency=%s;', (query['id_query'], frequency,)) return {'alerts': r, 'records': query['records'], 'argstr': query['argstr'], 'date_from': query['date_from'], 'date_until': query['date_until']} # def add_record_to_basket(record_id, basket_id): # if DEBUGLEVEL > 0: # print "-> adding record %s into 
basket %s" % (record_id, basket_id) # try: # return run_sql('insert into basket_record (id_basket, id_record) values(%s, %s);', (basket_id, record_id,)) # except: # return 0 # def add_records_to_basket(record_ids, basket_id): # # TBD: generate the list and all all records in one step (see below) # for i in record_ids: # add_record_to_basket(i, basket_id) # Optimized version: def add_records_to_basket(record_ids, basket_id): global DEBUGLEVEL nrec = len(record_ids) if nrec > 0: vals = '(%s,%s)' % (basket_id, record_ids[0]) if nrec > 1: for i in record_ids[1:]: vals += ',(%s, %s)' % (basket_id, i) if DEBUGLEVEL > 0: print "-> adding %s records into basket %s: %s" % (nrec, basket_id, vals) try: if DEBUGLEVEL < 4: return run_sql('insert into basket_record (id_basket, id_record) values %s;' % vals) # Cannot use the run_sql(, (,)) form for some reason else: print ' NOT ADDED, DEBUG LEVEL == 4' return 0 except: return 0 else: return 0 def get_email(uid): r = run_sql('select email from user where id=%s', (uid,)) return r[0][0] def get_query(alert_id): r = run_sql('select urlargs from query where id=%s', (alert_id,)) return r[0][0] def send_email(fromaddr, toaddr, body): global DEBUGLEVEL try: server = smtplib.SMTP('localhost') if DEBUGLEVEL > 2: server.set_debuglevel(1) else: server.set_debuglevel(0) server.sendmail(fromaddr, toaddr, body) server.quit() except: print 'Error connecting to SMTP server, retrying in 10 seconds.' 
sleep(10) send_email(fromaddr, toaddr, body) def forge_email(fromaddr, toaddr, subject, content): body = 'From: %s\nTo: %s\nContent-Type: text/plain; charset=utf-8\nSubject: %s\n%s' % (fromaddr, toaddr, subject, content) return body def format_frequency(freq): frequency = freq if frequency == "day": return 'daily' else: return frequency + 'ly' def print_records(record_ids): global MAXIDS msg = '' c = 1 for i in record_ids: if c > MAXIDS: break msg += '\n\n%s) %s' % (c, get_as_text(i)) c += 1 if c > MAXIDS: - msg += '\n\n' + 'Only the first %s records are displayed above. Please consult the URL below to see all the results.' % MAXIDS + msg += '\n\n' + wrap('Only the first %s records are displayed above. Please consult the URL below to see all the results.' % MAXIDS) return msg + def email_notify(alert, records, argstr): global FROMADDR global ALERTURL global DEBUGLEVEL global DEVELOPERADDR if len(records) == 0: return msg = "" if DEBUGLEVEL > 0: msg = "*** THIS MESSAGE WAS SENT IN DEBUG MODE, DON'T TAKE IT INTO ACCOUNT ***\n\n" - msg += "Hello\n\nBelow are the results of the email alert that you set up with the CERN Document Server.\n" - msg += "This is an automatic message, please don't reply to its address. For any question, use <%s> instead.\n" % supportemail + msg += "Hello\n\n" + msg += wrap("Below are the results of the email alert that you set up with the CERN Document Server. This is an automatic message, please don't reply to its address. For any question, use <%s> instead." % supportemail) email = get_email(alert[0]) url = weburl + "/search.py?" 
+ argstr pattern = get_pattern(argstr) catalogue = get_catalogue(argstr) catword = 'catalogue' if get_catalogue_num(argstr) > 1: catword += 's' time = strftime("%d-%m-%Y") - msg += '\nalert name: %s' % alert[5] - msg += '\npattern: \'%s\'' % pattern + msg += '\n' + wrap('alert name: %s' % alert[5]) + msg += wrap('pattern: \'%s\'' % pattern) if catalogue: - msg += '\n%s: %s' % (catword, catalogue) - msg += '\nfrequency: %s ' % format_frequency(alert[3]) - msg += '\nrun time: %s ' % time - msg += '\nfound: %s record' % len(records) + msg += wrap('%s: %s' % (catword, catalogue)) + msg += wrap('frequency: %s ' % format_frequency(alert[3])) + msg += wrap('run time: %s ' % time) + msg += wrap('found: %s record' % len(records)) if len(records) > 1: msg += 's' msg += "\nurl: <%s/search.py?%s>\n" % (weburl, argstr) - msg += print_records(records) + msg += wrap_records(print_records(records)) - msg += "\n\n-- \nCERN Document Server Alert Service <%s>\nUnsubscribe at <%s>\nNeed human intervention? Contact <%s>" % (weburl, ALERTURL, supportemail) + msg += "-- \nCERN Document Server Alert Service <%s>\nUnsubscribe at <%s>\nNeed human intervention? 
Contact <%s>" % (weburl, ALERTURL, supportemail) subject = 'Alert %s run on %s' % (alert[5], time) body = forge_email(FROMADDR, email, subject, msg) if DEBUGLEVEL > 0: print "********************************************************************************" print body print "********************************************************************************" if DEBUGLEVEL < 2: send_email(FROMADDR, email, body) if DEBUGLEVEL == 4: for a in DEVELOPERADDR: send_email(FROMADDR, a, body) def get_argument(args, argname): if args.has_key(argname): return args[argname] else: return [] def get_record_ids(argstr, date_from, date_until): args = parse_qs(argstr) p = get_argument(args, 'p') c = get_argument(args, 'c') cc = get_argument(args, 'cc') as = get_argument(args, 'as') f = get_argument(args, 'f') rg = get_argument(args, 'rg') so = get_argument(args, 'so') sp = get_argument(args, 'sp') ot = get_argument(args, 'ot') as = get_argument(args, 'as') p1 = get_argument(args, 'p1') f1 = get_argument(args, 'f1') m1 = get_argument(args, 'm1') op1 = get_argument(args, 'op1') p2 = get_argument(args, 'p2') f2 = get_argument(args, 'f2') m2 = get_argument(args, 'm2') op2 = get_argument(args, 'op2') p3 = get_argument(args, 'p3') f3 = get_argument(args, 'f3') m3 = get_argument(args, 'm3') sc = get_argument(args, 'sc') # search = get_argument(args, 'search') d1y, d1m, d1d = date_from d2y, d2m, d2d = date_until return perform_request_search(of='id', p=p, c=c, cc=cc, f=f, so=so, sp=sp, ot=ot, as=as, p1=p1, f1=f1, m1=m1, op1=op1, p2=p2, f2=f2, m2=m2, op2=op2, p3=p3, f3=f3, m3=m3, sc=sc, d1y=d1y, d1m=d1m, d1d=d1d, d2y=d2y, d2m=d2m, d2d=d2d) def get_argument_as_string(argstr, argname): args = parse_qs(argstr) a = get_argument(args, argname) r = '' if len(a): r = a[0] for i in a[1:len(a)]: r += ", %s" % i return r def get_pattern(argstr): return get_argument_as_string(argstr, 'p') def get_catalogue(argstr): return get_argument_as_string(argstr, 'c') def get_catalogue_num(argstr): args = 
parse_qs(argstr) a = get_argument(args, 'c') return len(a) def get_date_from(time, freq): t = mktime(time) if freq == 'day': time2 = localtime(t - 86400) elif freq == 'month': m = time[1] - 1 y = time[0] if m == 0: m = 12 y -= 1 time2 = (y, m, time[2], time[3], time[4], time[5], time[6], time[7], time[8]) elif freq == 'week': time2 = localtime(t - 604800) ystr = strftime("%Y", time2) mstr = strftime("%m", time2) dstr = strftime("%d", time2) return (ystr, mstr, dstr) def run_query(query, frequency): """Return a dictionary containing the information of the performed query. The information contains the id of the query, the arguments as a string, and the list of found records.""" time = localtime() # Override time here for testing purposes (beware of localtime offset): #time = (2002, 12, 21, 2, 0, 0, 2, 120, 1) # Override frequency here for testing #frequency = 'week' ystr = strftime("%Y", time) mstr = strftime("%m", time) dstr = strftime("%d", time) date_until = (ystr, mstr, dstr) date_from = get_date_from(time, frequency) recs = get_record_ids(query[1], date_from, date_until) if DEBUGLEVEL > 2: print "[%s] run query: %s with dates: from=%s, until=%s\n found rec ids: %s" % (strftime("%c"), query, date_from, date_until, recs) return {'id_query': query[0], 'argstr': query[1], 'records': recs, 'date_from': date_from, 'date_until': date_until} def process_alert_queries(frequency): """Run the alerts according to the frequency. Retrieves the queries for which an alert exists, performs it, and processes the corresponding alerts.""" alert_queries = get_alert_queries(frequency) for aq in alert_queries: q = run_query(aq, frequency) alerts = get_alerts(q, frequency) process_alerts(alerts) def replace_argument(argstr, argname, argval): """Replace the given date argument value with the new one. 
If the argument is missing, it is added.""" if search('%s=\d+' % argname, argstr): r = sub('%s=\d+' % argname, '%s=%s' % (argname, argval), argstr) else: r = argstr + '&%s=%s' % (argname, argval) return r def update_arguments(argstr, date_from, date_until): """Replace date arguments in argstr with the ones specified by date_from and date_until. Absent arguments are added.""" d1y, d1m, d1d = date_from d2y, d2m, d2d = date_until r = replace_argument(argstr, 'd1y', d1y) r = replace_argument(r, 'd1m', d1m) r = replace_argument(r, 'd1d', d1d) r = replace_argument(r, 'd2y', d2y) r = replace_argument(r, 'd2m', d2m) r = replace_argument(r, 'd2d', d2d) return r def process_alerts(alerts): # TBD: do not generate the email each time, forge it once and then # send it to all appropriate people for a in alerts['alerts']: if alert_use_basket_p(a): add_records_to_basket(alerts['records'], a[2]) if alert_use_notification_p(a): argstr = update_arguments(alerts['argstr'], alerts['date_from'], alerts['date_until']) email_notify(a, alerts['records'], argstr) update_date_lastrun(a) def alert_use_basket_p(alert): return alert[2] != 0 def alert_use_notification_p(alert): return alert[6] == 'y' def run_alerts(): """Run the alerts. First decide which alerts to run according to the current local time, and runs them.""" t = localtime() if t[2] == 1: # first of the month process_alert_queries('month') t = strftime("%A") if t == 'Monday': # first day of the week process_alert_queries('week') process_alert_queries('day') def process_alert_queries_for_user(uid): """Process the alerts for the given user id. 
All alerts are with reference date set as the current local time.""" alert_queries = get_alert_queries_for_user(uid) print alert_queries for aq in alert_queries: frequency = aq[2] q = run_query(aq, frequency) alerts = get_alerts(q, frequency) process_alerts(alerts) if __name__ == '__main__': process_alert_queries_for_user(2571422) # erik process_alert_queries_for_user(109) # tibor # process_alert_queries_for_user(11040) # jean-yves diff --git a/modules/webalert/lib/htmlparser.py b/modules/webalert/lib/htmlparser.py index 2d5753ca7..9d6325a91 100644 --- a/modules/webalert/lib/htmlparser.py +++ b/modules/webalert/lib/htmlparser.py @@ -1,108 +1,136 @@ ## $Id$ ## HTML parser for records. ## This file is part of the CERN Document Server Software (CDSware). ## Copyright (C) 2002 CERN. ## ## The CDSware is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## The CDSware is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDSware; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. ##read config variables #include "config.wml" #include "configbis.wml" ## $Id$ ## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. 
"""HTML parser for records.""" ## rest of the Python code goes below __version__ = "$Id$" try: from config import * from search_engine import print_record from HTMLParser import HTMLParser + import textwrap + from string import split except ImportError, e: print "Error: %s" % e import sys sys.exit(1) +WRAPWIDTH = 72 + +def wrap(text): + global WRAPWIDTH + + lines = textwrap.wrap(text, WRAPWIDTH) + r = '' + for l in lines: + r += l + '\n' + return r + +def wrap_records(text): + global WRAPWIDTH + + lines = split(text, '\n') + result = '' + for l in lines: + newlines = textwrap.wrap(l, WRAPWIDTH) + for ll in newlines: + result += ll + '\n' + return result class RecordHTMLParser(HTMLParser): """A parser for the HTML returned by cdsware.search_engine.print_record. The parser provides methods to transform the HTML returned by cdsware.search_engine.print_record into plain text, with some minor formatting. """ def __init__(self): HTMLParser.__init__(self) self.result = '' def handle_starttag(self, tag, attrs): if tag == 'strong': # self.result += '*' pass elif tag == 'a': self.printURL = 0 self.unclosedBracket = 0 for f in attrs: if f[1] == 'note': self.result += 'Fulltext : <' self.unclosedBracket = 1 if f[1] == 'moreinfo': self.result += 'Detailed record : ' self.printURL = 1 if (self.printURL == 1) and (f[0] == 'href'): self.result += '<' + f[1] + '>' elif tag == 'br': self.result += '\n' def handle_endtag(self, tag): if tag == 'strong': # self.result += '\n' pass elif tag == 'a': if self.unclosedBracket == 1: self.result += '>' self.unclosedBracket = 0 def handle_data(self, data): if data == 'Detailed record': pass else: self.result += data + def handle_comment(self, data): + pass + + def get_as_text(record_id): """Return the plain text from RecordHTMLParser of the record.""" rec = print_record(record_id) htparser = RecordHTMLParser() try: htparser.feed(rec) return htparser.result except: + htparser.close() return htparser.result + '\n**HTML Error detected in record , 
contact <%s>.' % (record_id, supportemail) if __name__ == "__main__": rec = print_record(619028) print rec print "***" print get_as_text(619028) diff --git a/modules/webalert/lib/htmlparser.py.wml b/modules/webalert/lib/htmlparser.py.wml index 2d5753ca7..9d6325a91 100644 --- a/modules/webalert/lib/htmlparser.py.wml +++ b/modules/webalert/lib/htmlparser.py.wml @@ -1,108 +1,136 @@ ## $Id$ ## HTML parser for records. ## This file is part of the CERN Document Server Software (CDSware). ## Copyright (C) 2002 CERN. ## ## The CDSware is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## The CDSware is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDSware; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. ##read config variables #include "config.wml" #include "configbis.wml" ## $Id$ ## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES. 
"""HTML parser for records.""" ## rest of the Python code goes below __version__ = "$Id$" try: from config import * from search_engine import print_record from HTMLParser import HTMLParser + import textwrap + from string import split except ImportError, e: print "Error: %s" % e import sys sys.exit(1) +WRAPWIDTH = 72 + +def wrap(text): + global WRAPWIDTH + + lines = textwrap.wrap(text, WRAPWIDTH) + r = '' + for l in lines: + r += l + '\n' + return r + +def wrap_records(text): + global WRAPWIDTH + + lines = split(text, '\n') + result = '' + for l in lines: + newlines = textwrap.wrap(l, WRAPWIDTH) + for ll in newlines: + result += ll + '\n' + return result class RecordHTMLParser(HTMLParser): """A parser for the HTML returned by cdsware.search_engine.print_record. The parser provides methods to transform the HTML returned by cdsware.search_engine.print_record into plain text, with some minor formatting. """ def __init__(self): HTMLParser.__init__(self) self.result = '' def handle_starttag(self, tag, attrs): if tag == 'strong': # self.result += '*' pass elif tag == 'a': self.printURL = 0 self.unclosedBracket = 0 for f in attrs: if f[1] == 'note': self.result += 'Fulltext : <' self.unclosedBracket = 1 if f[1] == 'moreinfo': self.result += 'Detailed record : ' self.printURL = 1 if (self.printURL == 1) and (f[0] == 'href'): self.result += '<' + f[1] + '>' elif tag == 'br': self.result += '\n' def handle_endtag(self, tag): if tag == 'strong': # self.result += '\n' pass elif tag == 'a': if self.unclosedBracket == 1: self.result += '>' self.unclosedBracket = 0 def handle_data(self, data): if data == 'Detailed record': pass else: self.result += data + def handle_comment(self, data): + pass + + def get_as_text(record_id): """Return the plain text from RecordHTMLParser of the record.""" rec = print_record(record_id) htparser = RecordHTMLParser() try: htparser.feed(rec) return htparser.result except: + htparser.close() return htparser.result + '\n**HTML Error detected in record , 
contact <%s>.' % (record_id, supportemail) if __name__ == "__main__": rec = print_record(619028) print rec print "***" print get_as_text(619028) diff --git a/modules/webalert/lib/textwrap.py b/modules/webalert/lib/textwrap.py new file mode 100644 index 000000000..051895991 --- /dev/null +++ b/modules/webalert/lib/textwrap.py @@ -0,0 +1,313 @@ +"""Text wrapping and filling. + + Backported to Python 2.0 for inclusion in Mixminion. +""" + +# Copyright (C) 1999-2001 Gregory P. Ward. +# Copyright (C) 2002 Python Software Foundation. +# Written by Greg Ward + +# XXX currently this module does not work very well with Unicode +# strings. See http://www.python.org/sf/622831 for updates. + + + +__revision__ = "$Id$" + +import string, re + +# THIS SECTION IS HERE TO BACKPORT THIS MODULE TO PYTHON 2.0. + +# Later in the file, we replace ininstance(x, str) with +# isinstance(x, types.StringType) and so on. +import types + + +# The 'True' and 'False' constants weren't introduced until Python 2.3. +try: + True +except NameError: + True, False = 1, 0 +# END BACKPORT SECTION. + +class TextWrapper: + """ + Object for wrapping/filling text. The public interface consists of + the wrap() and fill() methods; the other methods are just there for + subclasses to override in order to tweak the default behaviour. + If you want to completely replace the main wrapping algorithm, + you'll probably have to override _wrap_chunks(). + + Several instance attributes control various aspects of wrapping: + width (default: 70) + the maximum width of wrapped lines (unless break_long_words + is false) + initial_indent (default: "") + string that will be prepended to the first line of wrapped + output. Counts towards the line's width. + subsequent_indent (default: "") + string that will be prepended to all lines save the first + of wrapped output; also counts towards each line's width. + expand_tabs (default: true) + Expand tabs in input text to spaces before further processing. 
+ Each tab will become 1 .. 8 spaces, depending on its position in + its line. If false, each tab is treated as a single character. + replace_whitespace (default: true) + Replace all whitespace characters in the input text by spaces + after tab expansion. Note that if expand_tabs is false and + replace_whitespace is true, every tab will be converted to a + single space! + fix_sentence_endings (default: false) + Ensure that sentence-ending punctuation is always followed + by two spaces. Off by default because the algorithm is + (unavoidably) imperfect. + break_long_words (default: true) + Break words longer than 'width'. If false, those words will not + be broken, and some lines might be longer than 'width'. + """ + + whitespace_trans = string.maketrans(string.whitespace, + ' ' * len(string.whitespace)) + + unicode_whitespace_trans = {} + for c in string.whitespace: + unicode_whitespace_trans[ord(unicode(c))] = ord(u' ') + + # This funky little regex is just the trick for splitting + # text up into word-wrappable chunks. E.g. + # "Hello there -- you goof-ball, use the -b option!" + # splits into + # Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option! + # (after stripping out empty strings). + wordsep_re = re.compile(r'(\s+|' # any whitespace + r'-*\w{2,}-(?=\w{2,})|' # hyphenated words + r'(?<=\S)-{2,}(?=\w))') # em-dash + + # XXX will there be a locale-or-charset-aware version of + # string.lowercase in 2.3? + sentence_end_re = re.compile(r'[%s]' # lowercase letter + r'[\.\!\?]' # sentence-ending punct. + r'[\"\']?' 
# optional end-of-quote + % string.lowercase) + + + def __init__ (self, + width=70, + initial_indent="", + subsequent_indent="", + expand_tabs=True, + replace_whitespace=True, + fix_sentence_endings=False, + break_long_words=True): + self.width = width + self.initial_indent = initial_indent + self.subsequent_indent = subsequent_indent + self.expand_tabs = expand_tabs + self.replace_whitespace = replace_whitespace + self.fix_sentence_endings = fix_sentence_endings + self.break_long_words = break_long_words + + + # -- Private methods ----------------------------------------------- + # (possibly useful for subclasses to override) + + def _munge_whitespace(self, text): + """_munge_whitespace(text : string) -> string + + Munge whitespace in text: expand tabs and convert all other + whitespace characters to spaces. Eg. " foo\tbar\n\nbaz" + becomes " foo bar baz". + """ + if self.expand_tabs: + text = text.expandtabs() + if self.replace_whitespace: + if isinstance(text, types.StringType): + text = text.translate(self.whitespace_trans) + elif isinstance(text, types.UnicodeType): + text = text.translate(self.unicode_whitespace_trans) + return text + + + def _split(self, text): + """_split(text : string) -> [string] + + Split the text to wrap into indivisible chunks. Chunks are + not quite the same as words; see wrap_chunks() for full + details. As an example, the text + Look, goof-ball -- use the -b option! + breaks into the following chunks: + 'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ', + 'use', ' ', 'the', ' ', '-b', ' ', 'option!' + """ + chunks = self.wordsep_re.split(text) + chunks = filter(None, chunks) + return chunks + + def _fix_sentence_endings(self, chunks): + """_fix_sentence_endings(chunks : [string]) + + Correct for sentence endings buried in 'chunks'. Eg. when the + original text contains "... foo.\nBar ...", munge_whitespace() + and split() will convert that to [..., "foo.", " ", "Bar", ...] 
+ which has one too few spaces; this method simply changes the one + space to two. + """ + i = 0 + pat = self.sentence_end_re + while i < len(chunks)-1: + if chunks[i+1] == " " and pat.search(chunks[i]): + chunks[i+1] = " " + i += 2 + else: + i += 1 + + def _handle_long_word(self, chunks, cur_line, cur_len, width): + """_handle_long_word(chunks : [string], + cur_line : [string], + cur_len : int, width : int) + + Handle a chunk of text (most likely a word, not whitespace) that + is too long to fit in any line. + """ + space_left = width - cur_len + + # If we're allowed to break long words, then do so: put as much + # of the next chunk onto the current line as will fit. + if self.break_long_words: + cur_line.append(chunks[0][0:space_left]) + chunks[0] = chunks[0][space_left:] + + # Otherwise, we have to preserve the long word intact. Only add + # it to the current line if there's nothing already there -- + # that minimizes how much we violate the width constraint. + elif not cur_line: + cur_line.append(chunks.pop(0)) + + # If we're not allowed to break long words, and there's already + # text on the current line, do nothing. Next time through the + # main loop of _wrap_chunks(), we'll wind up here again, but + # cur_len will be zero, so the next line will be entirely + # devoted to the long word that we can't handle right now. + + def _wrap_chunks(self, chunks): + """_wrap_chunks(chunks : [string]) -> [string] + + Wrap a sequence of text chunks and return a list of lines of + length 'self.width' or less. (If 'break_long_words' is false, + some lines may be longer than this.) Chunks correspond roughly + to words and the whitespace between them: each chunk is + indivisible (modulo 'break_long_words'), but a line break can + come between any two chunks. Chunks should not have internal + whitespace; ie. a chunk is either all whitespace or a "word". + Whitespace chunks will be removed from the beginning and end of + lines, but apart from that whitespace is preserved. 
+ """ + lines = [] + + while chunks: + + # Start the list of chunks that will make up the current line. + # cur_len is just the length of all the chunks in cur_line. + cur_line = [] + cur_len = 0 + + # Figure out which static string will prefix this line. + if lines: + indent = self.subsequent_indent + else: + indent = self.initial_indent + + # Maximum width for this line. + width = self.width - len(indent) + + # First chunk on line is whitespace -- drop it, unless this + # is the very beginning of the text (ie. no lines started yet). + if chunks[0].strip() == '' and lines: + del chunks[0] + + while chunks: + l = len(chunks[0]) + + # Can at least squeeze this chunk onto the current line. + if cur_len + l <= width: + cur_line.append(chunks.pop(0)) + cur_len += l + + # Nope, this line is full. + else: + break + + # The current line is full, and the next chunk is too big to + # fit on *any* line (not just this one). + if chunks and len(chunks[0]) > width: + self._handle_long_word(chunks, cur_line, cur_len, width) + + # If the last chunk on this line is all whitespace, drop it. + if cur_line and cur_line[-1].strip() == '': + del cur_line[-1] + + # Convert current line back to a string and store it in list + # of all lines (return value). + if cur_line: + lines.append(indent + ''.join(cur_line)) + + return lines + + + # -- Public interface ---------------------------------------------- + + def wrap(self, text): + """wrap(text : string) -> [string] + + Reformat the single paragraph in 'text' so it fits in lines of + no more than 'self.width' columns, and return a list of wrapped + lines. Tabs in 'text' are expanded with string.expandtabs(), + and all other whitespace characters (including newline) are + converted to space. 
+ """ + text = self._munge_whitespace(text) + indent = self.initial_indent + if len(text) + len(indent) <= self.width: + return [indent + text] + chunks = self._split(text) + if self.fix_sentence_endings: + self._fix_sentence_endings(chunks) + return self._wrap_chunks(chunks) + + def fill(self, text): + """fill(text : string) -> string + + Reformat the single paragraph in 'text' to fit in lines of no + more than 'self.width' columns, and return a new string + containing the entire wrapped paragraph. + """ + return "\n".join(self.wrap(text)) + + +# -- Convenience interface --------------------------------------------- + +def wrap(text, width=70, **kwargs): + """Wrap a single paragraph of text, returning a list of wrapped lines. + + Reformat the single paragraph in 'text' so it fits in lines of no + more than 'width' columns, and return a list of wrapped lines. By + default, tabs in 'text' are expanded with string.expandtabs(), and + all other whitespace characters (including newline) are converted to + space. See TextWrapper class for available keyword args to customize + wrapping behaviour. + """ + w = TextWrapper(width=width, **kwargs) + return w.wrap(text) + +def fill(text, width=70, **kwargs): + """Fill a single paragraph of text, returning a new string. + + Reformat the single paragraph in 'text' to fit in lines of no more + than 'width' columns, and return a new string containing the entire + wrapped paragraph. As with wrap(), tabs are expanded and other + whitespace characters converted to space. See TextWrapper class for + available keyword args to customize wrapping behaviour. + """ + w = TextWrapper(width=width, **kwargs) + return w.fill(text) + diff --git a/modules/webalert/lib/textwrap.py.wml b/modules/webalert/lib/textwrap.py.wml new file mode 100644 index 000000000..051895991 --- /dev/null +++ b/modules/webalert/lib/textwrap.py.wml @@ -0,0 +1,313 @@ +"""Text wrapping and filling. + + Backported to Python 2.0 for inclusion in Mixminion. 
+""" + +# Copyright (C) 1999-2001 Gregory P. Ward. +# Copyright (C) 2002 Python Software Foundation. +# Written by Greg Ward + +# XXX currently this module does not work very well with Unicode +# strings. See http://www.python.org/sf/622831 for updates. + + + +__revision__ = "$Id$" + +import string, re + +# THIS SECTION IS HERE TO BACKPORT THIS MODULE TO PYTHON 2.0. + +# Later in the file, we replace ininstance(x, str) with +# isinstance(x, types.StringType) and so on. +import types + + +# The 'True' and 'False' constants weren't introduced until Python 2.3. +try: + True +except NameError: + True, False = 1, 0 +# END BACKPORT SECTION. + +class TextWrapper: + """ + Object for wrapping/filling text. The public interface consists of + the wrap() and fill() methods; the other methods are just there for + subclasses to override in order to tweak the default behaviour. + If you want to completely replace the main wrapping algorithm, + you'll probably have to override _wrap_chunks(). + + Several instance attributes control various aspects of wrapping: + width (default: 70) + the maximum width of wrapped lines (unless break_long_words + is false) + initial_indent (default: "") + string that will be prepended to the first line of wrapped + output. Counts towards the line's width. + subsequent_indent (default: "") + string that will be prepended to all lines save the first + of wrapped output; also counts towards each line's width. + expand_tabs (default: true) + Expand tabs in input text to spaces before further processing. + Each tab will become 1 .. 8 spaces, depending on its position in + its line. If false, each tab is treated as a single character. + replace_whitespace (default: true) + Replace all whitespace characters in the input text by spaces + after tab expansion. Note that if expand_tabs is false and + replace_whitespace is true, every tab will be converted to a + single space! 
+ fix_sentence_endings (default: false) + Ensure that sentence-ending punctuation is always followed + by two spaces. Off by default because the algorithm is + (unavoidably) imperfect. + break_long_words (default: true) + Break words longer than 'width'. If false, those words will not + be broken, and some lines might be longer than 'width'. + """ + + whitespace_trans = string.maketrans(string.whitespace, + ' ' * len(string.whitespace)) + + unicode_whitespace_trans = {} + for c in string.whitespace: + unicode_whitespace_trans[ord(unicode(c))] = ord(u' ') + + # This funky little regex is just the trick for splitting + # text up into word-wrappable chunks. E.g. + # "Hello there -- you goof-ball, use the -b option!" + # splits into + # Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option! + # (after stripping out empty strings). + wordsep_re = re.compile(r'(\s+|' # any whitespace + r'-*\w{2,}-(?=\w{2,})|' # hyphenated words + r'(?<=\S)-{2,}(?=\w))') # em-dash + + # XXX will there be a locale-or-charset-aware version of + # string.lowercase in 2.3? + sentence_end_re = re.compile(r'[%s]' # lowercase letter + r'[\.\!\?]' # sentence-ending punct. + r'[\"\']?' 
# optional end-of-quote + % string.lowercase) + + + def __init__ (self, + width=70, + initial_indent="", + subsequent_indent="", + expand_tabs=True, + replace_whitespace=True, + fix_sentence_endings=False, + break_long_words=True): + self.width = width + self.initial_indent = initial_indent + self.subsequent_indent = subsequent_indent + self.expand_tabs = expand_tabs + self.replace_whitespace = replace_whitespace + self.fix_sentence_endings = fix_sentence_endings + self.break_long_words = break_long_words + + + # -- Private methods ----------------------------------------------- + # (possibly useful for subclasses to override) + + def _munge_whitespace(self, text): + """_munge_whitespace(text : string) -> string + + Munge whitespace in text: expand tabs and convert all other + whitespace characters to spaces. Eg. " foo\tbar\n\nbaz" + becomes " foo bar baz". + """ + if self.expand_tabs: + text = text.expandtabs() + if self.replace_whitespace: + if isinstance(text, types.StringType): + text = text.translate(self.whitespace_trans) + elif isinstance(text, types.UnicodeType): + text = text.translate(self.unicode_whitespace_trans) + return text + + + def _split(self, text): + """_split(text : string) -> [string] + + Split the text to wrap into indivisible chunks. Chunks are + not quite the same as words; see wrap_chunks() for full + details. As an example, the text + Look, goof-ball -- use the -b option! + breaks into the following chunks: + 'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ', + 'use', ' ', 'the', ' ', '-b', ' ', 'option!' + """ + chunks = self.wordsep_re.split(text) + chunks = filter(None, chunks) + return chunks + + def _fix_sentence_endings(self, chunks): + """_fix_sentence_endings(chunks : [string]) + + Correct for sentence endings buried in 'chunks'. Eg. when the + original text contains "... foo.\nBar ...", munge_whitespace() + and split() will convert that to [..., "foo.", " ", "Bar", ...] 
+ which has one too few spaces; this method simply changes the one + space to two. + """ + i = 0 + pat = self.sentence_end_re + while i < len(chunks)-1: + if chunks[i+1] == " " and pat.search(chunks[i]): + chunks[i+1] = " " + i += 2 + else: + i += 1 + + def _handle_long_word(self, chunks, cur_line, cur_len, width): + """_handle_long_word(chunks : [string], + cur_line : [string], + cur_len : int, width : int) + + Handle a chunk of text (most likely a word, not whitespace) that + is too long to fit in any line. + """ + space_left = width - cur_len + + # If we're allowed to break long words, then do so: put as much + # of the next chunk onto the current line as will fit. + if self.break_long_words: + cur_line.append(chunks[0][0:space_left]) + chunks[0] = chunks[0][space_left:] + + # Otherwise, we have to preserve the long word intact. Only add + # it to the current line if there's nothing already there -- + # that minimizes how much we violate the width constraint. + elif not cur_line: + cur_line.append(chunks.pop(0)) + + # If we're not allowed to break long words, and there's already + # text on the current line, do nothing. Next time through the + # main loop of _wrap_chunks(), we'll wind up here again, but + # cur_len will be zero, so the next line will be entirely + # devoted to the long word that we can't handle right now. + + def _wrap_chunks(self, chunks): + """_wrap_chunks(chunks : [string]) -> [string] + + Wrap a sequence of text chunks and return a list of lines of + length 'self.width' or less. (If 'break_long_words' is false, + some lines may be longer than this.) Chunks correspond roughly + to words and the whitespace between them: each chunk is + indivisible (modulo 'break_long_words'), but a line break can + come between any two chunks. Chunks should not have internal + whitespace; ie. a chunk is either all whitespace or a "word". + Whitespace chunks will be removed from the beginning and end of + lines, but apart from that whitespace is preserved. 
+ """ + lines = [] + + while chunks: + + # Start the list of chunks that will make up the current line. + # cur_len is just the length of all the chunks in cur_line. + cur_line = [] + cur_len = 0 + + # Figure out which static string will prefix this line. + if lines: + indent = self.subsequent_indent + else: + indent = self.initial_indent + + # Maximum width for this line. + width = self.width - len(indent) + + # First chunk on line is whitespace -- drop it, unless this + # is the very beginning of the text (ie. no lines started yet). + if chunks[0].strip() == '' and lines: + del chunks[0] + + while chunks: + l = len(chunks[0]) + + # Can at least squeeze this chunk onto the current line. + if cur_len + l <= width: + cur_line.append(chunks.pop(0)) + cur_len += l + + # Nope, this line is full. + else: + break + + # The current line is full, and the next chunk is too big to + # fit on *any* line (not just this one). + if chunks and len(chunks[0]) > width: + self._handle_long_word(chunks, cur_line, cur_len, width) + + # If the last chunk on this line is all whitespace, drop it. + if cur_line and cur_line[-1].strip() == '': + del cur_line[-1] + + # Convert current line back to a string and store it in list + # of all lines (return value). + if cur_line: + lines.append(indent + ''.join(cur_line)) + + return lines + + + # -- Public interface ---------------------------------------------- + + def wrap(self, text): + """wrap(text : string) -> [string] + + Reformat the single paragraph in 'text' so it fits in lines of + no more than 'self.width' columns, and return a list of wrapped + lines. Tabs in 'text' are expanded with string.expandtabs(), + and all other whitespace characters (including newline) are + converted to space. 
+ """ + text = self._munge_whitespace(text) + indent = self.initial_indent + if len(text) + len(indent) <= self.width: + return [indent + text] + chunks = self._split(text) + if self.fix_sentence_endings: + self._fix_sentence_endings(chunks) + return self._wrap_chunks(chunks) + + def fill(self, text): + """fill(text : string) -> string + + Reformat the single paragraph in 'text' to fit in lines of no + more than 'self.width' columns, and return a new string + containing the entire wrapped paragraph. + """ + return "\n".join(self.wrap(text)) + + +# -- Convenience interface --------------------------------------------- + +def wrap(text, width=70, **kwargs): + """Wrap a single paragraph of text, returning a list of wrapped lines. + + Reformat the single paragraph in 'text' so it fits in lines of no + more than 'width' columns, and return a list of wrapped lines. By + default, tabs in 'text' are expanded with string.expandtabs(), and + all other whitespace characters (including newline) are converted to + space. See TextWrapper class for available keyword args to customize + wrapping behaviour. + """ + w = TextWrapper(width=width, **kwargs) + return w.wrap(text) + +def fill(text, width=70, **kwargs): + """Fill a single paragraph of text, returning a new string. + + Reformat the single paragraph in 'text' to fit in lines of no more + than 'width' columns, and return a new string containing the entire + wrapped paragraph. As with wrap(), tabs are expanded and other + whitespace characters converted to space. See TextWrapper class for + available keyword args to customize wrapping behaviour. + """ + w = TextWrapper(width=width, **kwargs) + return w.fill(text) +