## $Id$
##
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
__revision__ = "$Id$"
__lastupdated__ = "$Date$"
import calendar, commands, datetime, time, os, cPickle

from invenio.config import tmpdir, weburl
from invenio.urlutils import redirect_to_url
from invenio.search_engine import perform_request_search
from invenio.dbquery import run_sql

WEBSTAT_SESSION_LENGTH = 48*60*60 # seconds
WEBSTAT_GRAPH_TOKENS = '-=#+@$%&XOSKEHBC'

# KEY EVENT TREND SECTION

def get_keyevent_trend_collection_population(args):
    """
    Returns the quantity of documents in CDS Invenio for
    the given timestamp range.

    @param args['collection']: A collection name
    @type args['collection']: str
    @param args['t_start']: Date and time of start point
    @type args['t_start']: str
    @param args['t_end']: Date and time of end point
    @type args['t_end']: str
    @param args['granularity']: Granularity of date and time
    @type args['granularity']: str
    @param args['t_format']: Date and time formatting string
    @type args['t_format']: str
    """
    # Collect the list of insertion timestamps in the specific collection
    ids = perform_request_search(cc=args['collection'])
    if len(ids) == 0:
        return []
    sql = ("SELECT creation_date FROM bibrec WHERE id IN %s ORDER BY "
           "creation_date DESC") % str(ids).replace('[', '(').replace(']', ')')
    action_dates = [x[0] for x in run_sql(sql)]
    initial_quantity = run_sql("SELECT COUNT(id) FROM bibrec WHERE creation_date < '%s'" %
                               _to_datetime(args['t_start'], args['t_format']).isoformat())[0][0]
    return _get_trend_from_actions(action_dates, initial_quantity,
                                   args['t_start'], args['t_end'],
                                   args['granularity'], args['t_format'])

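# A minimal usage sketch (hypothetical collection name and dates; the
# returned counts are illustrative only):
#
#   trend = get_keyevent_trend_collection_population(
#       {'collection': 'Preprints',
#        't_start': '2008-01-01 00:00:00',
#        't_end': '2008-01-04 00:00:00',
#        'granularity': 'day',
#        't_format': '%Y-%m-%d %H:%M:%S'})
#   # -> e.g. [('2008-01-01 00:00:00', 1402), ('2008-01-02 00:00:00', 1409),
#   #          ('2008-01-03 00:00:00', 1414)]
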
def get_keyevent_trend_search_frequency(args):
    """
    Returns the number of searches (of any kind) carried out
    during the given timestamp range.

    @param args['t_start']: Date and time of start point
    @type args['t_start']: str
    @param args['t_end']: Date and time of end point
    @type args['t_end']: str
    @param args['granularity']: Granularity of date and time
    @type args['granularity']: str
    @param args['t_format']: Date and time formatting string
    @type args['t_format']: str
    """
    sql = "SELECT date FROM query INNER JOIN user_query ON id=id_query ORDER BY date DESC"
    action_dates = [x[0] for x in run_sql(sql)]
    return _get_trend_from_actions(action_dates, 0, args['t_start'],
                                   args['t_end'], args['granularity'],
                                   args['t_format'])

def get_keyevent_trend_search_type_distribution(args):
    """
    Returns the number of searches carried out during the given
    timestamp range, partitioned by type: simple and advanced.

    @param args['t_start']: Date and time of start point
    @type args['t_start']: str
    @param args['t_end']: Date and time of end point
    @type args['t_end']: str
    @param args['granularity']: Granularity of date and time
    @type args['granularity']: str
    @param args['t_format']: Date and time formatting string
    @type args['t_format']: str
    """
    # SQL to determine all simple searches:
    sql = "SELECT date FROM query INNER JOIN user_query ON id=id_query " + \
          "WHERE urlargs LIKE '%p=%' ORDER BY date DESC"
    simple = [x[0] for x in run_sql(sql)]

    # SQL to determine all advanced searches:
    sql = "SELECT date FROM query INNER JOIN user_query ON id=id_query " + \
          "WHERE urlargs LIKE '%as=1%' ORDER BY date DESC"
    advanced = [x[0] for x in run_sql(sql)]

    # Compute the trend for both types
    s_trend = _get_trend_from_actions(simple, 0, args['t_start'],
                                      args['t_end'], args['granularity'], args['t_format'])
    a_trend = _get_trend_from_actions(advanced, 0, args['t_start'],
                                      args['t_end'], args['granularity'], args['t_format'])

    # Assemble, according to return type
    return [(s_trend[i][0], (s_trend[i][1], a_trend[i][1]))
            for i in range(len(s_trend))]

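# The assembled return value zips both trends into one list; the second
# element of each tuple holds the (simple, advanced) counts for that span,
# e.g. (illustrative values only):
#
#   [('2008-01-01 00:00:00', (12, 3)),
#    ('2008-01-02 00:00:00', (17, 5)),
#    ...]
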
def get_keyevent_trend_download_frequency(args):
    """
    Returns the number of full text downloads carried out
    during the given timestamp range.

    @param args['t_start']: Date and time of start point
    @type args['t_start']: str
    @param args['t_end']: Date and time of end point
    @type args['t_end']: str
    @param args['granularity']: Granularity of date and time
    @type args['granularity']: str
    @param args['t_format']: Date and time formatting string
    @type args['t_format']: str
    """
    sql = "SELECT download_time FROM rnkDOWNLOADS ORDER BY download_time DESC"
    actions = [x[0] for x in run_sql(sql)]
    return _get_trend_from_actions(actions, 0, args['t_start'], args['t_end'],
                                   args['granularity'], args['t_format'])

# KEY EVENT SNAPSHOT SECTION
def get_keyevent_snapshot_uptime_cmd():
    """
    A specific implementation of get_current_event().

    @return: The std-out from the UNIX command 'uptime'.
    @type: str
    """
    # Collapse runs of double spaces in the uptime output to single spaces
    return _run_cmd('uptime').strip().replace('  ', ' ')

def get_keyevent_snapshot_apache_processes():
    """
    A specific implementation of get_current_event().

    @return: The number of Apache processes (root + children), as counted
             by the UNIX commands 'ps', 'grep' and 'wc'.
    @type: str
    """
    # The number of Apache processes (root+children)
    return _run_cmd('ps -e | grep apache2 | grep -v grep | wc -l')

def get_keyevent_snapshot_bibsched_status():
    """
    A specific implementation of get_current_event().

    @return: Information about the number of tasks in the different status modes.
    @type: [(str, int)]
    """
    sql = "SELECT status, COUNT(status) FROM schTASK GROUP BY status"
    return [(x[0], int(x[1])) for x in run_sql(sql)]

def get_keyevent_snapshot_sessions():
    """
    A specific implementation of get_current_event().

    @return: The current number of website visitors (guests, logged in)
    @type: (int, int)
    """
    # SQL to retrieve the active guest sessions
    sql = "SELECT COUNT(session_expiry) FROM session INNER JOIN user ON uid=id " + \
          "WHERE email = '' AND " + \
          "session_expiry-%d < unix_timestamp() AND " % WEBSTAT_SESSION_LENGTH + \
          "unix_timestamp() < session_expiry"
    guests = run_sql(sql)[0][0]

    # SQL to retrieve the active logged-in sessions
    sql = "SELECT COUNT(session_expiry) FROM session INNER JOIN user ON uid=id " + \
          "WHERE email <> '' AND " + \
          "session_expiry-%d < unix_timestamp() AND " % WEBSTAT_SESSION_LENGTH + \
          "unix_timestamp() < session_expiry"
    logged_ins = run_sql(sql)[0][0]

    # Assemble, according to return type
    return (guests, logged_ins)

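# Usage sketch (values illustrative only): the two counters can be fed
# straight into a status line, e.g.:
#
#   guests, logged_ins = get_keyevent_snapshot_sessions()
#   print "Current visitors: %d guests, %d logged in" % (guests, logged_ins)
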
# CUSTOM EVENT SECTION
def get_customevent_trend(args):
    """
    Returns trend data for a custom event over a given
    timestamp range.

    @param args['id']: The event id
    @type args['id']: str
    @param args['t_start']: Date and time of start point
    @type args['t_start']: str
    @param args['t_end']: Date and time of end point
    @type args['t_end']: str
    @param args['granularity']: Granularity of date and time
    @type args['granularity']: str
    @param args['t_format']: Date and time formatting string
    @type args['t_format']: str
    """
    sql = "SELECT creation_time FROM %s ORDER BY creation_time DESC" \
          % get_customevent_table(args['id'])
    dates = [x[0] for x in run_sql(sql)]
    return _get_trend_from_actions(dates, 0, args['t_start'], args['t_end'],
                                   args['granularity'], args['t_format'])

def get_customevent_dump(args):
    """
    Similar to a get_event_trend implementation, but no refining (i.e.
    frequency handling) is carried out whatsoever. This is just a dump.

    @param args['ids']: The event ids
    @type args['ids']: [str]
    @param args['t_start']: Date and time of start point
    @type args['t_start']: str
    @param args['t_end']: Date and time of end point
    @type args['t_end']: str
    @param args['granularity']: Granularity of date and time
    @type args['granularity']: str
    @param args['t_format']: Date and time formatting string
    @type args['t_format']: str
    """
    # Mapping of event ids to column names
    event_cols = {}
    run_sql("CREATE TEMPORARY TABLE staTEMP " +
            "(event VARCHAR(255), creation_time TIMESTAMP, arguments VARCHAR(255)) " +
            "SELECT '%s' event, creation_time, arguments FROM %s"
            % (args['ids'][0], get_customevent_table(args['ids'][0])))
    try:
        event_cols[args['ids'][0]] = cPickle.loads(run_sql(
            "SELECT cols FROM staEVENT WHERE id = '%s'" % args['ids'][0])[0][0])
    except TypeError:
        event_cols[args['ids'][0]] = ["Unnamed"]
    for id in args['ids'][1:]:
        tbl_name = get_customevent_table(id)
        run_sql("INSERT INTO staTEMP SELECT '%s', creation_time, arguments FROM %s"
                % (id, tbl_name))
        try:
            event_cols[id] = cPickle.loads(run_sql(
                "SELECT cols FROM staEVENT WHERE id = '%s'" % id)[0][0])
        except TypeError:
            event_cols[id] = ["Unnamed"]

    # Get a MySQL-friendly date range
    lower = _to_datetime(args['t_start'], args['t_format']).isoformat()
    upper = _to_datetime(args['t_end'], args['t_format']).isoformat()

    sql = "SELECT * FROM staTEMP WHERE creation_time > '%s' " % lower + \
          "AND creation_time < '%s' ORDER BY creation_time DESC" % upper
    output = []
    for row in run_sql(sql):
        temp = [row[0], row[1].strftime('%Y-%m-%d %H:%M:%S')]
        if row[2] is not None:
            arguments = cPickle.loads(row[2])
        else:
            arguments = [None]
        arguments = ["%s: %s" % (event_cols[row[0]][i], arguments[i])
                     for i in range(len(arguments))]
        temp.extend(arguments)
        output.append(tuple(temp))
    return output

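# Each returned row is a flat tuple: the event id, the formatted timestamp,
# and one "column: value" string per stored argument, e.g. (hypothetical
# event id and values):
#
#   ('baskets', '2008-01-02 13:37:00', 'action: display', 'basket: 42')
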
def get_customevent_table(id):
    """
    Helper function that, for a certain event id, retrieves the corresponding
    event table name.
    """
    res = run_sql("SELECT CONCAT('staEVENT', number) FROM staEVENT WHERE id = '%s'" % id)
    try:
        return res[0][0]
    except IndexError:
        # No such event table
        return None

def get_customevent_args(id):
    """
    Helper function that, for a certain event id, retrieves the corresponding
    event argument (column) names.
    """
    res = run_sql("SELECT arguments FROM staEVENT WHERE id = '%s'" % id)
    try:
        return cPickle.loads(res[0][0])
    except IndexError:
        # No such event
        return None

# GRAPHER
def create_graph_trend(trend, path, settings):
    """
    Creates a graph representation out of data produced from get_event_trend.

    @param trend: The trend data
    @type trend: [(str, str|int|(str|int,...))]
    @param path: Where to store the graph
    @type path: str
    @param settings: Dictionary of graph parameters
    @type settings: dict
    """
    # If no input, we don't bother about anything
    if len(trend) == 0:
        return

    # If no filename is given, we'll assume STD-out format and ASCII.
    if path == '':
        settings["format"] = 'asciiart'

    if settings["format"] == 'asciiart':
        out = ""

        if settings["multiple"] is not None:
            # Tokens will represent the different data sets (maximum 16 sets);
            # set the index (the 100% mark) to the biggest of the histogram sums
            index = max([sum(x[1]) for x in trend])

            # Print legend box
            out += "Legend: %s\n\n" % ", ".join(["%s (%s)" % x for x in
                   zip(settings["multiple"], WEBSTAT_GRAPH_TOKENS)])
        else:
            index = max([x[1] for x in trend])

        width = 82

        # Figure out the max length of the xtics, in order to left align
        xtic_max_len = max([len(_to_datetime(x[0]).strftime(settings["xtic_format"])) for x in trend])

        for row in trend:
            # Print the xtic
            xtic = _to_datetime(row[0]).strftime(settings["xtic_format"])
            out_row = xtic + ': ' + ' '*(xtic_max_len-len(xtic)) + '|'

            try:
                col_width = (1.0*width/index)
            except ZeroDivisionError:
                col_width = 0

            if settings["multiple"] is not None:
                # The second value of the row-tuple holds the n values from the
                # n data sets. Each set is drawn with a different ASCII
                # character, chosen from the string WEBSTAT_GRAPH_TOKENS.
                # NOTE: Only up to len(WEBSTAT_GRAPH_TOKENS), i.e. 16, data
                # sets are supported.
                total = sum(row[1])

                for i in range(len(row[1])):
                    col = row[1][i]
                    try:
                        out_row += WEBSTAT_GRAPH_TOKENS[i]*int(1.0*col*col_width)
                    except ZeroDivisionError:
                        break

                if len([i for i in row[1] if type(i) is int and i > 0]) - 1 > 0:
                    out_row += out_row[-1]
            else:
                total = row[1]
                try:
                    out_row += '-'*int(1.0*total*col_width)
                except ZeroDivisionError:
                    break

            # Print sentinel, and the total
            out += out_row + '>' + ' '*(xtic_max_len+4+width-len(out_row)) + str(total) + '\n'

        # Write to destination file
        if path == '':
            print out
        else:
            open(path, 'w').write(out)

    elif settings["format"] == 'gnuplot':
        import Gnuplot
        g = Gnuplot.Gnuplot()

        g('set style data linespoints')
        g('set terminal png small')
        g('set output "%s"' % path)

        if settings["title"] != '':
            g.title(settings["title"])
        if settings["xlabel"] != '':
            g.xlabel(settings["xlabel"])
        if settings["ylabel"] != '':
            g.ylabel(settings["ylabel"])

        if settings["xtic_format"] != '':
            xtics = 'set xtics ('
            xtics += ', '.join(['"%s" %d' %
                     (_to_datetime(trend[i][0], '%Y-%m-%d %H:%M:%S')
                      .strftime(settings["xtic_format"]), i)
                     for i in range(len(trend))]) + ')'
            g(xtics)

        # If we have multiple data sets, we need to do some magic to make
        # Gnuplot eat them: basically a matrix transposition, plus the
        # addition of index numbers.
        if settings["multiple"] is not None:
            cols = len(trend[0][1])
            rows = len(trend)
            plot_items = []
            for col in range(cols):
                data = []
                for row in range(rows):
                    data.append([row, trend[row][1][col]])
                plot_items.append(Gnuplot.PlotItems.Data(data,
                                  title=settings["multiple"][col]))
            g.plot(*plot_items)
        else:
            g.plot([x[1] for x in trend])

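# A minimal sketch of the 'settings' dictionary expected by create_graph_trend,
# based on the keys the function reads above (all values illustrative only):
#
#   settings = {'format': 'asciiart',       # or 'gnuplot'
#               'multiple': None,           # or a list of data set labels
#               'title': 'Collection growth',
#               'xlabel': 'Date',
#               'ylabel': 'Documents',
#               'xtic_format': '%Y-%m-%d'}
#   create_graph_trend(trend, '', settings) # empty path -> ASCII art on stdout
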
def create_graph_dump(dump, path, settings):
    """
    Creates a text-table representation out of data produced from
    get_customevent_dump.

    @param dump: The dump data
    @type dump: [(str|int,...)]
    @param path: Where to store the graph
    @type path: str
    @param settings: Dictionary of graph parameters
    @type settings: dict
    """
    out = ""

    if len(dump) == 0:
        out += "No actions for this custom event are registered in the given time range."
    else:
        # Make every row in dump equally long, insert None if appropriate.
        max_len = max([len(x) for x in dump])
        events = [tuple(list(x) + [None]*(max_len-len(x))) for x in dump]

        cols = ["Event", "Date and time"] + ["Argument %d" % i for i in range(max_len-2)]

        # Column widths, counting the header row in as well (note that 'cols'
        # is appended as one extra row, not concatenated element-wise)
        column_widths = [max([len(str(x[i])) for x in events + [cols]])+3
                         for i in range(len(events[0]))]

        for i in range(len(cols)):
            out += cols[i] + ' '*(column_widths[i] - len(cols[i]))
        out += "\n"
        for i in range(len(cols)):
            out += '='*(len(cols[i])) + ' '*(column_widths[i] - len(cols[i]))
        out += "\n\n"

        for action in dump:
            for i in range(len(action)):
                if action[i] is None:
                    temp = ''
                else:
                    temp = action[i]
                out += str(temp) + ' '*(column_widths[i] - len(str(temp)))
            out += "\n"

    # Write to destination file
    if path == '':
        print out
    else:
        open(path, 'w').write(out)

# EXPORTER
def export_to_python(data, req):
    """
    Exports the data to Python code.

    @param data: The Python data that should be exported
    @type data: []
    @param req: The Apache request object
    @type req:
    """
    _export("text/x-python", str(data), req)

def export_to_csv(data, req):
    """
    Exports the data to CSV.

    @param data: The Python data that should be exported
    @type data: []
    @param req: The Apache request object
    @type req:
    """
    csv_list = [""""%s",%s""" % (x[0], ",".join([str(y) for y in
                ((type(x[1]) is tuple) and x[1] or (x[1],))])) for x in data]
    _export('text/csv', '\n'.join(csv_list), req)

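# Each trend tuple becomes one CSV line; a tuple-valued data point expands
# into one column per data set, e.g. (illustrative values only):
#
#   ('2008-01-01 00:00:00', (12, 3))  ->  "2008-01-01 00:00:00",12,3
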
# INTERNAL
def _export(mime, content, req):
    """
    Helper function to pass on the export call. Creates a temporary file in
    which the content is stored, then redirects to the export web interface.
    """
    filename = tmpdir + "/webstat_export_" + str(time.time()).replace('.', '')
    open(filename, 'w').write(content)
    redirect_to_url(req, '%s/stats/export?filename=%s&mime=%s'
                    % (weburl, os.path.basename(filename), mime))

def _get_trend_from_actions(action_dates, initial_value, t_start, t_end, granularity, format):
    """
    Given a list of dates reflecting some sort of action/event, and some
    additional parameters, an internal data format is returned. Setting
    'initial_value' to zero means that the frequency is not accumulated over
    time; each span simply counts its own actions.

    @param action_dates: A list of dates, indicating some sort of action/event.
    @type action_dates: [datetime.datetime]
    @param initial_value: The numerical offset the first action's value should make use of.
    @type initial_value: int
    @param t_start: Start time for the time domain in format %Y-%m-%d %H:%M:%S
    @type t_start: str
    @param t_end: End time for the time domain in format %Y-%m-%d %H:%M:%S
    @type t_end: str
    @param granularity: The granularity of the time domain, i.e. the span between values.
                        Possible values are [year,month,day,hour,minute,second].
    @type granularity: str
    @param format: Format of the 't_start' and 't_end' parameters
    @type format: str
    @return: A list of tuples zipping a time-domain and a value-domain
    @type: [(str, int)]
    """
    # Append the maximum date as a sentinel indicating we're done
    action_dates.insert(0, datetime.datetime.max)

    # Create an iterator running from the first day of activity
    dt_iter = _get_datetime_iter(t_start, granularity, format)

    # Construct the datetime tuple for the stop time
    stop_at = _to_datetime(t_end, format) - datetime.timedelta(seconds=1)

    # If our t_start is more recent than the oldest action_dates, we need to
    # drop those. (The list is sorted newest first, so the oldest sit at the end.)
    t_start_dt = _to_datetime(t_start, format)
    while action_dates[-1] < t_start_dt:
        action_dates = action_dates[:-1]

    vector = [(None, initial_value)]
    old = dt_iter.next()
    upcoming_action = action_dates.pop()

    for current in dt_iter:
        # Counter of action_dates in the current span; set the initial value
        # to zero to avoid accumulation.
        if initial_value != 0:
            actions_here = vector[-1][1]
        else:
            actions_here = 0

        # Check to see if there's an action date in the current span
        while old <= upcoming_action < current:
            actions_here += 1
            try:
                upcoming_action = action_dates.pop()
            except IndexError:
                upcoming_action = datetime.datetime.max

        vector.append((old.strftime('%Y-%m-%d %H:%M:%S'), actions_here))
        old = current

        # Make sure to stop the iteration at the end time
        if current > stop_at:
            break

    # Remove the first bogus tuple, and return
    return vector[1:]

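# A worked example (hypothetical dates): three actions falling into two daily
# spans, with no accumulation (initial_value=0). Note the newest-first order,
# matching the ORDER BY ... DESC queries used by the callers above:
#
#   dates = [datetime.datetime(2008, 1, 2, 12, 0, 0),
#            datetime.datetime(2008, 1, 1, 15, 0, 0),
#            datetime.datetime(2008, 1, 1, 9, 0, 0)]
#   _get_trend_from_actions(dates, 0, '2008-01-01 00:00:00',
#                           '2008-01-03 00:00:00', 'day', '%Y-%m-%d %H:%M:%S')
#   # -> [('2008-01-01 00:00:00', 2), ('2008-01-02 00:00:00', 1)]
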
def _get_datetime_iter(t_start, granularity='day', format='%Y-%m-%d %H:%M:%S'):
    """
    Returns an iterator over datetime elements starting at an arbitrary time,
    with granularity of a [year,month,day,hour,minute,second].

    @param t_start: An arbitrary starting time in format %Y-%m-%d %H:%M:%S
    @type t_start: str
    @param granularity: The span between iterable elements, default is 'day'.
                        Possible values are [year,month,day,hour,minute,second].
    @type granularity: str
    @param format: Format of the 't_start' parameter
    @type format: str
    @return: An iterator of points in time
    @type: iterator over datetime elements
    """
    t = _to_datetime(t_start, format)

    # Make a time increment depending on the granularity and the current time
    # (the length of years and months varies over time)
    while True:
        yield t

        if granularity == "year":
            span = datetime.timedelta(days=(calendar.isleap(t.year) and 366 or 365))
        elif granularity == "month":
            span = datetime.timedelta(days=calendar.monthrange(t.year, t.month)[1])
        elif granularity == "day":
            span = datetime.timedelta(days=1)
        elif granularity == "hour":
            span = datetime.timedelta(hours=1)
        elif granularity == "minute":
            span = datetime.timedelta(minutes=1)
        elif granularity == "second":
            span = datetime.timedelta(seconds=1)
        else:
            # Default, just in case
            span = datetime.timedelta(days=1)

        t += span

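# Usage sketch: take the first three month starts from a given date (the
# start date is an arbitrary example):
#
#   it = _get_datetime_iter('2008-01-01 00:00:00', granularity='month')
#   [it.next() for i in range(3)]
#   # -> [datetime(2008, 1, 1, 0, 0), datetime(2008, 2, 1, 0, 0),
#   #     datetime(2008, 3, 1, 0, 0)]
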
def _to_datetime(dt, format='%Y-%m-%d %H:%M:%S'):
    """Converts a date/time string in the given format to a datetime object."""
    return datetime.datetime(*time.strptime(dt, format)[:6])

def _run_cmd(command):
    """
    Runs a certain command and returns the string output. If the command is
    not found, a string saying so will be returned. Use with caution!

    @param command: The UNIX command to execute.
    @type command: str
    @return: The std-out from the command.
    @type: str
    """
    return commands.getoutput(command)
