Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F91126306
webstatlib.lisp
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Fri, Nov 8, 04:32
Size
43 KB
Mime Type
text/x-lisp
Expires
Sun, Nov 10, 04:32 (2 d)
Engine
blob
Format
Raw Data
Handle
22201611
Attached To
R3600 invenio-infoscience
webstatlib.lisp
View Options
;;; $Id$
;;;
;;; webstatlib.lisp -- library with httpd log file analyzer to gather
;;; CDS usage stats. Another functionality is to parse old Apache log
;;; files and prepare Detailed record page views statistics for
;;; rnkPAGEVIEWS table.
;;;
;;; This file is part of CDS Invenio.
;;; Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007 CERN.
;;;
;;; CDS Invenio is free software; you can redistribute it and/or
;;; modify it under the terms of the GNU General Public License as
;;; published by the Free Software Foundation; either version 2 of the
;;; License, or (at your option) any later version.
;;;
;;; CDS Invenio is distributed in the hope that it will be useful, but
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;;; General Public License for more details.
;;;
;;; You should have received a copy of the GNU General Public License
;;; along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
;;; 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
;;
;; Auxiliary functions come first.
#-
gcl
(
in-package
#:cl-user
)
(
defpackage
#:webstatlib
#-
gcl
(
:use
#:cl
)
(
:export
#:print-general-stats
#:print-search-interface-stats
#:print-search-collections-stats
#:print-search-pattern-stats
#:print-basket-stats
#:print-alert-stats
#:print-download-stats
))
#-
gcl
(
proclaim
'
(
optimize
(
speed
3
)
(
safety
1
)
(
debug
0
)
(
compilation-speed
0
)
(
space
0
)))
(
defun
parse-integer-or-zero
(
string
)
"Parse integer from STRING and return either the integer, or zero."
(
let
((
out
0
))
(
if
(
string-not-equal
string
"-"
)
(
setf
out
(
parse-integer
string
)))
out
))
(
defun
hash-table-keys
(
hashtable
&optional
(
sort-by-value
nil
))
"Return list of HASHTABLE keys. Optionally, sort the key list by
hash value."
(
let
((
keys
'
()))
(
maphash
(
lambda
(
key
val
)
(
declare
(
ignore
val
))
(
push
key
keys
))
hashtable
)
(
if
sort-by-value
(
sort
keys
#'
(
lambda
(
x
y
)
(
let
((
x-val
(
gethash
x
hashtable
))
(
y-val
(
gethash
y
hashtable
)))
(
if
(
equalp
x-val
y-val
)
(
string<
x
y
)
; when values are equal, test by keys
(
>
x-val
y-val
)))))
keys
)))
(
defun
hash-table-values
(
hashtable
)
"Return list of HASHTABLE values."
(
let
((
vals
'
()))
(
maphash
(
lambda
(
key
val
)
(
declare
(
ignore
key
))
(
push
val
vals
))
hashtable
)
vals
))
(
defun
hash-table-key-val-stats
(
hashtable
predicate
&optional
(
apply-predicate-to-vals
nil
))
"Return the number of keys in HASHTABLE that satisfy the PREDICATE.
Another returned value is the sum of values for these keys. Suitable
when values are integers, as in histograms."
(
declare
(
type
function
predicate
))
(
let
((
number-of-keys
0
)
(
sum-of-values
0
))
(
declare
(
type
fixnum
number-of-keys
sum-of-values
))
(
maphash
(
lambda
(
key
val
)
(
if
apply-predicate-to-vals
(
when
(
funcall
predicate
val
)
(
incf
number-of-keys
)
(
incf
sum-of-values
val
))
(
when
(
funcall
predicate
key
)
(
incf
number-of-keys
)
(
incf
sum-of-values
val
))))
hashtable
)
(
values
number-of-keys
sum-of-values
)))
(
defun
split-string-by-one-space
(
string
)
"Return list of substrings of STRING divided by *one* space each.
Two consecutive spaces will be seen as if there were an empty string
between them. Taken from Common Lisp Cookbook."
(
declare
(
type
simple-base-string
string
))
(
loop
for
i
=
0
then
(
1+
j
)
as
j
=
(
position
#\Space
string
:start
i
)
collect
(
subseq
string
i
j
)
while
j
))
(
defun
string-substitute
(
string
substring
replacement-string
)
"Taken from c.l.l."
(
declare
(
type
simple-base-string
string
substring
replacement-string
))
(
let
((
substring-length
(
length
substring
))
(
last-end
0
)
(
new-string
""
))
(
declare
(
type
simple-base-string
new-string
)
(
type
integer
substring-length
last-end
))
(
do
((
next-start
(
search
substring
string
)
(
search
substring
string
:start2
last-end
)))
((
null
next-start
)
(
concatenate
'string
new-string
(
subseq
string
last-end
)))
(
declare
(
type
(
or
integer
null
)
next-start
))
(
setq
new-string
(
concatenate
'string
new-string
(
subseq
string
last-end
next-start
)
replacement-string
))
(
setq
last-end
(
+
next-start
substring-length
)))))
(
defun
string-decode-url
(
string
)
"Return string where URL is unquoted, that is %22 is substituted by
a double quote, etc."
(
declare
(
type
simple-base-string
string
))
(
setf
string
(
string-substitute
string
"%20"
" "
))
(
setf
string
(
string-substitute
string
"+"
" "
))
(
setf
string
(
string-substitute
string
"%2B"
"+"
))
(
setf
string
(
string-substitute
string
"%22"
"\""
))
(
setf
string
(
string-substitute
string
"%23"
"#"
))
(
setf
string
(
string-substitute
string
"%25"
"%"
))
(
setf
string
(
string-substitute
string
"%26"
"&"
))
(
setf
string
(
string-substitute
string
"%27"
"'"
))
(
setf
string
(
string-substitute
string
"%28"
"("
))
(
setf
string
(
string-substitute
string
"%29"
")"
))
(
setf
string
(
string-substitute
string
"%2A"
"*"
))
(
setf
string
(
string-substitute
string
"%2C"
","
))
(
setf
string
(
string-substitute
string
"%2F"
"/"
))
(
setf
string
(
string-substitute
string
"%3A"
":"
))
(
setf
string
(
string-substitute
string
"%3D"
"="
))
(
setf
string
(
string-substitute
string
"%3E"
">"
))
(
setf
string
(
string-substitute
string
"%3F"
"?"
))
(
setf
string
(
string-substitute
string
"%5B"
"["
))
(
setf
string
(
string-substitute
string
"%5C"
"\\"
))
(
setf
string
(
string-substitute
string
"%5D"
"]"
)))
(
defun
get-month-number-from-month-name
(
month-name
)
"Return month number for MONTH-NAME."
(
declare
(
type
simple-base-string
month-name
))
(
cond
((
string=
month-name
"Jan"
)
1
)
((
string=
month-name
"Feb"
)
2
)
((
string=
month-name
"Mar"
)
3
)
((
string=
month-name
"Apr"
)
4
)
((
string=
month-name
"May"
)
5
)
((
string=
month-name
"Jun"
)
6
)
((
string=
month-name
"Jul"
)
7
)
((
string=
month-name
"Aug"
)
8
)
((
string=
month-name
"Sep"
)
9
)
((
string=
month-name
"Oct"
)
10
)
((
string=
month-name
"Nov"
)
11
)
((
string=
month-name
"Dec"
)
12
)
(
t
0
)))
(
defun
get-urlarg-value-from-url-string
(
url-string
url-arg
&optional
(
return-empty-value
nil
))
"Return list of values of URL-ARG in the URL-STRING. For example,
if URL-STRING contains 'search?p=ellis&f=title', and URL-ARG is
'f', then return list of 'title'. If URL-ARG is not present in
URL-STRING, return list of empty string."
(
declare
(
type
simple-base-string
url-string
url-arg
))
(
let
((
out
nil
)
(
position-pattern-beg
nil
)
(
position-pattern-end
nil
))
(
setf
position-pattern-beg
(
search
(
concatenate
'string
url-arg
"="
)
url-string
))
(
if
(
not
position-pattern-beg
)
(
progn
(
when
return-empty-value
(
push
""
out
)))
(
progn
(
setf
position-pattern-end
(
search
"&"
url-string
:start2
position-pattern-beg
))
(
if
(
null
position-pattern-end
)
(
setf
position-pattern-end
(
length
url-string
)))
(
push
(
string-decode-url
(
subseq
url-string
(
+
(
length
url-arg
)
1
position-pattern-beg
)
position-pattern-end
))
out
)))
(
if
return-empty-value
out
(
remove-if
#'
(
lambda
(
x
)
(
declare
(
type
simple-base-string
x
))
(
string=
""
x
))
out
))))
(
defun
get-datetime-from-httpd-log-datetime-string
(
httpd-log-datetime-string
)
"Return request date in numerical format YYYYMMDD from HTTPD-LOG-DATETIME-STRING."
(
declare
(
type
simple-base-string
httpd-log-datetime-string
))
(
let
((
yyyy
(
subseq
httpd-log-datetime-string
7
11
))
(
mm-number
(
get-month-number-from-month-name
(
subseq
httpd-log-datetime-string
3
6
)))
(
dd
(
subseq
httpd-log-datetime-string
0
2
)))
(
declare
(
type
fixnum
mm-number
))
(
+
(
*
100
100
(
parse-integer
yyyy
))
(
*
100
mm-number
)
(
parse-integer
dd
))))
(
defun
get-datetimee-from-httpd-log-datetime-string
(
httpd-log-datetime-string
)
"Return request date in the format YYYY-MM-DD HH:MM:SS from HTTPD-LOG-DATETIME-STRING."
(
declare
(
type
simple-base-string
httpd-log-datetime-string
))
(
let
((
yyyy
(
subseq
httpd-log-datetime-string
7
11
))
(
mm
(
get-month-number-from-month-name
(
subseq
httpd-log-datetime-string
3
6
)))
(
dd
(
subseq
httpd-log-datetime-string
0
2
))
(
hms
(
subseq
httpd-log-datetime-string
12
20
)))
(
format
nil
"~A-~A-~A ~A"
yyyy
mm
dd
hms
)))
(
defun
get-current-collection-from-httpd-log-request-string
(
httpd-log-request-string
)
"Return current collection found in HTTPD-LOG-REQUEST-STRING."
(
declare
(
special
*home-collection*
))
(
let
((
out
nil
)
(
url-string
(
subseq
httpd-log-request-string
(
position
#\space
httpd-log-request-string
)
(
position
#\space
httpd-log-request-string
:from-end
t
))))
(
if
(
search
"?cc="
url-string
)
(
setf
out
(
get-urlarg-value-from-url-string
url-string
"cc"
t
))
(
if
(
search
"?c="
url-string
)
(
setf
out
(
get-urlarg-value-from-url-string
url-string
"c"
t
))
(
setf
out
(
list
*home-collection*
))))
(
if
(
string=
(
car
out
)
""
)
(
setf
out
(
list
*home-collection*
)))
out
))
(
defun
get-search-patterns-from-httpd-log-request-string
(
httpd-log-request-string
)
"Return list of search pattern strings (p=,p1=,p2=,p3=) found in
HTTPD-LOG-REQUEST-STRING. As a side effect, increment counters for
*NUMBER-OF-SIMPLE-SEARCHES* and *NUMBER-OF-DETAILED-RECORD-PAGES* when
a simple search or a detailed record page is selected."
(
declare
(
type
simple-base-string
httpd-log-request-string
)
(
special
*search-engine-url*
*search-engine-url-old-style*
*detailed-record-url*
*number-of-simple-searches*
*number-of-advanced-searches*
*number-of-detailed-record-pages*
))
(
let
((
out
nil
)
(
url-string
(
subseq
httpd-log-request-string
(
position
#\space
httpd-log-request-string
)
(
position
#\space
httpd-log-request-string
:from-end
t
))))
;; url-string now contains URL without leading GET and trailing HTTP/1.1 strings
(
when
(
or
(
search
*search-engine-url*
url-string
)
(
search
*search-engine-url-old-style*
url-string
)
(
search
*detailed-record-url*
url-string
))
(
if
(
or
(
search
*detailed-record-url*
url-string
)
(
search
"?id="
url-string
)
(
search
"?recid="
url-string
)
(
search
"?sysno="
url-string
)
(
search
"?sysnb="
url-string
))
;; detailed record page happened:
(
incf
*number-of-detailed-record-pages*
)
(
if
(
search
"p1="
httpd-log-request-string
)
;; advanced search happened:
(
progn
(
incf
*number-of-advanced-searches*
)
(
setf
out
(
append
(
get-urlarg-value-from-url-string
url-string
"p1"
)
(
get-urlarg-value-from-url-string
url-string
"p2"
)
(
get-urlarg-value-from-url-string
url-string
"p3"
))))
;; simple search happened:
(
progn
(
incf
*number-of-simple-searches*
)
(
setf
out
(
get-urlarg-value-from-url-string
url-string
"p"
t
))))))
out
))
(
defun
wash-httpd-log-line
(
httpd-log-line
)
"Fix eventual problems in HTTPD-LOG-LINE, such as backslash-quote
that some browsers seem to produce. To be called for the input to
wash it out."
(
declare
(
type
simple-base-string
httpd-log-line
))
(
string-substitute
httpd-log-line
"\\\""
"%22"
))
;; HTTPD-LOG-ENTRY stucture that will represent Apache log hits, and
;; functions to create it.
(
defstruct
httpd-log-entry
"Structure representing Apache combined log entry."
(
ip
""
:type
simple-base-string
)
(
datetime
0
:type
integer
)
; YYYYMMDD (FIXME: get rid of this)
(
datetimee
""
:type
simple-base-string
)
; YYYY-MM-DD HH:MM:SS
(
request
""
:type
simple-base-string
)
(
status
0
:type
integer
)
(
bytes
0
:type
integer
)
(
referer
""
:type
simple-base-string
)
(
browser
""
:type
simple-base-string
))
(
defun
parse-httpd-log-line-conses-a-lot
(
line
)
"Return list of elements read from Apache combined log LINE. The
elements in line are text strings delimited either by spaces or by
double quotes or brackets. (Spaces within double quotes or brackets
do not take their separator effect. Double quote preceded by a
backslash do not take its effect neither.) WARNING: THIS FUNCTION
CONSES A LOT, A FASTER REWRITE IS AVAILABLE BELOW."
(
declare
(
type
simple-base-string
line
))
(
if
(
string=
line
""
)
nil
(
let
((
current-char
(
char
line
0
))
(
position-term-next
nil
)
(
position-term-offset
0
))
(
declare
(
type
(
or
fixnum
null
)
position-term-next
position-term-offset
)
(
type
character
current-char
))
(
cond
((
char=
current-char
#\"
)
(
setf
position-term-next
(
position
#\"
line
:start
1
)
position-term-offset
1
))
((
char=
current-char
#\[
)
(
setf
position-term-next
(
position
#\]
line
:start
1
)
position-term-offset
1
))
(
t
(
setf
position-term-next
(
position
#\Space
line
:start
1
))))
(
if
(
or
(
null
position-term-next
)
(
>=
(
+
1
position-term-next
position-term-offset
)
(
length
line
)))
(
list
(
subseq
line
position-term-offset
))
(
append
(
list
(
subseq
line
position-term-offset
position-term-next
))
(
parse-httpd-log-line
(
subseq
line
(
+
1
position-term-next
position-term-offset
))))))))
(
defun
parse-httpd-log-line
(
line
)
"Return list of elements read from Apache combined log LINE. The
elements in line are text strings delimited either by spaces or by
double quotes or brackets. (Spaces within double quotes or brackets
do not take their separator effect. Double quote preceded by a
backslash do not take its effect neither.)"
(
declare
(
type
simple-base-string
line
))
(
labels
((
get-next-char-to-search
(
current-char
)
(
declare
(
type
character
current-char
))
(
cond
((
char=
current-char
#\[
)
#\]
)
((
char=
current-char
#\"
)
#\"
)
(
t
#\space
)))
(
get-char-offset
(
current-char
)
(
declare
(
type
character
current-char
))
(
cond
((
char=
current-char
#\[
)
1
)
((
char=
current-char
#\"
)
1
)
(
t
0
))))
(
let*
((
out
nil
)
(
position-max
(
1-
(
length
line
)))
(
position-term
0
)
(
position-term-offset
(
get-char-offset
(
char
line
position-term
)))
(
position-term-next
(
position
(
get-next-char-to-search
(
char
line
position-term
))
line
:start
(
+
position-term-offset
position-term
))))
(
declare
(
type
(
or
integer
null
)
position-term
position-term-next
)
(
type
integer
position-max
position-term-offset
))
(
loop
(
when
(
null
position-term-next
)
(
when
(
<
position-term
position-max
)
(
push
(
subseq
line
(
+
position-term
position-term-offset
))
out
))
(
return
(
nreverse
out
)))
(
push
(
subseq
line
(
+
position-term
position-term-offset
)
position-term-next
)
out
)
(
setf
position-term
(
min
(
+
1
position-term-next
position-term-offset
)
position-max
))
(
setf
position-term-offset
(
get-char-offset
(
char
line
position-term
)))
(
setf
position-term-next
(
position
(
get-next-char-to-search
(
char
line
position-term
))
line
:start
(
+
position-term
position-term-offset
)))))))
(
defun
create-httpd-log-entry
(
httpd-log-line
)
"Create HTTPD-LOG-ENTRY structure from HTTPD-LOG-LINE Apache
combined log line. The line is supposed to be washed already."
(
declare
(
type
simple-base-string
httpd-log-line
))
; firstly parse Apache line according to spaces and quotes into list:
(
let
((
httpd-log-line-list
(
parse-httpd-log-line
httpd-log-line
)))
;; secondly transform into entry:
(
make-httpd-log-entry
:ip
(
nth
0
httpd-log-line-list
)
:request
(
nth
4
httpd-log-line-list
)
:datetime
(
get-datetime-from-httpd-log-datetime-string
(
nth
3
httpd-log-line-list
))
:datetimee
(
get-datetimee-from-httpd-log-datetime-string
(
nth
3
httpd-log-line-list
))
:status
(
parse-integer
(
nth
5
httpd-log-line-list
))
:bytes
(
parse-integer-or-zero
(
nth
6
httpd-log-line-list
))
:referer
(
nth
7
httpd-log-line-list
)
:browser
(
nth
8
httpd-log-line-list
))))
(
defun
create-httpd-log-entries
(
httpd-log-filename
&optional
(
exclude-ip-list
nil
))
"Create list of HTTPD-LOG-ENTRY structures from HTTPD-LOG-FILENAME
Apache combined log file. The log file lines are cleaned to make them
valid, if necessary. When EXCLUDE-IP-LIST is set, then lines coming
from those IPs are excluded."
(
declare
(
type
simple-base-string
httpd-log-filename
)
(
type
list
exclude-ip-list
))
(
let
((
httpd-log-entries
nil
))
(
format
t
"~&** APACHE LOG FILE ANALYSIS"
)
(
format
t
"~&Filename: ~A"
httpd-log-filename
)
(
unless
(
null
exclude-ip-list
)
(
format
t
"~&Excluding search engine hits from ~A."
exclude-ip-list
))
(
with-open-file
(
input-stream
httpd-log-filename
:direction
:input
)
(
do
((
line
(
read-line
input-stream
nil
)
(
read-line
input-stream
nil
)))
((
not
line
))
(
declare
(
type
(
or
simple-base-string
null
)
line
))
(
let
((
httpd-log-entry-item
(
create-httpd-log-entry
(
wash-httpd-log-line
line
))))
(
if
(
or
(
null
exclude-ip-list
)
(
not
(
member
(
httpd-log-entry-ip
httpd-log-entry-item
)
exclude-ip-list
:test
#'
equal
)))
(
push
httpd-log-entry-item
httpd-log-entries
)))))
httpd-log-entries
))
(
defun
filter-httpd-log-entries
(
httpd-all-log-entries
url-substring-pattern
&optional
(
status-code
200
))
"Return only those of HTTPD-ALL-LOG-ENTRIES that satisfy
URL-SUBSTRING-PATTERN and are of status STATUS-CODE."
(
remove-if-not
#'
(
lambda
(
httpd-log-entry-item
)
(
and
(
search
url-substring-pattern
(
httpd-log-entry-request
httpd-log-entry-item
))
(
=
status-code
(
httpd-log-entry-status
httpd-log-entry-item
))))
httpd-all-log-entries
))
;; Functions that analyze the files according to the typical use cases.
(
defun
print-general-stats
(
httpd-all-log-entries
)
"Read HTTPD-LOG-ENTRIES and print some general stats such as
the total number of hits, etc."
(
declare
(
special
*nb-histogram-items-to-print*
))
(
format
t
"~&~%** GENERAL STATS"
)
(
let
((
nb-hits
(
length
httpd-all-log-entries
))
(
datetime-last
(
httpd-log-entry-datetime
(
car
httpd-all-log-entries
)))
(
datetime-first
(
httpd-log-entry-datetime
(
car
(
last
httpd-all-log-entries
))))
(
visitor-histogram
(
make-hash-table
:test
#'
equalp
)))
;; build histogram:
(
dolist
(
httpd-log-entry-item
httpd-all-log-entries
)
(
incf
(
gethash
(
httpd-log-entry-ip
httpd-log-entry-item
)
visitor-histogram
0
)))
;; print summary info:
(
format
t
"~&There were ~D website hits dating from ~A to ~A."
nb-hits
datetime-first
datetime-last
)
(
format
t
"~&This makes an average of ~F website hits per day."
(
/
nb-hits
(
1+
(
-
datetime-last
datetime-first
))))
(
format
t
"~&There were ~D unique visitors in ~D days."
(
length
(
hash-table-keys
visitor-histogram
))
(
1+
(
-
datetime-last
datetime-first
)))
;; print visitors histogram:
(
format
t
"~&~80,1,0,'.<~A ~; ~A~>"
"Visitor"
"no. of hits"
)
(
let
((
visitors-to-print
(
hash-table-keys
visitor-histogram
t
)))
(
if
(
>
(
length
visitors-to-print
)
*nb-histogram-items-to-print*
)
(
setq
visitors-to-print
(
subseq
visitors-to-print
0
*nb-histogram-items-to-print*
)))
(
dolist
(
visitor
visitors-to-print
)
(
format
t
"~&~80,1,0,'.<~A ~; ~A~>"
visitor
(
gethash
visitor
visitor-histogram
))))))
(
defun
print-search-interface-stats
(
httpd-all-log-entries
)
"Read HTTPD-ALL-LOG-ENTRIES, analyze search interface usage and print
interesting statistics."
(
declare
(
special
*search-interface-url*
*home-collection*
*nb-histogram-items-to-print*
))
(
let
((
httpd-log-entries
(
filter-httpd-log-entries
httpd-all-log-entries
*search-interface-url*
))
(
collection-histogram
(
make-hash-table
:test
#'
equalp
)))
;; build histogram:
(
dolist
(
httpd-log-entry
httpd-log-entries
)
(
dolist
(
current-collection
(
get-current-collection-from-httpd-log-request-string
(
httpd-log-entry-request
httpd-log-entry
)))
(
incf
(
gethash
current-collection
collection-histogram
0
))))
;; print summary info:
(
format
t
"~&~%** SEARCH COLLECTIONS USAGE ANALYSIS"
)
(
let
((
number-of-interface-visits
(
reduce
#'
+
(
hash-table-values
collection-histogram
))))
(
format
t
"~&There were ~D visits of search interface pages."
number-of-interface-visits
)
(
format
t
"~&There were ~D visits of non-Home search interface pages. (~4F\%)"
(
-
number-of-interface-visits
(
gethash
*home-collection*
collection-histogram
0
))
(
if
(
zerop
number-of-interface-visits
)
0
(
*
100
(
/
(
gethash
*home-collection*
collection-histogram
0
)
number-of-interface-visits
)))))
;; print histogram:
(
format
t
"~&~80,1,0,'.<~A ~; ~A~>"
"Collection"
"no. of visits"
)
(
let
((
collections-to-print
(
hash-table-keys
collection-histogram
t
)))
(
if
(
>
(
length
collections-to-print
)
*nb-histogram-items-to-print*
)
(
setq
collections-to-print
(
subseq
collections-to-print
0
*nb-histogram-items-to-print*
)))
(
dolist
(
collection
collections-to-print
)
(
format
t
"~&~80,1,0,'.<~A ~; ~A~>"
collection
(
gethash
collection
collection-histogram
))))))
(
defun
print-search-collections-stats
(
httpd-all-log-entries
)
"Read HTTPD-ALL-LOG-ENTRIES, analyze search engine usage and print
interesting statistics with respect to collections."
(
declare
(
special
*search-engine-url*
*search-engine-url-old-style*
*home-collection*
*nb-histogram-items-to-print*
))
(
let
((
httpd-log-entries
(
append
(
filter-httpd-log-entries
httpd-all-log-entries
*search-engine-url*
)
(
filter-httpd-log-entries
httpd-all-log-entries
*search-engine-url-old-style*
)))
(
collection-histogram
(
make-hash-table
:test
#'
equalp
)))
;; build histogram:
(
dolist
(
httpd-log-entry
httpd-log-entries
)
(
if
(
or
(
search
"c="
(
httpd-log-entry-request
httpd-log-entry
))
(
search
"cc="
(
httpd-log-entry-request
httpd-log-entry
)))
(
dolist
(
current-collection
(
get-current-collection-from-httpd-log-request-string
(
httpd-log-entry-request
httpd-log-entry
)))
(
incf
(
gethash
current-collection
collection-histogram
0
)))))
;; print summary info:
(
format
t
"~&~%** SEARCH COLLECTIONS USAGE ANALYSIS"
)
(
let
((
number-of-interface-visits
(
reduce
#'
+
(
hash-table-values
collection-histogram
))))
(
format
t
"~&There were ~D search engine hits."
number-of-interface-visits
)
(
format
t
"~&There were ~D searches originating from non-Home collections. (~4F\%)"
(
-
number-of-interface-visits
(
gethash
*home-collection*
collection-histogram
0
))
(
*
100
(
/
(
gethash
*home-collection*
collection-histogram
0
)
number-of-interface-visits
))))
;; print histogram:
(
format
t
"~&~80,1,0,'.<~A ~; ~A~>"
"Originating collection"
"no. of searches"
)
(
let
((
collections-to-print
(
hash-table-keys
collection-histogram
t
)))
(
if
(
>
(
length
collections-to-print
)
*nb-histogram-items-to-print*
)
(
setq
collections-to-print
(
subseq
collections-to-print
0
*nb-histogram-items-to-print*
)))
(
dolist
(
collection
collections-to-print
)
(
format
t
"~&~80,1,0,'.<~A ~; ~A~>"
collection
(
gethash
collection
collection-histogram
))))))
(
defun
print-search-pattern-stats
(
httpd-all-log-entries
)
"Read HTTPD-ALL-LOG-ENTRIES, analyze search patterns and print
interesting statistics."
(
declare
(
special
*search-engine-url*
*search-engine-url-old-style*
*nb-histogram-items-to-print*
*number-of-simple-searches*
*number-of-advanced-searches*
*number-of-detailed-record-pages*
))
(
setf
*number-of-simple-searches*
0
*number-of-advanced-searches*
0
*number-of-detailed-record-pages*
0
)
(
let
((
httpd-log-entries
(
append
(
filter-httpd-log-entries
httpd-all-log-entries
*search-engine-url*
)
(
filter-httpd-log-entries
httpd-all-log-entries
*search-engine-url-old-style*
)))
(
pattern-histogram
(
make-hash-table
:test
#'
equalp
)))
;; build histogram:
(
dolist
(
httpd-log-entry
httpd-log-entries
)
(
dolist
(
pattern
(
get-search-patterns-from-httpd-log-request-string
(
httpd-log-entry-request
httpd-log-entry
)))
(
incf
(
gethash
pattern
pattern-histogram
0
))))
;; print summary info:
(
format
t
"~&~%** SEARCH ENGINE QUERY PATTERN ANALYSIS"
)
(
if
(
null
httpd-log-entries
)
(
format
t
"~&No valid search engine hits found. Exiting."
)
(
let
((
number-of-search-engine-hits
(
length
httpd-log-entries
))
(
number-of-patterns
(
hash-table-count
pattern-histogram
))
(
day-first
(
httpd-log-entry-datetime
(
nth
(
1-
(
length
httpd-log-entries
))
httpd-log-entries
)))
(
day-last
(
httpd-log-entry-datetime
(
first
httpd-log-entries
))))
(
declare
(
type
integer
number-of-search-engine-hits
number-of-patterns
day-first
day-last
))
;; a - total number of searches and patterns
(
format
t
"~&Found ~D search engine hits."
number-of-search-engine-hits
)
(
format
t
"~&First search engine hit log is dated ~A."
day-first
)
(
format
t
"~&Last search engine hit log is dated ~A."
day-last
)
(
format
t
"~&This makes an average of ~F search engine hits per day."
(
/
number-of-search-engine-hits
(
1+
(
-
day-last
day-first
))))
(
format
t
"~&There were ~D simple searches out of ~D search engine hits. (~4F\%)"
*number-of-simple-searches*
number-of-search-engine-hits
(
*
100
(
/
*number-of-simple-searches*
number-of-search-engine-hits
)))
(
format
t
"~&There were ~D advanced searches out of ~D search engine hits. (~4F\%)"
*number-of-advanced-searches*
number-of-search-engine-hits
(
*
100
(
/
*number-of-advanced-searches*
number-of-search-engine-hits
)))
(
format
t
"~&There were ~D detailed record pages out of ~D search engine hits. (~4F\%)"
*number-of-detailed-record-pages*
number-of-search-engine-hits
(
*
100
(
/
*number-of-detailed-record-pages*
number-of-search-engine-hits
)))
(
format
t
"~&There are ~D different query patterns for ~D search engine hits. (~4F\%)"
number-of-patterns
number-of-search-engine-hits
(
*
100
(
/
number-of-patterns
number-of-search-engine-hits
)))
;; b - empty searches
(
format
t
"~&There are ~D empty query pattern searches out of ~D search engine hits. (~4F\%)"
(
gethash
""
pattern-histogram
0
)
number-of-search-engine-hits
(
*
100
(
/
(
gethash
""
pattern-histogram
0
)
number-of-search-engine-hits
)))
;; c - phrase searches and patterns
(
multiple-value-bind
(
number-of-phrase-patterns
number-of-phrase-searches
)
(
hash-table-key-val-stats
pattern-histogram
#'
(
lambda
(
x
)
(
declare
(
type
simple-base-string
x
))
(
position
#\"
x
)))
(
format
t
"~&There are ~D phrase searches out of ~D search engine hits. (~4F\%)"
number-of-phrase-searches
number-of-search-engine-hits
(
*
100
(
/
number-of-phrase-searches
number-of-search-engine-hits
)))
(
format
t
"~&There are ~D phrase query patterns out of ~D query patterns. (~4F\%)"
number-of-phrase-patterns
number-of-patterns
(
*
100
(
/
number-of-phrase-patterns
number-of-patterns
))))
;; d - onetime searches and patterns
(
multiple-value-bind
(
number-of-onetime-patterns
number-of-onetime-searches
)
(
hash-table-key-val-stats
pattern-histogram
#'
(
lambda
(
x
)
(
declare
(
integer
x
))
(
=
x
1
))
t
)
(
format
t
"~&There are ~D one-time event searches out of ~D search engine hits. (~4F\%)"
number-of-onetime-searches
number-of-search-engine-hits
(
*
100
(
/
number-of-onetime-searches
number-of-search-engine-hits
)))
(
format
t
"~&There are ~D one-time query patterns out of ~D query patterns. (~4F\%)"
number-of-onetime-patterns
number-of-patterns
(
*
100
(
/
number-of-onetime-patterns
number-of-patterns
))))
;; e - one-word searches and patterns
(
multiple-value-bind
(
number-of-oneword-patterns
number-of-oneword-searches
)
(
hash-table-key-val-stats
pattern-histogram
#'
(
lambda
(
x
)
(
declare
(
type
simple-base-string
x
))
(
not
(
position
#\space
x
))))
(
format
t
"~&There are ~D one-word searches out of ~D search engine hits. (~4F\%)"
number-of-oneword-searches
number-of-search-engine-hits
(
*
100
(
/
number-of-oneword-searches
number-of-search-engine-hits
)))
(
format
t
"~&There are ~D one-word query patterns out of ~D query patterns. (~4F\%)"
number-of-oneword-patterns
number-of-patterns
(
*
100
(
/
number-of-oneword-patterns
number-of-patterns
))))
;; f - wildcard queries and patterns
(
multiple-value-bind
(
number-of-wildcard-patterns
number-of-wildcard-searches
)
(
hash-table-key-val-stats
pattern-histogram
#'
(
lambda
(
x
)
(
declare
(
type
simple-base-string
x
))
(
position
#\*
x
)))
(
format
t
"~&There are ~D wildcard searches out of ~D search engine hits. (~4F\%)"
number-of-wildcard-searches
number-of-search-engine-hits
(
*
100
(
/
number-of-wildcard-searches
number-of-search-engine-hits
)))
(
format
t
"~&There are ~D wildcard query patterns out of ~D query patterns. (~4F\%)"
number-of-wildcard-patterns
number-of-patterns
(
*
100
(
/
number-of-wildcard-patterns
number-of-patterns
))))
;; g - punctuation-like queries and patterns
(
multiple-value-bind
(
number-of-punctuation-patterns
number-of-punctuation-searches
)
(
hash-table-key-val-stats
pattern-histogram
#'
(
lambda
(
x
)
(
declare
(
type
simple-base-string
x
))
(
or
(
position
#\+
x
)
(
position
#\-
x
)
(
position
#\/
x
))))
(
format
t
"~&There are ~D punctuation-like searches out of ~D search engine hits. (~4F\%)"
number-of-punctuation-searches
number-of-search-engine-hits
(
*
100
(
/
number-of-punctuation-searches
number-of-search-engine-hits
)))
(
format
t
"~&There are ~D punctuation-like query patterns out of ~D query patterns. (~4F\%)"
number-of-punctuation-patterns
number-of-patterns
(
*
100
(
/
number-of-punctuation-patterns
number-of-patterns
))))
;; h - anyfield queries and patterns
(
multiple-value-bind
(
number-of-anyfield-patterns
number-of-anyfield-searches
)
(
hash-table-key-val-stats
pattern-histogram
#'
(
lambda
(
x
)
(
declare
(
type
simple-base-string
x
))
(
not
(
position
#\:
x
))))
(
declare
(
ignore
number-of-anyfield-searches
))
(
format
t
"~&There are ~D any-field query patterns out of ~D query patterns. (~4F\%)"
number-of-anyfield-patterns
number-of-patterns
(
*
100
(
/
number-of-anyfield-patterns
number-of-patterns
))))
;; print detailed histogram:
(
format
t
"~&~80,1,0,'.<~A ~; ~A~>"
"User query"
"no. of occurrences"
)
(
let
((
patterns-to-print
(
hash-table-keys
pattern-histogram
t
)))
(
if
(
>
(
length
patterns-to-print
)
*nb-histogram-items-to-print*
)
(
setq
patterns-to-print
(
subseq
patterns-to-print
0
*nb-histogram-items-to-print*
)))
(
dolist
(
pattern
patterns-to-print
)
(
format
t
"~&~80,1,0,'.<~A ~; ~A~>"
pattern
(
gethash
pattern
pattern-histogram
))))))))
(
defun
print-basket-stats
(
httpd-all-log-entries
)
"Read HTTPD-LOG-ENTRIES and print stats related to user baskets."
(
declare
(
special
*basket-url*
*add-to-basket-url*
*display-basket-url*
*display-public-basket-url*
))
(
let
((
datetime-first
(
httpd-log-entry-datetime
(
car
httpd-all-log-entries
)))
(
datetime-last
(
httpd-log-entry-datetime
(
car
(
last
httpd-all-log-entries
))))
(
httpd-log-entries
(
filter-httpd-log-entries
httpd-all-log-entries
*basket-url*
))
(
basket-user-histogram
(
make-hash-table
:test
#'
equalp
)))
;; build basket users histogram:
(
dolist
(
httpd-log-entry-item
httpd-log-entries
)
(
incf
(
gethash
(
httpd-log-entry-ip
httpd-log-entry-item
)
basket-user-histogram
0
)))
(
format
t
"~&~%** USER BASKETS STATS"
)
;; print basket usage summary info:
(
format
t
"~&There were ~D user basket page hits."
(
length
httpd-log-entries
))
(
format
t
"~&This makes an average of ~F user basket page hits per day."
(
/
(
length
httpd-log-entries
)
(
1+
(
-
datetime-last
datetime-first
))))
(
format
t
"~&There were ~D unique basket page users in ~D days."
(
length
(
hash-table-keys
basket-user-histogram
))
(
1+
(
-
datetime-last
datetime-first
)))
;; add to basket:
(
format
t
"~&There were ~D additions to baskets."
(
length
(
filter-httpd-log-entries
httpd-log-entries
*add-to-basket-url*
)))
;; display baskets:
(
format
t
"~&There were ~D displays of baskets, out of which ~D public baskets accesses."
(
length
(
filter-httpd-log-entries
httpd-log-entries
*display-basket-url*
))
(
length
(
filter-httpd-log-entries
httpd-log-entries
*display-public-basket-url*
)))))
(
defun
print-alert-stats
(
httpd-all-log-entries
)
"Read HTTPD-LOG-ENTRIES and print stats related to user alerts."
(
declare
(
special
*alert-url*
*display-your-alerts-url*
*display-your-searches-url*
))
(
let
((
datetime-first
(
httpd-log-entry-datetime
(
car
httpd-all-log-entries
)))
(
datetime-last
(
httpd-log-entry-datetime
(
car
(
last
httpd-all-log-entries
))))
(
httpd-log-entries
(
filter-httpd-log-entries
httpd-all-log-entries
*alert-url*
))
(
alert-user-histogram
(
make-hash-table
:test
#'
equalp
)))
;; build alert users histogram:
(
dolist
(
httpd-log-entry-item
httpd-log-entries
)
(
incf
(
gethash
(
httpd-log-entry-ip
httpd-log-entry-item
)
alert-user-histogram
0
)))
(
format
t
"~&~%** USER ALERTS STATS"
)
;; print alert usage summary info:
(
format
t
"~&There were ~D user alert page hits."
(
length
httpd-log-entries
))
(
format
t
"~&This makes an average of ~F user alert page hits per day."
(
/
(
length
httpd-log-entries
)
(
1+
(
-
datetime-last
datetime-first
))))
(
format
t
"~&There were ~D unique alert page users in ~D days."
(
length
(
hash-table-keys
alert-user-histogram
))
(
1+
(
-
datetime-last
datetime-first
)))
;; display alert:
(
format
t
"~&There were ~D displays of user alerts."
(
length
(
filter-httpd-log-entries
httpd-log-entries
*display-your-alerts-url*
)))
;; display alerts:
(
format
t
"~&There were ~D displays of user searches history."
(
length
(
filter-httpd-log-entries
httpd-log-entries
*display-your-searches-url*
)))))
;; Special-purpose entry points:
(
defun
print-download-stats
(
httpd-log-filename
file-pattern
stats-frequency
)
"Read httpd log file HTTPD-LOG-FILENAME, looking for FILE-PATTERN.
Print download stats for intervals of STATS-FREQUENCY, which may be
day, week, or month."
(
declare
(
ignore
httpd-log-filename
file-pattern
stats-frequency
))
(
format
t
"~&FIXME: Not implemented yet."
)
t
)
(
defun
extract-page-views-events
(
httpd-log-filename
&optional
(
exclude-ip-list
nil
))
"Walk through HTTPD-LOG-FILENAME and extract detailed record page
views events. Useful to input old statistics of page views into
CDS Invenio's rnkPAGEVIEWS table. If EXCLUDE-IP-LIST is set, then do not
count events coming from those IPs."
(
declare
(
type
simple-base-string
httpd-log-filename
)
(
type
list
exclude-ip-list
))
(
flet
((
extract-recid-from-detailed-record-page-url
(
url
)
(
declare
(
type
simple-base-string
url
))
(
if
(
search
"GET /record/"
url
)
(
or
(
parse-integer
(
subseq
url
(
length
"GET /record/"
))
:junk-allowed
t
)
0
)
(
if
(
search
"GET /search.py?recid="
url
)
; compatibility with old style URLs
(
or
(
parse-integer
(
subseq
url
(
length
"GET /search.py?recid="
))
:junk-allowed
t
)
0
)
0
))))
(
format
t
"-- APACHE LOG FILE ANALYSIS"
)
(
format
t
"~&-- Filename: ~A"
httpd-log-filename
)
(
unless
(
null
exclude-ip-list
)
(
format
t
"~&-- Excluding search engine hits from ~A."
exclude-ip-list
))
(
with-open-file
(
input-stream
httpd-log-filename
:direction
:input
)
(
do
((
line
(
read-line
input-stream
nil
)
(
read-line
input-stream
nil
)))
((
not
line
))
(
declare
(
type
(
or
simple-base-string
null
)
line
))
(
let
((
httpd-log-entry-item
(
create-httpd-log-entry
(
wash-httpd-log-line
line
))))
(
if
(
and
(
or
(
null
exclude-ip-list
)
(
not
(
member
(
httpd-log-entry-ip
httpd-log-entry-item
)
exclude-ip-list
:test
#'
equal
)))
(
plusp
(
extract-recid-from-detailed-record-page-url
(
httpd-log-entry-request
httpd-log-entry-item
))))
(
format
t
"~&INSERT INTO rnkPAGEVIEWS (id_bibrec, client_host, view_time) VALUES ('~A', INET_ATON('~A'), '~A');"
(
extract-recid-from-detailed-record-page-url
(
httpd-log-entry-request
httpd-log-entry-item
))
(
httpd-log-entry-ip
httpd-log-entry-item
)
(
httpd-log-entry-datetimee
httpd-log-entry-item
)))))))
(
format
t
"~&-- DONE"
))
;; Main entry point:
(
defun
analyze-httpd-log-file
(
analyzer-config-file
httpd-log-file
)
"Main function that analyzes HTTPD-LOG-FILE according to
instructions presented in ANALYZER-CONFIG-FILE."
(
declare
(
special
*profile*
*nb-histogram-items-to-print*
*exclude-ip-list*
*home-collection*
*search-interface-url*
*search-engine-url*
*search-engine-url-old-style*
*detailed-record-url*
*basket-url*
*add-to-basket-url*
*display-basket-url*
*display-public-basket-url*
*alert-url*
*display-your-alerts-url*
*display-your-searches-url*
))
(
load
analyzer-config-file
:verbose
nil
:print
nil
)
(
when
*profile*
#+
cmu
(
profile:profile-all
)
#+
fixmeclisp
(
mon:monitor-all
)
#+
sbcl
(
progn
(
sb-profile:profile
PRINT-SEARCH-PATTERN-STATS
)
(
sb-profile:profile
CREATE-HTTPD-LOG-ENTRIES
)
(
sb-profile:profile
GET-SEARCH-PATTERNS-FROM-HTTPD-LOG-REQUEST-STRING
)
(
sb-profile:profile
PARSE-HTTPD-LOG-LINE
)
(
sb-profile:profile
STRING-DECODE-URL
)
(
sb-profile:profile
CREATE-HTTPD-LOG-ENTRY
)
(
sb-profile:profile
GET-DATETIME-FROM-HTTPD-LOG-DATETIME-STRING
)
(
sb-profile:profile
WASH-HTTPD-LOG-LINE
)
(
sb-profile:profile
GET-MONTH-NUMBER-FROM-MONTH-NAME
)
(
sb-profile:profile
MAKE-HTTPD-LOG-ENTRY
)
(
sb-profile:profile
PARSE-INTEGER-OR-ZERO
)
(
sb-profile:profile
GET-URLARG-VALUE-FROM-URL-STRING
)
(
sb-profile:profile
HASH-TABLE-KEYS
)
(
sb-profile:profile
HASH-TABLE-KEY-VAL-STATS
)
(
sb-profile:profile
STRING-SUBSTITUTE
)))
(
let
((
httpd-all-log-entries
(
create-httpd-log-entries
httpd-log-file
*exclude-ip-list*
)))
(
print-general-stats
httpd-all-log-entries
)
(
print-search-interface-stats
httpd-all-log-entries
)
(
print-search-collections-stats
httpd-all-log-entries
)
(
print-search-pattern-stats
httpd-all-log-entries
)
(
print-basket-stats
httpd-all-log-entries
)
(
print-alert-stats
httpd-all-log-entries
))
(
format
t
"~&"
)
(
when
*profile*
#+
cmu
(
profile:report-time
)
#+
fixmeclisp
(
mon:report
)
#+
sbcl
(
sb-profile:report
)))
Event Timeline
Log In to Comment