Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F91767512
search_engine_summarizer.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Thu, Nov 14, 06:36
Size
6 KB
Mime Type
text/x-python
Expires
Sat, Nov 16, 06:36 (2 d)
Engine
blob
Format
Raw Data
Handle
22321444
Attached To
R3600 invenio-infoscience
search_engine_summarizer.py
View Options
# -*- coding: utf-8 -*-
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
Search Engine Summarizer, producing summary formats such as citesummary.
The main API is summarize_records().
"""
# Module metadata: keyword placeholders (presumably expanded by the
# version control system on checkout -- confirm against repository setup).
__revision__ = '$Id$'
__lastupdated__ = '$Date$'
from
invenio.config
import
CFG_INSPIRE_SITE
from
invenio.bibrank_citation_searcher
import
get_cited_by_list
import
search_engine
import
invenio.template
# Template object whose tmpl_citesummary_* methods render the pieces of
# the HTML cite summary.
websearch_templates = invenio.template.load("websearch")
## CFG_CITESUMMARY_COLLECTIONS -- list of [label, search query] pairs
## saying how cite summary results are broken down by collection.  An
## empty query means the record set is used as is, without intersecting
## it with a collection search.
if not CFG_INSPIRE_SITE:
    CFG_CITESUMMARY_COLLECTIONS = [
        ['All papers', ''],
        ['Published only', 'collection:article'],
    ]
else:
    CFG_CITESUMMARY_COLLECTIONS = [
        ['All papers', 'collection:citeable'],
        ['Published only', 'collection:citeable collection:published'],
    ]
## CFG_CITESUMMARY_FAME_THRESHOLDS -- list of (low, high, label) tuples
## splitting cite summary results into fame groups: a paper belongs to
## a group when its citation count lies within [low, high], both ends
## included.
CFG_CITESUMMARY_FAME_THRESHOLDS = [
    (500, 1000000, 'Renowned papers (500+)'),
    (250, 499, 'Famous papers (250-499)'),
    (100, 249, 'Very well-known papers (100-249)'),
    (50, 99, 'Well-known papers (50-99)'),
    (10, 49, 'Known papers (10-49)'),
    (1, 9, 'Less known papers (1-9)'),
    (0, 0, 'Unknown papers (0)'),
]
def summarize_records(recids, of, ln, searchpattern="", searchfield="", req=None):
    """Write summary report for records RECIDS in the format OF in language LN.

    SEARCHPATTERN and SEARCHFIELD are the search query that led to RECIDS,
    for instance p='Smith, Paul' and f='author'.  They are used for links.
    REQ is the Apache/mod_python request object.

    Supported values of OF:

      'hcs' -- HTML cite summary.  The report is written to REQ piece by
               piece (prologue, overview, one breakdown per fame
               threshold, epilogue) and '' is returned.  When REQ is
               None the same pieces are collected and returned as a
               single string instead.
      'xcs' -- XML cite summary.  The XML string is returned; REQ is
               not used.

    For any other value of OF nothing is written and None is returned.
    """
    # Kept as a function-level import, as in the original code.
    # NOTE(review): presumably this avoids a circular import at module
    # load time -- confirm before moving it.
    import search_engine

    if of == 'xcs':
        # this is XML cite summary
        return print_citation_summary_xml(get_cited_by_list(recids))

    if of != 'hcs':
        return None

    # this is HTML cite summary
    # Without a request object there is nowhere to stream to, so buffer
    # the pieces and hand them back to the caller at the end.
    chunks = []
    if req is None:
        write = chunks.append
    else:
        write = req.write

    # 1) hcs prologue: restrict RECIDS to each configured collection
    d_recids = {}
    d_total_recs = {}
    for coll, colldef in CFG_CITESUMMARY_COLLECTIONS:
        if colldef:
            d_recids[coll] = recids & search_engine.search_pattern(p=colldef)
        else:
            # empty collection definition: use the record set as is
            d_recids[coll] = recids
        d_total_recs[coll] = len(d_recids[coll])
    write(websearch_templates.tmpl_citesummary_prologue(
        d_total_recs, CFG_CITESUMMARY_COLLECTIONS,
        searchpattern, searchfield, ln))

    # 2) hcs overview: per-record citation counts are computed once per
    # collection here and reused by the fame breakdown below
    d_numcites = {}
    d_total_cites = {}
    d_avg_cites = {}
    for coll, colldef in CFG_CITESUMMARY_COLLECTIONS:
        numcites = []
        for recid, lciters in get_cited_by_list(d_recids[coll]):
            if lciters:
                numcites.append(len(lciters))
            else:
                numcites.append(0)
        d_numcites[coll] = numcites
        d_total_cites[coll] = sum(numcites)
        d_avg_cites[coll] = 0
        # a non-zero total implies at least one record, so the division
        # below cannot hit an empty collection
        if d_total_cites[coll] != 0:
            d_avg_cites[coll] = d_total_cites[coll] * 1.0 / d_total_recs[coll]
    write(websearch_templates.tmpl_citesummary_overview(
        d_total_cites, d_avg_cites, CFG_CITESUMMARY_COLLECTIONS, ln))

    # 3) hcs break down by fame: count the records of every collection
    # whose number of citations lies within [low, high]
    for low, high, fame in CFG_CITESUMMARY_FAME_THRESHOLDS:
        d_cites = {}
        for coll, colldef in CFG_CITESUMMARY_COLLECTIONS:
            d_cites[coll] = len([n for n in d_numcites[coll]
                                 if low <= n <= high])
        write(websearch_templates.tmpl_citesummary_breakdown_by_fame(
            d_cites, low, high, fame, CFG_CITESUMMARY_COLLECTIONS,
            searchpattern, searchfield, ln))

    # 4) hcs epilogue:
    write(websearch_templates.tmpl_citesummary_epilogue(ln))

    # chunks is empty when everything was written to REQ, so this is ''
    return ''.join(chunks)
#for citation summary, code xcs/hcs (unless changed)
def print_citation_summary_xml(citedbylist):
    """Return the citation summary of CITEDBYLIST as an XML string.

    CITEDBYLIST is passed to calculate_citations(); its length is
    reported in the 'records' attribute of the <citationsummary> root
    element and the total number of citations in the 'citations'
    attribute.  One <citationclass> element is emitted, in the order of
    CFG_CITESUMMARY_FAME_THRESHOLDS, for every threshold name that has
    at least one record.

    The average number of citations computed by calculate_citations()
    is not part of the output.
    """
    alldict = calculate_citations(citedbylist)
    reciddict = alldict['reciddict']

    #output formatting
    parts = ['<citationsummary records="%s" citations="%s">'
             % (len(citedbylist), alldict['totalcites'])]
    for _low, _high, name in CFG_CITESUMMARY_FAME_THRESHOLDS:
        #get the name, print the value
        if name in reciddict:
            # NOTE(review): str() of the stored value is emitted verbatim;
            # calculate_citations() stores a list of record IDs there, so
            # <records> holds that list's repr rather than a count --
            # confirm this is what consumers expect.
            parts.append("<citationclass>" + name
                         + "<records>" + str(reciddict[name]) + "</records>"
                         + "</citationclass>\n")
    parts.append("</citationsummary>")
    return "".join(parts)
def calculate_citations(citedbylist, thresholds=None):
    """Sort records into citation classes and compute citation totals.

    CITEDBYLIST is a sized sequence of (recid, cites) pairs, where cites
    is the collection of citers of recid; a false value (None, empty
    collection) counts as zero citations.

    THRESHOLDS is an optional sequence of (low, high, name) triples; a
    record is filed under NAME when its number of citations lies within
    [low, high], both ends included.  A record is filed under every
    matching name.  Defaults to CFG_CITESUMMARY_FAME_THRESHOLDS.

    Returns a dictionary with the keys:

      'records'    -- number of entries in CITEDBYLIST
      'totalcites' -- sum of the citation counts of all entries
      'avgcites'   -- totalcites / records as a float, or 0 when
                      CITEDBYLIST is empty
      'reciddict'  -- dictionary mapping threshold name to the list of
                      recids filed under it; names without any record
                      are absent
    """
    if thresholds is None:
        thresholds = CFG_CITESUMMARY_FAME_THRESHOLDS

    totalcites = 0
    reciddict = {}
    for recid, cites in citedbylist:
        numcites = 0
        if cites:
            numcites = len(cites)
        totalcites += numcites
        for low, high, name in thresholds:
            if low <= numcites <= high:
                reciddict.setdefault(name, []).append(recid)

    numrecords = len(citedbylist)
    if numrecords:
        # multiply by 1.0 to force float division on Python 2
        avgcites = totalcites * 1.0 / numrecords
    else:
        avgcites = 0

    return {
        'records': numrecords,
        'totalcites': totalcites,
        'avgcites': avgcites,
        'reciddict': reciddict,
    }
Event Timeline
Log In to Comment