Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F61542802
bibrank_citation_searcher.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Tue, May 7, 08:13
Size
9 KB
Mime Type
text/x-python
Expires
Thu, May 9, 08:13 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
17483675
Attached To
R3600 invenio-infoscience
bibrank_citation_searcher.py
View Options
# -*- coding: utf-8 -*-
##
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
# Revision identifier of this module; "$Id$" is presumably a keyword
# placeholder meant to be expanded by the version control system.
__revision__ = "$Id$"
import
re
import
marshal
from
zlib
import
decompress
,
error
from
invenio.dbquery
import
run_sql
,
get_table_update_time
,
OperationalError
from
invenio.intbitset
import
intbitset
from
invenio.data_cacher
import
DataCacher
class CitationDictsDataCacher(DataCacher):
    """
    Cache holding all citation dictionaries (citationdict,
    reversedict, selfcitdict, selfcitedbydict).

    The cached value is a dictionary mapping every object_name found in
    the rnkCITATIONDATA table to its unmarshalled object_value.  When a
    'citationdict' row is present, two derived entries are stored too:
    'citationdict_keys' and 'citationdict_keys_intbitset'.
    """
    def __init__(self):
        def cache_filler():
            """Read rnkCITATIONDATA and return the dictionary to cache."""
            alldicts = {}
            try:
                res = run_sql("""SELECT object_name,object_value FROM rnkCITATIONDATA""")
            except OperationalError:
                # database problems, return empty cache
                return {}
            for row in res:
                object_name = row[0]
                object_value = row[1]
                try:
                    object_value_dict = marshal.loads(decompress(object_value))
                except Exception:
                    # unreadable value (bad compression or marshal data):
                    # cache an empty dictionary for this name instead of
                    # failing.  KeyboardInterrupt and SystemExit are
                    # deliberately not swallowed here.
                    object_value_dict = {}
                alldicts[object_name] = object_value_dict
                if object_name == 'citationdict':
                    # for cited:M->N queries, it is interesting to cache also
                    # some preprocessed citationdict:
                    citationdict_keys = object_value_dict.keys()
                    alldicts['citationdict_keys'] = citationdict_keys
                    alldicts['citationdict_keys_intbitset'] = intbitset(citationdict_keys)
            return alldicts

        def timestamp_getter():
            """Return the last update time of the rnkCITATIONDATA table."""
            return get_table_update_time('rnkCITATIONDATA')

        DataCacher.__init__(self, cache_filler, timestamp_getter)
# Build the module-level cache only when it is not usable yet: evaluating
# the attribute below raises (e.g. NameError while the name is still
# unbound), and only then is a new cacher instantiated.
# NOTE(review): presumably this keeps an already existing cache object
# alive when the module body is executed again -- confirm.
try:
    cache_citation_dicts.is_ok_p
except Exception:
    cache_citation_dicts = CitationDictsDataCacher()
def get_citation_dict(dictname):
    """Look up DICTNAME in the citation cache and return its value.

    DICTNAME can be citationdict, reversedict, selfcitdict or
    selfcitedbydict.  An empty dictionary is returned when the cache
    holds no entry under that name.
    """
    return cache_citation_dicts.get_cache().get(dictname, {})
def get_cited_by(recordid):
    """Return a list of records that cite recordid.

    The list stored in the cache is returned as is (no copy is made);
    an empty list is returned when RECORDID is not a key of the
    citation dictionary.
    """
    # single lookup instead of the deprecated has_key() + indexing pair
    return get_citation_dict("citationdict").get(recordid, [])
def get_cited_by_count(recordid):
    """Return the number of records citing the given RECORDID.

    Records missing from the citation dictionary count as zero.
    """
    citing_records = get_citation_dict("citationdict").get(recordid, [])
    return len(citing_records)
def get_records_with_num_cites(numstr, allrecs=intbitset([])):
    """Return an intbitset of record IDs that are cited X times,
    X defined in numstr.

    Warning: numstr is string and may not be numeric! It can
    be 10,0->100 etc

    Spaces and double quotes are removed from NUMSTR, which is then
    tried against these forms, in this order:
      "N"     records cited exactly N times;
      "M->N"  records cited at least M and at most N times;
      "N+"    records cited more than N times.
    ALLRECS is only used for N == 0 or M == 0: the records of ALLRECS
    that are not keys of the citation dictionary are then taken as the
    records without citations.  An empty intbitset is returned when
    NUMSTR is not a string or matches none of the forms.
    """
    cache_cited_by_dictionary = get_citation_dict("citationdict")
    cache_cited_by_dictionary_keys = get_citation_dict("citationdict_keys")
    cache_cited_by_dictionary_keys_intbitset = get_citation_dict("citationdict_keys_intbitset")
    if not cache_cited_by_dictionary_keys_intbitset:
        # get_citation_dict() hands back a plain {} when the entry is
        # missing from the cache; use an empty intbitset instead so that
        # the differences computed below never mix an intbitset and a dict
        cache_cited_by_dictionary_keys_intbitset = intbitset([])
    matches = intbitset([])

    #once again, check that the parameter is a string
    if not isinstance(numstr, str):
        return intbitset([])
    numstr = numstr.replace(" ", '').replace('"', '')

    def add_cited_between(target, low, high):
        # add to TARGET every key of the citation dictionary whose number
        # of citing records lies in [LOW, HIGH]; HIGH=None: no upper bound
        for recid in cache_cited_by_dictionary_keys:
            count = len(cache_cited_by_dictionary[recid])
            if count >= low and (high is None or count <= high):
                target.add(recid)

    #first, check if numstr is just a number
    singlenum = re.findall(r"(^\d+$)", numstr)
    if singlenum:
        num = int(singlenum[0])
        if num == 0:
            #we return recids that are not in keys
            return allrecs - cache_cited_by_dictionary_keys_intbitset
        add_cited_between(matches, num, num)
        return matches

    #try to get 1->10 or such
    firstsec = re.findall(r"(\d+)->(\d+)", numstr)
    if firstsec:
        try:
            first = int(firstsec[0][0])
            sec = int(firstsec[0][1])
        except ValueError:
            return intbitset([])
        if first == 0:
            #start with those that have no cites..
            matches = allrecs - cache_cited_by_dictionary_keys_intbitset
        if first <= sec:
            add_cited_between(matches, first, sec)
        return matches

    #finally, try "N+": strictly more than N citations
    firstsec = re.findall(r"(\d+)\+", numstr)
    if firstsec:
        # count > N is the same as count >= N + 1 for integer counts
        add_cited_between(matches, int(firstsec[0]) + 1, None)
    return matches
def get_cited_by_list(recordlist):
    """Return a list of [recid, list_of_citing_records] pairs, one for
    each record in recordlist and in the same order.

    Records that are not keys of the citation dictionary are paired
    with a new empty list.
    """
    cache_cited_by_dictionary = get_citation_dict("citationdict")
    # .get() replaces the deprecated has_key() test followed by indexing
    return [[recid, cache_cited_by_dictionary.get(recid, [])]
            for recid in recordlist]
def get_cited_by_weight(recordlist):
    """Return a list of [recid, number_of_citing_records] pairs, one for
    each record in recordlist.

    The citing records are obtained from get_cited_by_list(); a record
    with no citing records gets the weight 0.
    """
    weights = []
    for recid, citing_records in get_cited_by_list(recordlist):
        count = 0
        if citing_records:
            count = len(citing_records)
        weights.append([recid, count])
    return weights
def calculate_cited_by_list(record_id, sort_order="d"):
    """Return a list of [recid, citation_weight] pairs for all the
    records citing RECORD_ID, the weight of a record being the number
    of records that cite it in turn.

    The result is sorted by descending weight when SORT_ORDER is "d"
    and by ascending weight for any other value.  Pairs with equal
    weights keep the order they have in the cached citation list.
    """
    cache_cited_by_dictionary = get_citation_dict("citationdict")
    if not cache_cited_by_dictionary:
        return []
    # add weights i.e. records that cite each of the records citing RECORD_ID
    result = [[citer, len(cache_cited_by_dictionary.get(citer, []))]
              for citer in cache_cited_by_dictionary.get(record_id, [])]
    # a key function replaces the cmp-style comparison; reverse=True keeps
    # the sort stable, so ties are ordered exactly as before
    result.sort(key=lambda pair: pair[1], reverse=(sort_order == "d"))
    return result
def get_author_cited_by(authorstring):
    """Return a list of doc ids [y1,y2,..] for the
    author given as param, such that y1,y2.. cite that author

    The value is read from the hitlist column of the rnkAUTHORDATA row
    whose aterm equals AUTHORSTRING.  An empty list is returned when
    there is no such row, when its hitlist is empty or when the hitlist
    cannot be decoded.
    """
    citations = []
    res = run_sql("select hitlist from rnkAUTHORDATA where aterm=%s",
                  (authorstring,))
    if res and res[0] and res[0][0]:
        #has to be prepared for corrupted data!
        try:
            citations = marshal.loads(decompress(res[0][0]))
        except Exception:
            # undecodable hitlist: behave as if there were no citations,
            # but let KeyboardInterrupt and SystemExit propagate
            citations = []
    return citations
def get_self_cited_by(record_id):
    """Return a list of doc ids [y1,y2,..] for the
    rec id x given as param, so that x cites y1,y2,.. and x and each y share an author

    The list is a new object built from the "selfcitdict" cache entry.
    None (not an empty list) is returned when there is nothing to
    report for RECORD_ID.
    """
    cache_selfcit_dictionary = get_citation_dict("selfcitdict")
    if not cache_selfcit_dictionary:
        return None
    # .get() replaces the deprecated has_key() test followed by indexing
    result = list(cache_selfcit_dictionary.get(record_id, []))
    if not result:
        return None
    return result
def get_self_cited_in(record_id):
    """Return a list of doc ids [y1,y2,..] for the
    rec id x given as param, so that x is cited in y1,y2,.. and x and each y share an author

    The list is a new object built from the "selfcitedbydict" cache
    entry.  None (not an empty list) is returned when there is nothing
    to report for RECORD_ID.
    """
    cache_selfcitedby_dictionary = get_citation_dict("selfcitedbydict")
    if not cache_selfcitedby_dictionary:
        return None
    # .get() replaces the deprecated has_key() test followed by indexing
    result = list(cache_selfcitedby_dictionary.get(record_id, []))
    if not result:
        return None
    return result
def calculate_co_cited_with_list(record_id, sort_order="d"):
    """Return a list of [recid, co-cited weight] pairs for records
    that are co-cited with RECORD_ID.

    For every record citing RECORD_ID (taken from "citationdict"), the
    entries listed for that citing record in "reversedict" are counted;
    the weight of a recid is the number of times it was counted.
    RECORD_ID itself is left out of the result.

    The result is sorted by descending weight when SORT_ORDER is "d"
    and by ascending weight for any other value.
    """
    cache_cited_by_dictionary = get_citation_dict("citationdict")
    cache_reference_list_dictionary = get_citation_dict("reversedict")
    if not cache_cited_by_dictionary or not cache_reference_list_dictionary:
        # nothing can be counted when either dictionary is empty
        return []

    co_cited_counts = {}
    for cit_id in cache_cited_by_dictionary.get(record_id, []):
        for ref_id in cache_reference_list_dictionary.get(cit_id, []):
            co_cited_counts[ref_id] = co_cited_counts.get(ref_id, 0) + 1

    result = [[recid, weight]
              for recid, weight in co_cited_counts.items()
              if recid != record_id]
    # a key function replaces the cmp-style comparison; reverse=True keeps
    # the sort stable, so ties are ordered exactly as before
    result.sort(key=lambda pair: pair[1], reverse=(sort_order == "d"))
    return result
Event Timeline
Log In to Comment