Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F121664744
bibrank_selfcites_task.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sat, Jul 12, 23:07
Size
11 KB
Mime Type
text/x-python
Expires
Mon, Jul 14, 23:07 (2 d)
Engine
blob
Format
Raw Data
Handle
27362888
Attached To
R3600 invenio-infoscience
bibrank_selfcites_task.py
View Options
# -*- coding: utf-8 -*-
#
# This file is part of Invenio.
# Copyright (C) 2012 CERN.
#
# Invenio is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Invenio is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Invenio; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
Self citations task
Stores self-citations in a table for quick access
Examples:
(run a daemon job)
bibrank -w selfcites
(run on a set of records)
selfcites -i 1-20
(run on a collection)
selfcites -c "Reports"
This task handles the self-citations computation
It is run on modified records so that it can update the tables used for
displaying info in the citesummary format
"""
import
sys
import
ConfigParser
import
time
from
datetime
import
datetime
from
invenio.redisutils
import
get_redis
from
invenio.dbquery
import
serialize_via_marshal
from
invenio.intbitset
import
intbitset
from
invenio.config
import
CFG_BIBRANK_SELFCITES_USE_BIBAUTHORID
,
\
CFG_ETCDIR
from
invenio.bibtask
import
task_get_option
,
write_message
,
\
task_sleep_now_if_required
,
\
task_update_progress
from
invenio.dbquery
import
run_sql
from
invenio.bibrank_selfcites_indexer
import
update_self_cites_tables
,
\
compute_friends_self_citations
,
\
compute_simple_self_citations
,
\
get_authors_tags
from
invenio.bibrank_citation_searcher
import
get_refers_to
from
invenio.bibauthorid_daemon
import
get_user_logs
as
bibauthorid_user_log
from
invenio.bibrank_citation_indexer
import
get_bibrankmethod_lastupdate
from
invenio.bibrank_tag_based_indexer
import
intoDB
,
fromDB
from
invenio.intbitset
import
intbitset
def compute_and_store_self_citations(recid, tags, citations_fun, selfcites_dic,
                                     verbose=False):
    """Compute the self-cites of a record and store them in the database

    Nothing is recomputed when `rnkSELFCITES` already holds a row for the
    record with a non-zero count and a `last_updated` date that is not older
    than the most recent modification date found among the record and its
    references.

    Args:
     - recid: id of the record to process, must be a true value
     - tags: used when bibauthorid is deactivated, see get_authors_tags()
             in bibrank_selfcites_indexer
     - citations_fun: called as citations_fun(recid, tags); its result is
                      handed to replace_cites() and its length is stored as
                      the self-cites count
     - selfcites_dic: dictionary updated in place with
                      {recid: self-cites count} when a computation happens
     - verbose: when true, progress messages are emitted via write_message
    """
    assert recid

    if verbose:
        write_message("* processing %s" % recid)

    references = get_refers_to(recid)
    recids_to_check = set([recid]) | set(references)
    placeholders = ','.join('%s' for dummy in recids_to_check)
    rec_row = run_sql("SELECT MAX(`modification_date`) FROM `bibrec`"
                      " WHERE `id` IN (%s)" % placeholders, recids_to_check)

    # MAX() is an aggregate: the query yields one row holding NULL when no
    # record matches, so testing for an empty result is not enough to detect
    # an unknown record.
    try:
        rec_timestamp = rec_row[0][0]
    except IndexError:
        rec_timestamp = None
    if rec_timestamp is None:
        write_message("record not found")
        return

    cached_citations_row = run_sql("SELECT `count` FROM `rnkSELFCITES`"
                                   " WHERE `last_updated` >= %s"
                                   " AND `id_bibrec` = %s",
                                   (rec_timestamp, recid))
    if cached_citations_row and cached_citations_row[0][0]:
        # An up-to-date, non-zero count is already stored: keep it
        if verbose:
            write_message("%s found (cached)" % cached_citations_row[0][0])
    else:
        cites = citations_fun(recid, tags)
        selfcites_dic[recid] = len(cites)
        replace_cites(recid, cites)
        sql = """REPLACE INTO rnkSELFCITES (`id_bibrec`, `count`, `references`,
                 `last_updated`) VALUES (%s, %s, %s, NOW())"""
        references_string = ','.join(str(r) for r in references)
        run_sql(sql, (recid, len(cites), references_string))
        if verbose:
            write_message("%s found" % len(cites))
def replace_cites(recid, new_cites):
    """Synchronise the stored self-cites of a record with a new set

    The citers currently stored in rnkSELFCITEDICT for the citee `recid`
    are compared with `new_cites`:
      * citers present only in `new_cites` are inserted
      * citers present only in the table are deleted

    Args:
     - recid: id of the cited record (citee)
     - new_cites: set of citer ids that must end up being stored
    """
    stored_rows = run_sql("""SELECT citer
                             FROM rnkSELFCITEDICT
                             WHERE citee = %s""", [recid])
    old_cites = set(row[0] for row in stored_rows)

    missing = new_cites - old_cites
    obsolete = old_cites - new_cites

    for citer in missing:
        write_message('adding cite %s %s' % (recid, citer), verbose=1)
        # time.strftime() formats the current local time when no time
        # tuple is given
        timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
        run_sql("""INSERT INTO rnkSELFCITEDICT (citee, citer, last_updated)
                   VALUES (%s, %s, %s)""", (recid, citer, timestamp))

    for citer in obsolete:
        write_message('deleting cite %s %s' % (recid, citer), verbose=1)
        run_sql("""DELETE FROM rnkSELFCITEDICT
                   WHERE citee = %s and citer = %s""", (recid, citer))
def rebuild_tables(rank_method_code, config):
    """Wipe the self-cites tables and compute their content again

    Called by bibrank -w selfcites -R

    Args:
     - rank_method_code: forwarded to fill_self_cites_tables()
     - config: forwarded to fill_self_cites_tables()

    Always returns True.
    """
    # Each step is announced through task_update_progress() before it runs
    steps = (
        ('emptying tables', empty_self_cites_tables, ()),
        ('filling tables', fill_self_cites_tables, (rank_method_code, config)),
    )
    for label, step, args in steps:
        task_update_progress(label)
        step(*args)
    return True
def fetch_bibauthorid_last_update():
    """Return the date of the last bibauthorid run, formatted as a string

    The date is taken from the third field of the first entry returned by
    the bibauthorid user log for userinfo 'daemon' and action 'PID_UPDATE'
    (only the most recent entry is requested). When no entry exists,
    1900-01-01 is used instead.

    The result is formatted as "%Y-%m-%d %H:%M:%S".
    """
    log_entries = bibauthorid_user_log(userinfo='daemon',
                                       action='PID_UPDATE',
                                       only_most_recent=True)
    try:
        last_run = log_entries[0][2]
    except IndexError:
        # No log entry: fall back to a date older than any real run
        last_run = datetime(1900, 1, 1)

    return last_run.strftime("%Y-%m-%d %H:%M:%S")
def fetch_index_update():
    """Return the date up to which records may be processed

    This is the last update date of the 'citation' rank method. When
    CFG_BIBRANK_SELFCITES_USE_BIBAUTHORID is set, the date of the last
    bibauthorid run is fetched as well and the smaller of the two values
    is returned.
    """
    citation_date = get_bibrankmethod_lastupdate('citation')
    if not CFG_BIBRANK_SELFCITES_USE_BIBAUTHORID:
        return citation_date
    return min(citation_date, fetch_bibauthorid_last_update())
def fetch_records(start_date, end_date):
    """Return the ids of the records modified between two dates

    A record is selected when its `modification_date` is strictly greater
    than `start_date` and lower than or equal to `end_date`.

    We need to run after bibauthorid // bibrank citation indexer

    Returns an intbitset built from the query result.
    """
    query = """SELECT `id` FROM `bibrec`
               WHERE `modification_date` <= %s
               AND `modification_date` > %s"""
    return intbitset(run_sql(query, (end_date, start_date)))
def fetch_concerned_records(name, ids_param):
    """Work out which records this run has to process

    Args:
     - name: rank method name, used to look up the date of its last run
     - ids_param: sequence of (first, last) inclusive id ranges; when it is
                  empty or None the records are selected by modification
                  date instead

    Returns a (recids, end_date) tuple:
     - with explicit ranges, recids holds every id of every range and
       end_date is None
     - otherwise recids holds the records returned by fetch_records() for
       the period between the last run of `name` and the date returned by
       fetch_index_update(), and end_date is that date
    """
    if not ids_param:
        last_run = get_bibrankmethod_lastupdate(name)
        end_date = fetch_index_update()
        return fetch_records(last_run, end_date), end_date

    recids = intbitset()
    for first, last in ids_param:
        recids += range(first, last + 1)
    return recids, None
def store_last_updated(name, date):
    """Set the `last_updated` column of the rank method `name` to `date`"""
    query = "UPDATE rnkMETHOD SET last_updated=%s WHERE name=%s"
    run_sql(query, (date, name))
def read_configuration(rank_method_code):
    """Load the configuration file of a rank method from disk and parse it

    The file is read from CFG_ETCDIR + "/bibrank/" + rank_method_code + ".cfg".

    Args:
     - rank_method_code: name of the rank method, used as the file name

    Returns the ConfigParser instance holding the parsed file.

    Any StandardError raised while opening or reading the file is reported
    on stderr through write_message and raised again.
    """
    filename = CFG_ETCDIR + "/bibrank/" + rank_method_code + ".cfg"
    config = ConfigParser.ConfigParser()
    try:
        config_file = open(filename)
        try:
            config.readfp(config_file)
        finally:
            # Release the file handle whether or not parsing succeeded
            config_file.close()
    except StandardError:
        write_message("Cannot find configuration file: %s" % filename,
                      sys.stderr)
        raise
    return config
def process_updates(rank_method_code):
    """
    This is what gets executed first when the task is started.

    The configuration of the rank method is loaded, then:
     - when the "quick" task option equals "no" (presumably set by the
       --rebuild command line option), the tables are rebuilt from scratch
       and the result of rebuild_tables() is returned
     - otherwise every concerned record goes through process_one(), the
       resulting weights are stored with intoDB() and in the weights cache,
       and True is returned
    """
    write_message("Running rank method: %s" % rank_method_code, verbose=0)

    parser = read_configuration(rank_method_code)
    config = {}
    for option in ('algorithm', 'friends_threshold'):
        config[option] = parser.get(rank_method_code, option)

    if task_get_option("quick") == "no":
        return rebuild_tables(rank_method_code, config)

    tags = get_authors_tags()
    recids, end_date = fetch_concerned_records(rank_method_code,
                                               task_get_option("id"))
    citations_fun = get_citations_fun(config['algorithm'])
    # Start from the weights already stored for this method; process_one()
    # updates the dictionary in place
    weights = fromDB(rank_method_code)

    write_message("recids %s" % str(recids))

    total = len(recids)
    for index, recid in enumerate(recids):
        task_sleep_now_if_required(can_stop_too=True)
        progress = "Extracting for %s (%d/%d)" % (recid, index + 1, total)
        task_update_progress(progress)
        write_message(progress)
        process_one(recid, tags, citations_fun, weights)

    intoDB(weights, end_date, rank_method_code)
    store_weights_cache(weights)

    write_message("Complete")
    return True
def get_citations_fun(algorithm):
    """Map an algorithm name to the function computing the self-cites

    'friends' selects compute_friends_self_citations; any other value
    selects compute_simple_self_citations.
    """
    if algorithm == 'friends':
        return compute_friends_self_citations
    return compute_simple_self_citations
def process_one(recid, tags, citations_fun, selfcites_dic):
    """Self-cites core func, executed on each recid

    The record itself is handled first, then every record returned by
    get_refers_to(recid). All arguments other than recid are passed
    unchanged to compute_and_store_self_citations().
    """
    compute_and_store_self_citations(recid, tags, citations_fun,
                                     selfcites_dic)
    for referenced_recid in get_refers_to(recid):
        compute_and_store_self_citations(referenced_recid, tags,
                                         citations_fun, selfcites_dic)
def empty_self_cites_tables():
    """
    Truncate the rnkSELFCITES, rnkEXTENDEDAUTHORS and rnkRECORDSCACHE tables

    The purpose is to rebuild the tables from scratch in case there is a
    problem with them: inconsistencies, corruption,...
    """
    statements = (
        'TRUNCATE rnkSELFCITES',
        'TRUNCATE rnkEXTENDEDAUTHORS',
        'TRUNCATE rnkRECORDSCACHE',
    )
    for statement in statements:
        run_sql(statement)
def fill_self_cites_tables(rank_method_code, config):
    """
    Fill the self-cites tables with data for every record of bibrec

    The purpose of this function is to fill these tables on a website that
    never ran the self-cites daemon.

    This is an optimization when running on empty tables, and we hope the
    result is the same as the compute_and_store_self_citations.

    Args:
     - rank_method_code: name under which the result is stored by intoDB()
     - config: dictionary; its 'algorithm' entry selects the computation
               function and the whole dictionary is handed to
               update_self_cites_tables()
    """
    # Taken before any work is done; stored as the date of the result
    begin_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    algorithm = config['algorithm']
    tags = get_authors_tags()
    selfcites_dic = {}
    all_ids = intbitset(run_sql('SELECT id FROM bibrec ORDER BY id'))
    citations_fun = get_citations_fun(algorithm)
    write_message('using %s' % citations_fun.__name__)

    total = len(all_ids)

    def checkpoint(template, position):
        """Report the progress, then let the task sleep if required"""
        msg = template % (position, total)
        task_update_progress(msg)
        write_message(msg)
        task_sleep_now_if_required()

    if algorithm == 'friends':
        # We only need this table for the friends algorithm or assimilated
        # Fill intermediary tables
        for index, recid in enumerate(all_ids):
            if index % 1000 == 0:
                checkpoint('intermediate %d/%d', index)
            update_self_cites_tables(recid, config, tags)

    # Fill self-cites table
    for index, recid in enumerate(all_ids):
        if index % 1000 == 0:
            checkpoint('final %d/%d', index)
        compute_and_store_self_citations(recid, tags, citations_fun,
                                         selfcites_dic)

    intoDB(selfcites_dic, begin_date, rank_method_code)
    store_weights_cache(selfcites_dic)
def store_weights_cache(weights):
    """Store the weights into the key/value store

    The weights are serialized with serialize_via_marshal() and written
    under the 'selfcites_weights' key of the client returned by get_redis().
    """
    store = get_redis()
    payload = serialize_via_marshal(weights)
    store.set('selfcites_weights', payload)
Event Timeline
Log In to Comment