Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F82459346
citation_indexer.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Wed, Sep 11, 13:20
Size
38 KB
Mime Type
text/x-python
Expires
Fri, Sep 13, 13:20 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
20702186
Attached To
R3600 invenio-infoscience
citation_indexer.py
View Options
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
__revision__
=
"$Id$"
import
re
import
time
import
os
import
sys
import
ConfigParser
from
itertools
import
islice
from
datetime
import
datetime
from
six
import
iteritems
from
invenio.legacy.dbquery
import
run_sql
,
serialize_via_marshal
,
\
deserialize_via_marshal
from
invenio.modules.indexer.tokenizers.BibIndexJournalTokenizer
import
\
CFG_JOURNAL_PUBINFO_STANDARD_FORM
,
\
CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK
from
invenio.legacy.search_engine
import
search_pattern
,
search_unit
from
invenio.legacy.bibrecord
import
get_fieldvalues
from
invenio.modules.formatter.utils
import
parse_tag
from
invenio.modules.knowledge.api
import
get_kb_mappings
from
invenio.legacy.bibsched.bibtask
import
write_message
,
task_get_option
,
\
task_update_progress
,
task_sleep_now_if_required
,
\
task_get_task_param
from
invenio.ext.logging
import
register_exception
from
invenio.legacy.bibindex.engine
import
get_field_tags
# Set of records matching 'DELETED' in field 980. It is computed once, when
# this module is imported, and is subtracted from search results / used to
# skip records by the functions below.
INTBITSET_OF_DELETED_RECORDS = search_unit(p='DELETED', f='980', m='a')

# Pre-compiled well-formedness check for journal references; ref_analyzer()
# matches every journal reference against it before searching for it.
re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK = re.compile(CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK)
def get_recids_matching_query(p, f, m='e'):
    """Search for pattern `p` in field `f`, leaving out deleted records.

    The matching type `m` is handed to search_pattern() unchanged; records
    listed in INTBITSET_OF_DELETED_RECORDS are removed from the result.
    """
    hits = search_pattern(p=p, f=f, m=m)
    return hits - INTBITSET_OF_DELETED_RECORDS
def _summarize_recids(recids, limit=10000, edge=10):
    """Render a recid list for logging, eliding the middle of long lists."""
    if len(recids) > limit:
        return str(recids[:edge]) + ' ... ' + str(recids[-edge:])
    return str(recids)


def get_citation_weight(rank_method_code, config, chunk_size=20000):
    """Recompute citation data for new or explicitly requested records.

    The records to process are either the ranges given in the "id" task
    option or, failing that, the records modified between the last run of
    the rank method and the last update of the journal/reportnumber indexes.

    Parameters:
        rank_method_code - name of the rank method (rnkMETHOD.name)
        config - parsed configuration of the rank method
        chunk_size - number of records handed to process_chunk() at a time

    Returns a tuple (cites_weight, index_update_time). cites_weight is the
    'cites_weight' dictionary after processing, or {} when there was nothing
    to process. index_update_time is None when the "id" option was used.
    """
    begin_time = time.time()

    # Anything but an explicit "no" means: update the stored dictionaries
    # instead of rebuilding them from scratch.
    quick = task_get_option("quick") != "no"

    # id option forces re-indexing a certain range
    # even if there are no new recs
    if task_get_option("id"):
        # construct a range of records to index
        updated_recids = []
        for first, last in task_get_option("id"):
            updated_recids.extend(range(first, last + 1))
        write_message('Records to process: %s'
                      % _summarize_recids(updated_recids))
        index_update_time = None
    else:
        bibrank_update_time = get_bibrankmethod_lastupdate(rank_method_code)
        if not quick:
            bibrank_update_time = "0000-00-00 00:00:00"
        write_message("bibrank: %s" % bibrank_update_time)
        index_update_time = get_bibindex_update_time()
        write_message("bibindex: %s" % index_update_time)
        # An index timestamp lying in the future is replaced by the zero
        # date, which makes the query below select no records.
        if index_update_time > datetime.now().strftime("%Y-%m-%d %H:%M:%S"):
            index_update_time = "0000-00-00 00:00:00"
        updated_recids = get_modified_recs(bibrank_update_time,
                                           index_update_time)
        write_message("%s records to update"
                      % _summarize_recids(updated_recids))

    if updated_recids:
        # result_intermediate should be warranted to exists!
        # but if the user entered a "-R" (do all) option, we need to
        # make an empty start set
        if quick:
            dicts = {
                'cites_weight': last_updated_result(rank_method_code),
                'cites': get_cit_dict("citationdict"),
                'refs': get_cit_dict("reversedict"),
                # Keep these two in sync with store_dicts(), which writes
                # 'selfcites' to "selfcitedbydict" and 'selfrefs' to
                # "selfcitdict". Loading them the other way round swapped
                # the two stored dictionaries on every quick run.
                'selfcites': get_cit_dict("selfcitedbydict"),
                'selfrefs': get_cit_dict("selfcitdict"),
                'authorcites': get_initial_author_dict(),
            }
        else:
            dicts = {
                'cites_weight': {},
                'cites': {},
                'refs': {},
                'selfcites': {},
                'selfrefs': {},
                'authorcites': {},
            }

        # Process fully the updated records
        process_and_store(updated_recids, config, dicts, chunk_size, quick)

        end_time = time.time()
        write_message("Total time of get_citation_weight(): %.2f sec"
                      % (end_time - begin_time))
        task_update_progress("citation analysis done")
        cites_weight = dicts['cites_weight']
    else:
        cites_weight = {}
        write_message("No new records added since last time this "
                      "rank method was executed")

    return cites_weight, index_update_time
def process_and_store(recids, config, dicts, chunk_size, quick):
    """Run process_chunk() over `recids` in chunks and persist the results.

    Records are handled from the highest recid downwards, `chunk_size` at a
    time, so that memory use stays bounded. In quick mode the dictionaries
    are stored after every chunk (it is only an update); otherwise they are
    stored once, after the last chunk.
    """
    # Recent records first: older ones were most likely queued only to be
    # reprocessed and carry minor changes.
    pending = iter(sorted(recids, reverse=True))

    def next_chunk():
        """Give the scheduler a chance to pause us, then cut a chunk."""
        task_sleep_now_if_required()
        return list(islice(pending, chunk_size))

    # iter() with a sentinel stops at the first empty chunk
    for chunk in iter(next_chunk, []):
        write_message("Processing chunk #%s to #%s" % (chunk[0], chunk[-1]))
        # dicts are modified in-place
        process_chunk(chunk, config, dicts)
        if quick:
            # Partial results are safe to store: this is an update, not a
            # creation from scratch
            store_dicts(dicts)

    if not quick:
        store_dicts(dicts)
def process_chunk(recids, config, dicts):
    """Re-analyze `recids` and propagate lost links to the other side.

    process_inner() recomputes the citations and references of the given
    records inside `dicts`. Links that existed before but are gone afterwards
    are then removed from the opposite dictionary as well, and the citation
    weight of records that lost a citer is recomputed. `dicts` is modified
    in place; nothing is returned.
    """
    weights = dicts['cites_weight']
    cited_by = dicts['cites']
    cites_to = dicts['refs']

    # Snapshot the state before the records are re-analyzed
    refs_before = dict((rec, set(cites_to.get(rec, []))) for rec in recids)
    cites_before = dict((rec, set(cited_by.get(rec, []))) for rec in recids)

    process_inner(recids, config, dicts)

    # Records that used to cite an updated record but no longer do: drop the
    # updated record from their reference lists. (Newly gained links were
    # already added by process_inner.)
    for updated in recids:
        lost_citers = set(cites_before[updated]) \
                      - set(cited_by.get(updated, []))
        for citer in lost_citers:
            remaining = list(set(cites_to.get(citer, [])) - set([updated]))
            cites_to[citer] = remaining
            if not remaining:
                del cites_to[citer]

    # Records an updated record used to reference but no longer does: drop
    # the updated record from their citer lists and fix their weight.
    for updated in recids:
        lost_refs = set(refs_before[updated]) \
                    - set(cites_to.get(updated, []))
        for target in lost_refs:
            remaining = list(set(cited_by.get(target, [])) - set([updated]))
            cited_by[target] = remaining
            weights[target] = len(remaining)
            if not remaining:
                del cited_by[target]
                del weights[target]
def process_inner(recids, config, dicts, do_catchup=True):
    """Collect citation data for `recids` and merge it into `dicts`.

    The tag configuration is read from `config`, the relevant field values
    of the records are fetched, and ref_analyzer() resolves them into
    citer/cited pairs. `do_catchup` is forwarded to both steps. Returns
    whatever ref_analyzer() returns.
    """
    tag_config = get_tags_config(config)

    # Hard work, part one: read the citation and reference fields of the
    # given records (and nothing else)
    write_message("Entering get_citation_informations", verbose=9)
    harvested = get_citation_informations(recids,
                                          tag_config,
                                          fetch_catchup_info=do_catchup)

    # Part two: use the harvested values to really search x-cites-y in the
    # collection
    write_message("Entering ref_analyzer", verbose=9)
    outcome = ref_analyzer(harvested,
                           dicts,
                           recids,
                           tag_config,
                           do_catchup=do_catchup)
    return outcome
def get_bibrankmethod_lastupdate(rank_method_code):
    """Return the last execution date of the given bibrank method.

    The date is read from rnkMETHOD.last_updated and formatted as
    'YYYY-MM-DD HH:MM:SS'. When the method has no row, the zero date
    "0000-00-00 00:00:00" is returned instead.
    """
    rows = run_sql(
        """SELECT DATE_FORMAT(last_updated, '%%Y-%%m-%%d %%H:%%i:%%s')
               FROM rnkMETHOD WHERE name =%s""",
        [rank_method_code])
    try:
        return rows[0][0]
    except IndexError:
        return "0000-00-00 00:00:00"
def get_bibindex_update_time():
    """Return the older of the `journal` and `reportnumber` index timestamps.

    The value is formatted as 'YYYY-MM-DD HH:MM:SS'. When it cannot be
    determined (the indexes are not created yet), a message is logged and
    the zero date "0000-00-00 00:00:00" is returned.
    """
    # check indexing times of `journal' and `reportnumber`
    # indexes, and only fetch records which have been indexed
    sql = "SELECT DATE_FORMAT(MIN(last_updated), " \
          "'%%Y-%%m-%%d %%H:%%i:%%s') FROM idxINDEX WHERE name IN (%s,%s)"
    rows = run_sql(sql, ('journal', 'reportnumber'), 1)
    try:
        index_update_time = rows[0][0]
    except IndexError:
        index_update_time = None

    # An aggregate query without GROUP BY yields one row even when no index
    # matches; the value is then NULL/None, so checking for IndexError alone
    # never detects the missing indexes.
    if index_update_time is None:
        write_message("Not running citation indexer since journal/reportnumber"
                      " indexes are not created yet.")
        index_update_time = "0000-00-00 00:00:00"
    return index_update_time
def get_modified_recs(bibrank_method_lastupdate, indexes_lastupdate):
    """List the records that bibrank indexing has to update.

    These are the records whose modification date is at or after
    `bibrank_method_lastupdate` and strictly before `indexes_lastupdate`.
    The recids are returned as a list in ascending order.
    """
    rows = run_sql(
        """SELECT id FROM bibrec
               WHERE modification_date >= %s
               AND modification_date < %s
               ORDER BY id ASC""",
        (bibrank_method_lastupdate, indexes_lastupdate))
    recids = []
    for row in rows:
        recids.append(row[0])
    return recids
def last_updated_result(rank_method_code):
    """Return the dictionary stored in rnkMETHODDATA for the rank method.

    The stored relevance_data of the method named `rank_method_code` is
    deserialized and returned; an empty dictionary is returned when nothing
    is stored for it.
    """
    # The method name is passed as a query parameter (as the other queries
    # in this module do) instead of being pasted into the SQL text.
    query = """SELECT relevance_data FROM rnkMETHOD, rnkMETHODDATA WHERE
               rnkMETHOD.id = rnkMETHODDATA.id_rnkMETHOD
               AND rnkMETHOD.Name = %s"""
    try:
        rdict = run_sql(query, (rank_method_code,))[0][0]
    except IndexError:
        dic = {}
    else:
        dic = deserialize_via_marshal(rdict)
    return dic
def format_journal(format_string, mappings):
    """Expand the publication info format string.

    Every character of `format_string` that is a key of `mappings` is
    replaced by the mapped value; all other characters are copied as is.
    """
    pieces = [mappings.get(char, char) for char in format_string]
    return ''.join(pieces)
def get_tags_config(config):
    """Read the MARC tags used for citation extraction from `config`.

    The section to read is named by the "function" option of the
    "rank_method" section. Options that are missing yield None, except the
    journal format, which falls back to CFG_JOURNAL_PUBINFO_STANDARD_FORM.
    Returns a dictionary with the keys 'record_pri_number',
    'record_add_number', 'refs_report_number', 'refs_journal', 'refs_doi',
    'publication', 'doi' and 'publication_format'.
    """
    # Probably "citation" unless this file gets renamed
    function = config.get("rank_method", "function")
    write_message("config function %s" % function, verbose=9)

    def optional_tag(option):
        """Return the normalized tag stored under `option`, None if unset."""
        try:
            raw = config.get(function, option)
        except ConfigParser.NoOptionError:
            return None
        return tagify(parse_tag(raw))

    single_tags = (
        # 037a: contains (often) the "hep-ph/0501084" tag of THIS record
        ('record_pri_number', "primary_report_number"),
        # 088a: additional short identifier for the record
        ('record_add_number', "additional_report_number"),
        # 999C5r: reference list entry, looks like: hep-ph/0408002
        ('refs_report_number', "reference_via_report_number"),
        # 999C5s: reference list entry, looks like: Phys.Rev.,A21,78
        ('refs_journal', "reference_via_pubinfo"),
        # 999C5a: reference list entry, looks like: 10.1007/BF03170733
        ('refs_doi', "reference_via_doi"),
    )
    tags = {}
    for key, option in single_tags:
        tags[key] = optional_tag(option)

    # Fields needed to construct the journals for this record; it is all or
    # nothing: one missing option disables the whole group
    pubinfo_options = (
        ('pages', "pubinfo_journal_page"),
        ('year', "pubinfo_journal_year"),
        ('journal', "pubinfo_journal_title"),
        ('volume', "pubinfo_journal_volume"),
    )
    try:
        raw_pubinfo = [(part, config.get(function, option))
                       for part, option in pubinfo_options]
    except ConfigParser.NoOptionError:
        tags['publication'] = None
    else:
        publication = {}
        for part, raw in raw_pubinfo:
            publication[part] = tagify(parse_tag(raw))
        tags['publication'] = publication

    # Fields needed to lookup the DOIs
    tags['doi'] = get_field_tags('doi')

    # 999C5s. A standardized way of writing a reference in the reference
    # list. Like: Nucl. Phys. B 710 (2000) 371
    try:
        publication_format = config.get(function, "pubinfo_journal_format")
    except ConfigParser.NoOptionError:
        publication_format = CFG_JOURNAL_PUBINFO_STANDARD_FORM
    tags['publication_format'] = publication_format

    # Print values of tags for debugging
    write_message("tag values: %r" % [tags], verbose=9)

    return tags
def get_journal_info(recid, tags):
    """Build the journal reference strings under which `recid` may be cited.

    The first journal title, volume, year and page value of the record are
    formatted with tags['publication_format']. Variants are added for the
    alternative volume spelling (see get_alt_volume) and for every CODENS
    knowledge base mapping of the journal title.

    Returns a list of strings; it is empty when the record lacks a field
    that the format requires.
    """
    record_info = []
    # TODO: handle records with multiple journals
    # Values keyed by their format character, like c->444 y->1999
    # p->"journal of foo", v->20
    tagsvalues = {}
    publication_tags = tags['publication']
    for letter, part in (('p', 'journal'), ('v', 'volume'), ('y', 'year')):
        values = get_fieldvalues(recid, publication_tags[part])
        if values:
            tagsvalues[letter] = values[0]

    values = get_fieldvalues(recid, publication_tags['pages'])
    if values:
        # if the page numbers have "x-y" take just x
        pages = values[0]
        hpos = pages.find("-")
        if hpos > 0:
            pages = pages[:hpos]
        tagsvalues["c"] = pages

    # check if we have the required data
    publication_format = tags['publication_format']
    required = [c for c in publication_format if c in ('p', 'v', 'y', 'c')]
    if not all(c in tagsvalues for c in required):
        return record_info

    record_info.append(format_journal(publication_format, tagsvalues))

    # The format need not mention the volume or the title, in which case
    # the record may legitimately lack them: look them up defensively
    # instead of raising KeyError.
    volume = tagsvalues.get('v')
    alt_volume = get_alt_volume(volume) if volume else None
    if alt_volume:
        variant = tagsvalues.copy()
        variant['v'] = alt_volume
        record_info.append(format_journal(publication_format, variant))

    # Add codens
    if 'p' in tagsvalues:
        for coden in get_kb_mappings('CODENS', value=tagsvalues['p']):
            variant = tagsvalues.copy()
            variant['p'] = coden['key']
            record_info.append(format_journal(publication_format, variant))

    return record_info
def get_alt_volume(volume):
    """Return the alternative spelling of a lettered volume, or None.

    A volume starting with a letter followed by digits gets the letter moved
    to the end ("A21" -> "21A"); a volume starting with digits followed by a
    letter gets its last character moved to the front ("21A" -> "A21").
    Anything else yields None.
    """
    # Plain raw strings instead of ur'' literals: the latter are a syntax
    # error on Python 3, and the patterns are pure ASCII anyway.
    alt_volume = None
    if re.match(r'[a-zA-Z]\d+', volume, re.U | re.I):
        alt_volume = volume[1:] + volume[0]
    elif re.match(r'\d+[a-zA-Z]', volume, re.U | re.I):
        alt_volume = volume[-1] + volume[:-1]
    return alt_volume
def get_citation_informations(recid_list, tags, fetch_catchup_info=True):
    """Read the reference and identification fields of the given records.

    For every record in `recid_list` that is not deleted, the values of the
    reference tags (report number, journal, DOI) are collected. With
    `fetch_catchup_info`, the record's own report numbers, DOIs and journal
    strings are collected as well.

    Returns a tuple (records_info, references_info). Both are dictionaries
    with the keys 'report-numbers', 'journals' and 'doi', each mapping
    recid -> list of values, e.g.

        references_info['report-numbers'] = {93: ['astro-ph/9812088']}
        references_info['journals'] = {93: ['Phys.Rev.,A21,78']}

    NB: stuff here is for analysing new or changed records.
    see "ref_analyzer" for more.
    """
    begin_time = os.times()[4]

    records_info = {
        'report-numbers': {},
        'journals': {},
        'doi': {},
    }

    references_info = {
        'report-numbers': {},
        'journals': {},
        'doi': {},
    }

    def tag_has_values(tag):
        """Tell whether at least one record carries a value for `tag`."""
        if not tag:
            # Tag not configured (None): nothing to look for. Subscripting
            # it, as was done before, raised TypeError.
            return False
        # The bibXXx table name is derived from the first two tag digits
        return bool(run_sql("SELECT value FROM bib%sx WHERE tag=%%s LIMIT 1"
                            % tag[0:2], (tag, )))

    # perform quick check to see if there are some records with
    # reference tags, because otherwise get.cit.inf would be slow even
    # if there is nothing to index:
    if tag_has_values(tags['refs_journal']) or \
       tag_has_values(tags['refs_report_number']):

        # `done` is only used for status reporting
        for done, recid in enumerate(recid_list):
            if done % 10 == 0:
                task_sleep_now_if_required()
                # in fact we can sleep any time here

            if done % 1000 == 0:
                mesg = "get cit.inf done %s of %s" % (done, len(recid_list))
                write_message(mesg)
                task_update_progress(mesg)

            if recid in INTBITSET_OF_DELETED_RECORDS:
                # do not treat this record since it was deleted; we
                # skip it like this in case it was only soft-deleted
                # e.g. via bibedit (i.e. when collection tag 980 is
                # DELETED but other tags like report number or journal
                # publication info remained the same, so the calls to
                # get_fieldvalues() below would return old values)
                continue

            if tags['refs_report_number']:
                references_info['report-numbers'][recid] \
                    = get_fieldvalues(recid,
                                      tags['refs_report_number'],
                                      sort=False)
                msg = "references_info['report-numbers'][%s] = %r" \
                      % (recid, references_info['report-numbers'][recid])
                write_message(msg, verbose=9)

            if tags['refs_journal']:
                references_info['journals'][recid] = []
                for ref in get_fieldvalues(recid,
                                           tags['refs_journal'],
                                           sort=False):
                    try:
                        # Inspire specific parsing
                        journal, volume, page = ref.split(',')
                    except ValueError:
                        # Not of the form journal,volume,page: keep only
                        # the reference as it is
                        pass
                    else:
                        alt_volume = get_alt_volume(volume)
                        if alt_volume:
                            alt_ref = ','.join([journal, alt_volume, page])
                            references_info['journals'][recid] += [alt_ref]
                    references_info['journals'][recid] += [ref]
                msg = "references_info['journals'][%s] = %r" \
                      % (recid, references_info['journals'][recid])
                write_message(msg, verbose=9)

            if tags['refs_doi']:
                references_info['doi'][recid] \
                    = get_fieldvalues(recid, tags['refs_doi'], sort=False)
                msg = "references_info['doi'][%s] = %r" \
                      % (recid, references_info['doi'][recid])
                write_message(msg, verbose=9)

            if not fetch_catchup_info:
                # We do not need the extra info
                continue

            if tags['record_pri_number'] or tags['record_add_number']:
                records_info['report-numbers'][recid] = []

                if tags['record_pri_number']:
                    records_info['report-numbers'][recid] \
                        += get_fieldvalues(recid,
                                           tags['record_pri_number'],
                                           sort=False)
                if tags['record_add_number']:
                    records_info['report-numbers'][recid] \
                        += get_fieldvalues(recid,
                                           tags['record_add_number'],
                                           sort=False)

                msg = "records_info[%s]['report-numbers'] = %r" \
                      % (recid, records_info['report-numbers'][recid])
                write_message(msg, verbose=9)

            if tags['doi']:
                records_info['doi'][recid] = []
                for tag in tags['doi']:
                    records_info['doi'][recid] += get_fieldvalues(recid,
                                                                  tag,
                                                                  sort=False)
                msg = "records_info[%s]['doi'] = %r" \
                      % (recid, records_info['doi'][recid])
                write_message(msg, verbose=9)

            # get a combination of
            # journal vol (year) pages
            if tags['publication']:
                records_info['journals'][recid] = get_journal_info(recid, tags)
                msg = "records_info[%s]['journals'] = %r" \
                      % (recid, records_info['journals'][recid])
                write_message(msg, verbose=9)

    else:
        mesg = "Warning: there are no records with tag values for " \
               "%s or %s. Nothing to do." % \
               (tags['refs_journal'], tags['refs_report_number'])
        write_message(mesg)

    mesg = "get cit.inf done fully"
    write_message(mesg)
    task_update_progress(mesg)

    end_time = os.times()[4]
    write_message("Execution time for generating citation info "
                  "from record: %.2f sec" % (end_time - begin_time))

    return records_info, references_info
def standardize_report_number(report_number):
    """Normalize an arXiv report number by dropping its category suffix.

    "1234.5678 [hep-th]" and "arXiv:1234.5678 [hep-th]" both become
    "arXiv:1234.5678" (the prefix is matched case-insensitively). Report
    numbers that do not look like that are returned unchanged.
    """
    # Remove category for arxiv papers.
    # The flags must be passed by keyword: the fourth positional parameter
    # of re.sub() is `count`, so passing them positionally silently disabled
    # case-insensitive matching.
    report_number = re.sub(r'(?:arXiv:)?(\d{4}\.\d{4}) \[[a-zA-Z\.-]+\]',
                           r'arXiv:\g<1>',
                           report_number,
                           flags=re.I | re.U)
    return report_number
def ref_analyzer(citation_informations, dicts,
                 updated_recids, tags, do_catchup=True):
    """Resolve the collected citation information into citer/cited pairs.

    Parameters:
        citation_informations - the (records_info, references_info) tuple
            built by get_citation_informations()
        dicts - dictionary holding the 'cites_weight', 'cites', 'refs',
            'selfcites', 'selfrefs' and 'authorcites' dictionaries; the
            first three are updated in place
        updated_recids - the records being (re)analyzed
        tags - tag configuration as returned by get_tags_config()
        do_catchup - when true, the citer lists of the updated records are
            emptied first, to be rebuilt by the catchup phases

    Phases 1-3 look up what the updated records reference (by report
    number, journal, DOI); phases 4-6 look up who references the updated
    records (same three keys). References that match nothing are recorded
    with insert_into_missing().

    Returns the tuple (citations_weight, citations, references, selfcites,
    selfrefs, authorcites).
    """
    citations_weight = dicts['cites_weight']
    citations = dicts['cites']
    references = dicts['refs']
    selfcites = dicts['selfcites']
    selfrefs = dicts['selfrefs']
    authorcites = dicts['authorcites']

    def step(msg_prefix, recid, done, total):
        """Report progress and let the scheduler pause us now and then."""
        if done % 30 == 0:
            task_sleep_now_if_required()

        if done % 1000 == 0:
            mesg = "%s done %s of %s" % (msg_prefix, done, total)
            write_message(mesg)
            task_update_progress(mesg)

        write_message("Processing: %s" % recid, verbose=9)

    def add_to_dicts(citer, cited):
        """Record that `citer` cites `cited` in all three dictionaries."""
        # Make sure we don't add ourselves
        # Workaround till we know why we are adding ourselves.
        if citer == cited:
            return

        if cited not in citations_weight:
            citations_weight[cited] = 0
        # Citations and citations weight
        if citer not in citations.setdefault(cited, []):
            citations[cited].append(citer)
            citations_weight[cited] += 1
        # References
        if cited not in references.setdefault(citer, []):
            references[citer].append(cited)

    # dict of recid -> institute_give_publ_id
    records_info, references_info = citation_informations

    t1 = os.times()[4]

    write_message("Phase 0: temporarily remove changed records from "
                  "citation dictionaries; they will be filled later")
    if do_catchup:
        for somerecid in updated_recids:
            citations.pop(somerecid, None)
            # The weight is rebuilt by add_to_dicts() together with the
            # citer list. Keeping the old value would count every citer
            # found again by the catchup phases a second time.
            citations_weight.pop(somerecid, None)

    for somerecid in updated_recids:
        references.pop(somerecid, None)

    # Try to find references based on 999C5r
    # e.g 8 -> ([astro-ph/9889],[hep-ph/768])
    # meaning: rec 8 contains these in bibliography
    write_message("Phase 1: Report numbers references")
    done = 0
    for thisrecid, refnumbers in iteritems(references_info['report-numbers']):
        step("Report numbers references", thisrecid, done,
             len(references_info['report-numbers']))
        done += 1

        for refnumber in (r for r in refnumbers if r):
            field = 'reportnumber'
            refnumber = standardize_report_number(refnumber)
            # Search for "hep-th/5644654 or such" in existing records
            recids = get_recids_matching_query(p=refnumber, f=field)
            write_message("These match searching %s in %s: %s" %
                          (refnumber, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, refnumber)
            else:
                remove_from_missing(refnumber)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', refnumber)
                msg = "Whoops: record '%d' report number value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, refnumber, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t2 = os.times()[4]

    # Try to find references based on 999C5s
    # e.g. Phys.Rev.Lett. 53 (1986) 2285
    write_message("Phase 2: Journal references")
    done = 0
    for thisrecid, refs in iteritems(references_info['journals']):
        step("Journal references", thisrecid, done,
             len(references_info['journals']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'journal'

            # check reference value to see whether it is well formed:
            if not re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK.match(p):
                store_citation_warning('not-well-formed', p)
                msg = "Whoops, record '%d' reference value '%s' " \
                      "is not well formed; skipping it." % (thisrecid, p)
                write_message(msg, stream=sys.stderr)
                continue  # skip this ill-formed value

            recids = search_unit(p, field) - INTBITSET_OF_DELETED_RECORDS
            write_message("These match searching %s in %s: %s"
                          % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' reference value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t3 = os.times()[4]

    # Try to find references based on 999C5a
    # e.g. 10.1007/BF03170733
    write_message("Phase 3: DOI references")
    done = 0
    for thisrecid, refs in iteritems(references_info['doi']):
        step("DOI references", thisrecid, done, len(references_info['doi']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'doi'

            recids = get_recids_matching_query(p, field)
            write_message("These match searching %s in %s: %s"
                          % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' DOI value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t4 = os.times()[4]

    # Search for stuff like CERN-TH-4859/87 in list of refs
    write_message("Phase 4: report numbers catchup")
    done = 0
    for thisrecid, reportcodes in iteritems(records_info['report-numbers']):
        step("Report numbers catchup", thisrecid, done,
             len(records_info['report-numbers']))
        done += 1

        for reportcode in (r for r in reportcodes if r):
            if reportcode.startswith('arXiv'):
                # References may carry an optional "[category]" suffix, so
                # arXiv numbers are searched by regular expression
                std_reportcode = standardize_report_number(reportcode)
                report_pattern = r'^%s( *\[[a-zA-Z.-]*\])?' % \
                                 re.escape(std_reportcode)
                recids = get_recids_matching_query(report_pattern,
                                                   tags['refs_report_number'],
                                                   'r')
            else:
                recids = get_recids_matching_query(reportcode,
                                                   tags['refs_report_number'],
                                                   'e')
            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    # Find this record's pubinfo in other records' bibliography
    write_message("Phase 5: journals catchup")
    done = 0
    t5 = os.times()[4]
    for thisrecid, rec_journals in iteritems(records_info['journals']):
        step("Journals catchup", thisrecid, done,
             len(records_info['journals']))
        done += 1

        for journal in rec_journals:
            journal = journal.replace("\"", "")
            # Search the publication string like
            # Phys. Lett., B 482 (2000) 417 in 999C5s
            recids = search_unit(p=journal, f=tags['refs_journal'], m='a') \
                     - INTBITSET_OF_DELETED_RECORDS
            write_message("These records match %s in %s: %s"
                          % (journal, tags['refs_journal'], list(recids)),
                          verbose=9)

            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 6: DOI catchup")
    done = 0
    t6 = os.times()[4]
    for thisrecid, dois in iteritems(records_info['doi']):
        step("DOI catchup", thisrecid, done, len(records_info['doi']))
        done += 1

        for doi in dois:
            # Search the DOI of this record in 999C5a of other records
            recids = search_unit(p=doi, f=tags['refs_doi'], m='a') \
                     - INTBITSET_OF_DELETED_RECORDS
            write_message("These records match %s in %s: %s"
                          % (doi, tags['refs_doi'], list(recids)), verbose=9)

            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 7: remove empty lists from dicts")

    # Remove empty lists in citation and reference. The keys are copied
    # first: deleting while iterating over a live keys() view raises
    # RuntimeError on Python 3.
    for k in list(citations.keys()):
        if not citations[k]:
            del citations[k]

    for k in list(references.keys()):
        if not references[k]:
            del references[k]

    if task_get_task_param('verbose') >= 3:
        # Print only X first to prevent flood
        write_message("citation_list (x is cited by y):")
        write_message(dict(islice(iteritems(citations), 10)))
        write_message("size: %s" % len(citations))
        write_message("reference_list (x cites y):")
        write_message(dict(islice(iteritems(references), 10)))
        write_message("size: %s" % len(references))
        write_message("selfcitedbydic (x is cited by y and one of the "
                      "authors of x same as y's):")
        write_message(dict(islice(iteritems(selfcites), 10)))
        write_message("size: %s" % len(selfcites))
        write_message("selfdic (x cites y and one of the authors of x "
                      "same as y's):")
        write_message(dict(islice(iteritems(selfrefs), 10)))
        write_message("size: %s" % len(selfrefs))
        write_message("authorcitdic (author is cited in recs):")
        write_message(dict(islice(iteritems(authorcites), 10)))
        write_message("size: %s" % len(authorcites))

    t7 = os.times()[4]

    write_message("Execution time for analyzing the citation information "
                  "generating the dictionary:")
    write_message("... checking ref report numbers: %.2f sec" % (t2-t1))
    write_message("... checking ref journals: %.2f sec" % (t3-t2))
    write_message("... checking ref DOI: %.2f sec" % (t4-t3))
    write_message("... checking rec report numbers: %.2f sec" % (t5-t4))
    write_message("... checking rec journals: %.2f sec" % (t6-t5))
    write_message("... checking rec DOI: %.2f sec" % (t7-t6))
    write_message("... total time of ref_analyze: %.2f sec" % (t7-t1))

    return citations_weight, citations, references, selfcites, \
           selfrefs, authorcites
def store_dicts(dicts):
    """Write the reference and citation dictionaries to the database.

    Four entries of `dicts` are stored, each under its own object name;
    'cites_weight' and 'authorcites' are not stored here.
    """
    targets = (
        ('refs', "reversedict"),
        ('cites', "citationdict"),
        ('selfcites', "selfcitedbydict"),
        ('selfrefs', "selfcitdict"),
    )
    for key, object_name in targets:
        insert_into_cit_db(dicts[key], object_name)
def insert_into_cit_db(dic, name):
    """Store the dictionary `dic` in rnkCITATIONDATA under `name`.

    The dictionary is serialized, its serialized size is logged, and the row
    is written with REPLACE, stamped with the current local time.
    """
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    blob = serialize_via_marshal(dic)
    write_message("size of %s %s" % (name, len(blob)))
    values = (name, blob, timestamp)
    run_sql("""REPLACE INTO rnkCITATIONDATA(object_name, object_value,
               last_updated) VALUES (%s, %s, %s)""", values)
def get_cit_dict(name):
    """Load the citation dictionary stored under `name`.

    Returns the deserialized object_value of the matching rnkCITATIONDATA
    row, or an empty dictionary when there is no row or its value is empty.
    """
    rows = run_sql("""SELECT object_value FROM rnkCITATIONDATA
                      WHERE object_name = %s""", (name, ))
    if not (rows and rows[0] and rows[0][0]):
        return {}
    return deserialize_via_marshal(rows[0][0])
def get_initial_author_dict():
    """Read the author -> cited-in-list dictionary from rnkAUTHORDATA.

    Returns a dictionary mapping each aterm to its deserialized hitlist.
    If reading or deserializing fails, the exception is registered (with an
    alert to the admin) and an empty dictionary is returned.
    """
    adict = {}
    try:
        ah = run_sql("SELECT aterm,hitlist FROM rnkAUTHORDATA")
        for (a, h) in ah:
            adict[a] = deserialize_via_marshal(h)
        return adict
    except Exception:
        # Best effort only, but do not trap KeyboardInterrupt/SystemExit the
        # way a bare "except:" would.
        register_exception(prefix="could not read rnkAUTHORDATA",
                           alert_admin=True)
        return {}
def insert_into_missing(recid, report):
    """Remember that `recid` references `report`, which matched no record.

    The pair is added to rnkCITATIONDATAEXT unless it is already there.
    Reports of 255 characters or more are ignored.
    """
    # Too long to be a valid report: it would not fit in the database
    # column (currently varchar 255)
    if len(report) >= 255:
        return

    pair = (recid, report)
    already_there = run_sql("""SELECT id_bibrec
                               FROM rnkCITATIONDATAEXT
                               WHERE id_bibrec = %s
                               AND extcitepubinfo = %s""", pair)
    if already_there:
        return
    run_sql("""INSERT INTO rnkCITATIONDATAEXT(id_bibrec, extcitepubinfo)
               VALUES (%s,%s)""", pair)
def remove_from_missing(report):
    """Delete every rnkCITATIONDATAEXT row recorded for `report`.

    Called once the report matches a record, i.e. it is no longer missing.
    """
    query = """DELETE FROM rnkCITATIONDATAEXT
               WHERE extcitepubinfo = %s"""
    run_sql(query, (report,))
def create_analysis_tables():
    """Create the temporary tmpcit (citer, cited) table and its indexes."""
    # ENGINE= replaces the TYPE= table option, which MySQL removed in 5.5.
    # The statement is assembled from adjacent literals so that no source
    # indentation ends up inside the SQL text.
    statements = (
        "CREATE TABLE IF NOT EXISTS tmpcit (citer mediumint(10), "
        "cited mediumint(10)) ENGINE=MyISAM",
        "CREATE UNIQUE INDEX citercited ON tmpcit(citer, cited)",
        "CREATE INDEX citer ON tmpcit(citer)",
        "CREATE INDEX cited ON tmpcit(cited)",
    )
    for statement in statements:
        run_sql(statement)
def write_citer_cited(citer, cited):
    """Insert one (citer, cited) pair into the tmpcit table."""
    pair = (citer, cited)
    run_sql("INSERT INTO tmpcit(citer, cited) VALUES (%s,%s)", pair)
def print_missing(num):
    """Print the external records cited by NUM or more internal records.

    The data comes from the rnkCITATIONDATAEXT table. One line per external
    record is printed to standard output: the number of citing records, a
    tab, and the citation string, most cited first.
    When NUM is false, it is taken from the "print-extcites" task option.
    """
    if not num:
        num = task_get_option("print-extcites")

    # Adjacent literals instead of a backslash inside the string, which
    # pulled the indentation of the continuation line into the message.
    write_message("Listing external papers cited by %i or more "
                  "internal records:" % num)

    res = run_sql("""SELECT COUNT(id_bibrec), extcitepubinfo
                     FROM rnkCITATIONDATAEXT
                     GROUP BY extcitepubinfo HAVING COUNT(id_bibrec) >= %s
                     ORDER BY COUNT(id_bibrec) DESC""", (num,))
    for (cnt, brec) in res:
        # A single parenthesized argument prints the same under the
        # Python 2 print statement and the Python 3 print function.
        print(str(cnt) + "\t" + brec)

    write_message("Listing done.")
def tagify(parsedtag):
    """Join a parsed tag into its string form, '_' standing for empty parts.

    Example: ['100', '', '', 'a'] gives '100__a'.
    """
    return "".join('_' if part == '' else part for part in parsedtag)
def store_citation_warning(warning_type, cit_info):
    """Record a citation warning in rnkCITATIONDATAERR, once per pair.

    Nothing is inserted when a row with the same `warning_type` and
    `cit_info` already exists.
    """
    entry = (warning_type, cit_info)
    known = run_sql("""SELECT 1 FROM rnkCITATIONDATAERR
                       WHERE type = %s
                       AND citinfo = %s""", entry)
    if known:
        return
    run_sql("""INSERT INTO rnkCITATIONDATAERR (type, citinfo)
               VALUES (%s, %s)""", entry)
Event Timeline
Log In to Comment