Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F91058248
dbinterface.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Thu, Nov 7, 10:27
Size
107 KB
Mime Type
text/x-python
Expires
Sat, Nov 9, 10:27 (2 d)
Engine
blob
Format
Raw Data
Handle
22187746
Attached To
R3600 invenio-infoscience
dbinterface.py
View Options
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2011, 2012, 2013 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
'''
bibauthorid_dbinterface
This is the only file in bibauthorid which should
use the database. It should provide an interface for
all other files in the module.
'''
import
invenio.bibauthorid_config
as
bconfig
import
numpy
import
cPickle
from
cPickle
import
UnpicklingError
from
invenio.utils.html
import
X
import
os
import
gc
#python2.4 compatibility
from
invenio.bibauthorid_general_utils
import
bai_all
as
all
from
itertools
import
groupby
,
count
,
ifilter
,
chain
,
imap
from
operator
import
itemgetter
from
invenio.legacy.search_engine
import
perform_request_search
from
invenio.modules.access.engine
import
acc_authorize_action
from
invenio.config
import
CFG_SITE_URL
from
invenio.bibauthorid_name_utils
import
split_name_parts
from
invenio.bibauthorid_name_utils
import
create_canonical_name
from
invenio.bibauthorid_name_utils
import
create_normalized_name
from
invenio.bibauthorid_general_utils
import
bibauthor_print
from
invenio.bibauthorid_general_utils
import
update_status
\
,
update_status_final
from
invenio.legacy.dbquery
import
run_sql
try
:
from
collections
import
defaultdict
except
:
from
invenio.utils.container
import
defaultdict
# Module-level cache slot, empty at import time.
# NOTE(review): presumably filled lazily with MARC 100/700 author data by
# code further down in this module -- confirm against its users.
MARC_100_700_CACHE = None

# Local alias of the bibauthorid configuration value
# COLLECT_EXTERNAL_ID_INSPIREID.
COLLECT_INSPIRE_ID = bconfig.COLLECT_EXTERNAL_ID_INSPIREID
def get_sql_time():
    '''
    Asks the database server for its current time.
    @return: the single value produced by "select now()"
        (documented by the original authors as a datetime.datetime)
    '''
    rows = run_sql("select now()")
    first_row = rows[0]
    return first_row[0]
def set_personid_row(person_id, tag, value, opt1=None, opt2=None, opt3=None):
    '''
    Adds one row to aidPERSONIDDATA for the given person.
    @param person_id: id of the person owning the row
    @type person_id: int
    @param tag: name of the attribute being stored
    @type tag: string
    @param value: content written to the `data` column
    @type value: string
    @param opt1: optional value for the `opt1` column
    @type opt1: int
    @param opt2: optional value for the `opt2` column
    @type opt2: int
    @param opt3: optional value for the `opt3` column
    @type opt3: string
    '''
    insert_query = ("INSERT INTO aidPERSONIDDATA "
                    "(`personid`, `tag`, `data`, `opt1`, `opt2`, `opt3`) "
                    "VALUES (%s, %s, %s, %s, %s, %s)")
    row_values = (person_id, tag, value, opt1, opt2, opt3)
    run_sql(insert_query, row_values)
def get_personid_row(person_id, tag):
    '''
    Returns all the rows of aidPERSONIDDATA associated to a person and a tag.
    @param person_id: id of the person to read the attribute from
    @type person_id: int
    @param tag: the tag to read.
    @type tag: string
    @return: one (data, opt1, opt2, opt3) tuple per matching row
    @rtype: tuple of tuples
    '''
    # The original statement read "... opt3 data FROM ...", which silently
    # aliased the opt3 column as "data"; the stray token has been dropped.
    # The selected columns and their order are unchanged.
    return run_sql("SELECT data, opt1, opt2, opt3 "
                   "FROM aidPERSONIDDATA "
                   "WHERE personid = %s AND tag = %s",
                   (person_id, tag))
def del_personid_row(tag, person_id=None, value=None):
    '''
    Deletes the rows of aidPERSONIDDATA holding the given tag.
    The deletion can be restricted to a single person, to a single value,
    or to both; without restrictions every row with that tag is removed.
    @param tag: tag whose rows are deleted
    @type tag: string
    @param person_id: restrict the deletion to this person (None: all persons)
    @type person_id: int
    @param value: restrict the deletion to rows holding this data
        (None or empty: any data)
    @type value: string
    '''
    conditions = []
    params = []

    # Person id 0 is a legitimate id (get_new_personid() hands it out on an
    # empty database), so only None means "do not restrict by person".
    # A plain truthiness test would wipe the tag for *every* person.
    if person_id is not None:
        conditions.append("personid=%s")
        params.append(person_id)

    conditions.append("tag=%s")
    params.append(tag)

    if value:
        conditions.append("data=%s")
        params.append(value)

    run_sql("delete from aidPERSONIDDATA where " + " and ".join(conditions),
            tuple(params))
def get_all_papers_of_pids(personid_list):
    '''
    Fetches the paper rows of the given persons.
    Only rows whose signature (bibref_table, bibref_value, bibrec) has at
    least one non rejected entry (flag > -2) among the fetched rows are
    returned.
    @param personid_list: the persons to look up
    @type personid_list: iterable of integers
    @return: generator of (personid, bibref_table, bibref_value, bibrec, flag)
        rows, or an empty tuple when no person is given
    '''
    if not personid_list:
        return ()

    pids_sql = list_2_SQL_str(personid_list)
    rows = run_sql("select personid, bibref_table, bibref_value, bibrec, flag "
                   "from aidPERSONIDPAPERS "
                   "where personid in %s " % pids_sql)

    # signatures which are not rejected for at least one of the persons
    live_signatures = set()
    for row in rows:
        if row[4] > -2:
            live_signatures.add(row[1:4])

    return (row for row in rows if row[1:4] in live_signatures)
def del_person_not_manually_claimed_papers(pid):
    '''
    Deletes the papers of a person which carry neither a manual claim
    (flag 2) nor a manual rejection (flag -2).
    @param pid: id of the person
    @type pid: int
    '''
    # The original statement read "where and (flag ...", which is not valid
    # SQL and made every call fail; the stray "and" has been removed.
    run_sql("delete from aidPERSONIDPAPERS "
            "where (flag <> '-2' and flag <> '2') and personid=%s",
            (pid,))
def get_personid_from_uid(uid):
    '''
    Looks up the person associated with a user id.
    @param uid: user id, wrapped as an SQL-like result
    @type uid: ((int,),)
    @return: (row, True) when exactly one person holds this uid, where row is
        the (personid,) tuple from the database; ([-1], False) otherwise
    '''
    matches = run_sql("select personid from aidPERSONIDDATA where tag=%s and data=%s",
                      ('uid', str(uid[0][0])))
    if len(matches) != 1:
        return ([-1], False)
    return (matches[0], True)
def get_uid_from_personid(pid):
    '''
    Returns the data stored under the 'uid' tag for a person.
    @param pid: person id
    @type pid: int
    @return: the stored uid of the first matching row, or None if there is none
    '''
    rows = run_sql("select data from aidPERSONIDDATA where tag='uid' and personid = %s",
                   (pid,))
    if not rows:
        return None
    return rows[0][0]
def get_new_personid():
    '''
    Computes an unused person id: one more than the highest id found in
    aidPERSONIDDATA and aidPERSONIDPAPERS, or 0 when both tables are empty.
    @return: a free person id
    @rtype: int
    '''
    queries = ("select max(personid) from aidPERSONIDDATA",
               "select max(personid) from aidPERSONIDPAPERS")
    raw_maxima = [run_sql(query)[0][0] for query in queries]

    # max() over an empty table yields NULL, i.e. None
    known = [int(top) for top in raw_maxima if top is not None]
    if not known:
        return 0
    return max(known) + 1
def get_existing_personids(with_papers_only=False):
    '''
    Collects the person ids currently present in the database.
    @param with_papers_only: if True, only ids found in aidPERSONIDPAPERS are
        returned; ids that appear solely in aidPERSONIDDATA are left out
    @type with_papers_only: bool
    @return: the person ids
    @rtype: set of int
    '''
    found = set()

    if not with_papers_only:
        for row in run_sql("select distinct personid from aidPERSONIDDATA"):
            found.add(int(row[0]))

    for row in run_sql("select distinct personid from aidPERSONIDPAPERS"):
        found.add(int(row[0]))

    return found
def get_existing_result_clusters():
    '''
    Lists the distinct personid values stored in aidRESULTS.
    Meant for the private use of Tortoise and the merger.
    @return: raw SQL result, one (personid,) tuple per cluster
    '''
    clusters = run_sql("select distinct personid from aidRESULTS")
    return clusters
def create_new_person(uid= -1, uid_is_owner=False):
    '''
    Creates a new person and records the given user id on it.
    @param uid: user id to associate to the newly created person
    @type uid: int
    @param uid_is_owner: if True the uid is stored under the 'uid' tag (the
        person is owned by the user), otherwise under 'user-created' (the user
        is only remembered as the creator)
    @type uid_is_owner: bool
    @return: id of the new person
    '''
    new_pid = get_new_personid()

    tag = 'user-created'
    if uid_is_owner:
        tag = 'uid'
    set_personid_row(new_pid, tag, str(uid))

    return new_pid
def create_new_person_from_uid(uid):
    '''
    Shortcut for create_new_person(...) which makes the user the owner
    of the new person.
    @param uid: user id
    @type uid: int
    @return: id of the new person
    '''
    owned_pid = create_new_person(uid, uid_is_owner=True)
    return owned_pid
def new_person_from_signature(sig, name=None):
    '''
    Allocates a fresh person id and attaches the signature to it.
    @param sig: signature tuple ([100|700], bibref, bibrec)
    @type sig: tuple
    @param name: author name of the signature; passed on to add_signature
    @type name: string
    @return: id of the new person
    '''
    fresh_pid = get_new_personid()
    add_signature(sig, name, fresh_pid)
    return fresh_pid
def add_signature(sig, name, pid):
    '''
    Stores a signature in aidPERSONIDPAPERS under the given person.
    @param sig: signature tuple (bibref_table, bibref_value, bibrec)
    @type sig: tuple
    @param name: author name; when empty it is looked up from the signature
    @type name: string
    @param pid: person id to which the signature is assigned
    @type pid: int
    '''
    raw_name = name
    if not raw_name:
        raw_name = get_name_by_bibrecref(sig)
    normalized = create_normalized_name(split_name_parts(raw_name))

    table, ref, rec = str(sig[0]), sig[1], sig[2]
    run_sql("INSERT INTO aidPERSONIDPAPERS "
            "(personid, bibref_table, bibref_value, bibrec, name) "
            "VALUES (%s, %s, %s, %s, %s)",
            (pid, table, ref, rec, normalized))
def move_signature(sig, pid, force_claimed=False, unclaim=False):
    '''
    Moves a signature to a different person id.
    @param sig: signature tuple (bibref_table, bibref_value, bibrec)
    @type sig: tuple
    @param pid: person id receiving the signature
    @type pid: int
    @param force_claimed: if False, rows carrying a manual claim (flag 2) or
        a manual rejection (flag -2) are left untouched
    @type force_claimed: bool
    @param unclaim: if True the flag of the moved rows is reset to 0
    @type unclaim: bool
    '''
    table, ref, rec = sig

    # All values are handed to the driver as parameters instead of being
    # interpolated into the statement: signatures may originate from user
    # supplied strings (see reject_papers_from_person).
    query = "update aidPERSONIDPAPERS set personid=%s "
    if unclaim:
        query += ",flag=0 "
    query += " where bibref_table like %s and bibref_value=%s and bibrec=%s "
    if not force_claimed:
        query += " and flag <> 2 and flag <> -2"

    # str(table) keeps the table compared as a string, as the original
    # quoted literal did.
    run_sql(query, (pid, str(table), ref, rec))
def find_conflicts(sig, pid):
    '''
    Helper for the merger algorithm: lists the non rejected signatures a
    person holds on the record of the given signature.
    @param sig: signature tuple; only its bibrec (third item) is used
    @type sig: tuple
    @param pid: person id
    @type pid: integer
    @return: rows of (bibref_table, bibref_value, bibrec, flag)
    '''
    bibrec = sig[2]
    query = ("select bibref_table, bibref_value, bibrec, flag "
             "from aidPERSONIDPAPERS where "
             "personid = %s and "
             "bibrec = %s and "
             "flag <> -2")
    return run_sql(query, (pid, bibrec))
def update_request_ticket(person_id, tag_data_tuple, ticket_id=None):
    '''
    Creates or replaces a request ticket of a person.
    Ticket rows live in aidPERSONIDDATA: the tag is prefixed with 'rt_' and
    the ticket number is kept in opt1.
    @param person_id: id of the person
    @type person_id: int
    @param tag_data_tuple: 'image' of the ticket, e.g.
        (('paper', '700:316,10'), ('owner', 'admin'), ('external_id', 'ticket_18'))
    @type tag_data_tuple: tuple of (tag, data) pairs
    @param ticket_id: ticket to overwrite; when missing a new number is taken
    @return: the ticket id used
    '''
    if ticket_id:
        # replacing an existing ticket: drop its old rows first
        delete_request_ticket(person_id, ticket_id)
    else:
        highest = run_sql("select max(opt1) from aidPERSONIDDATA where personid=%s and tag like %s",
                          (str(person_id), 'rt_%'))[0][0]
        ticket_id = 1
        if highest:
            ticket_id = highest + 1

    for entry in tag_data_tuple:
        row = (str(person_id), 'rt_' + str(entry[0]), str(entry[1]), str(ticket_id))
        run_sql("insert into aidPERSONIDDATA (personid, tag, data, opt1) "
                "values (%s,%s,%s,%s)",
                row)

    return ticket_id
def delete_request_ticket(person_id, ticket_id=None):
    '''
    Removes request ticket rows ('rt_%' tags) of a person.
    @param person_id: id of the person
    @param ticket_id: ticket to remove; if not provided all the tickets
        pending on the person are removed
    '''
    query = "delete from aidPERSONIDDATA where personid=%s and tag like %s"
    params = [str(person_id), 'rt_%']

    if ticket_id:
        query += " and opt1 = %s"
        params.append(str(ticket_id))

    run_sql(query, tuple(params))
def get_all_personids_by_name(regexpr):
    '''
    Searches the non rejected papers for names matching an SQL LIKE pattern.
    @param regexpr: SQL LIKE pattern applied to the name column
    @type regexpr: string
    @return: rows of (personid, name)
    '''
    query = ("select personid, name "
             "from aidPERSONIDPAPERS "
             "where name like %s "
             "and flag > -2")
    return run_sql(query, (regexpr,))
def get_personids_by_canonical_name(target):
    '''
    Finds the person whose canonical name matches and lists the names of
    its non rejected papers.
    @param target: SQL LIKE pattern for the canonical name
    @type target: string
    @return: rows of (personid, name) for the first matching person,
        or an empty list if no canonical name matches
    '''
    owners = run_sql("select personid from aidPERSONIDDATA where "
                     "tag='canonical_name' and data like %s",
                     (target,))
    if not owners:
        return []

    first_pid = owners[0][0]
    return run_sql("select personid, name from aidPERSONIDPAPERS "
                   "where personid=%s and flag > -2",
                   (first_pid,))
def get_bibref_modification_status(bibref):
    '''
    Reads flag and lcul of a paper assignment, which tell whether the
    assignment has been touched by a human.
    @param bibref: the paper identifier to be checked (e.g. "100:12,144")
    @type bibref: string
    @return: the (flag, lcul) row of the first match, or (False, 0) when the
        paper is not found
    @raise ValueError: if bibref is empty
    '''
    if not bibref:
        raise ValueError("A bibref is expected!")

    table_and_ref, bibrec = bibref.split(',')
    bibref_table, bibref_value = table_and_ref.split(':')

    rows = run_sql("SELECT flag, lcul FROM aidPERSONIDPAPERS WHERE "
                   "bibref_table = %s and bibref_value = %s and bibrec = %s",
                   (bibref_table, bibref_value, bibrec))
    if not rows:
        return (False, 0)
    return rows[0]
def get_canonical_id_from_personid(pid):
    '''
    Fetches the canonical name (e.g. Ellis_J_R_1) stored for a person id.
    @param pid: person id
    @type pid: int
    @return: sql result of the request
    @rtype: tuple of tuple
    '''
    params = ('canonical_name', str(pid))
    return run_sql("SELECT data FROM aidPERSONIDDATA WHERE "
                   "tag = %s AND personid = %s",
                   params)
def get_papers_status(paper):
    '''
    Gets the personID and flag associated to a paper.
    @param paper: paper identifier in the form 'table:bibref,bibrec'
    @type paper: string, e.g. '100:7531,9024'
    @return: [[paper, personID, flag], ...], one entry per assignment found
    @rtype: list of lists
    '''
    head, bibrec = paper.split(',')
    table, bibref = head.split(':')

    # The original interpolated (head, bibrec, bibref) straight into the
    # statement: the table column received "100:7531" and bibref/bibrec were
    # swapped, so the lookup could never match. The values are now passed as
    # parameters, in column order.
    rets = run_sql("select PersonID, flag "
                   "from aidPERSONIDPAPERS "
                   "where bibref_table = %s "
                   "and bibref_value = %s "
                   "and bibrec = %s",
                   (table, bibref, bibrec))

    return [[paper] + list(x) for x in rets]
def get_persons_from_recids(recids, return_alt_names=False,
                            return_all_person_papers=False):
    '''
    Helper for search engine indexing. Maps records to the persons holding
    them and gives basic info about each of those persons, for example:
    get_persons_from_recids([1], True, True) returns
    ({1: [16591L]},
     {16591L: {'alternatative_names': ['Wong, Yung Chow'],
               'canonical_id': 'Y.C.Wong.1',
               'person_records': [275304, 1, 51394, 128250, 311629]}})
    @param recids: the records to look up
    @type recids: iterable of record ids
    @param return_alt_names: add the names found on the person's papers
    @type return_alt_names: bool
    @param return_all_person_papers: add all the records of the person
    @type return_all_person_papers: bool
    @return: (record id -> list of person ids, person id -> info dictionary)
    '''
    def canonical_rows(person):
        return run_sql("SELECT data "
                       "FROM aidPERSONIDDATA "
                       "WHERE tag = %s "
                       "AND personid = %s",
                       ('canonical_name', person))

    rec_2_pid = {}
    seen_pids = set()
    for recid in recids:
        owner_rows = run_sql("SELECT personid "
                             "FROM aidPERSONIDPAPERS "
                             "WHERE bibrec = %s "
                             " and flag > -2 ",
                             (recid,))
        # deduplicating in python rather than with an sql distinct
        owners = set(row[0] for row in owner_rows)
        seen_pids.update(owners)
        rec_2_pid[recid] = list(owners)

    pid_2_data = {}
    for person in seen_pids:
        canonical = canonical_rows(person)
        # The person may simply not have been given a canonical name yet:
        # try to create one before giving up on it.
        if not canonical:
            update_personID_canonical_names([person])
            canonical = canonical_rows(person)

        # A missing canonical name is tolerated (concurrent claims or update
        # daemons may not have computed it yet), several of them are not.
        assert len(canonical) <= 1, "A person cannot have more than one canonical name"

        info = {}
        if len(canonical) == 1:
            info['canonical_id'] = canonical[0][0]

        if return_alt_names:
            name_rows = run_sql("SELECT name "
                                "FROM aidPERSONIDPAPERS "
                                "WHERE personid = %s "
                                " and flag > -2 ",
                                (person,))
            info['alternatative_names'] = list(set(row[0] for row in name_rows))

        if return_all_person_papers:
            rec_rows = run_sql("SELECT bibrec "
                               "FROM aidPERSONIDPAPERS "
                               "WHERE personid = %s "
                               " and flag > -2 ",
                               (person,))
            info['person_records'] = list(set(row[0] for row in rec_rows))

        pid_2_data[person] = info

    return (rec_2_pid, pid_2_data)
def get_person_db_names_count(pid, sort_by_count=True):
    '''
    Returns the name strings of a person, as found in bib10x / bib70x,
    each with the number of non rejected papers carrying it.
    @param pid: ID of the person
    @type pid: int
    @param sort_by_count: order the result by decreasing count
    @type sort_by_count: bool
    @return: list of (name, count); names from table 100 come before names
        from table 700 unless sorted by count
    '''
    refs = run_sql("select bibref_table, bibref_value "
                   "from aidPERSONIDPAPERS "
                   "where personid = %s "
                   "and flag > -2",
                   (pid,))

    def names_with_count(table, name_query):
        # occurrences of every bibref of this table among the person's papers
        occurrences = {}
        for ref_table, ref_value in refs:
            if ref_table == table:
                occurrences[ref_value] = occurrences.get(ref_value, 0) + 1
        if not occurrences:
            return []
        wanted = [ref_value for ref_table, ref_value in refs if ref_table == table]
        id_2_name = run_sql(name_query % list_2_SQL_str(wanted, str))
        return [(name, occurrences[refid]) for refid, name in id_2_name]

    ret = names_with_count('100',
                           "select id, value "
                           "from bib10x "
                           "where id in %s")
    ret = ret + names_with_count('700',
                                 "select id, value "
                                 "from bib70x "
                                 "where id in %s")

    if sort_by_count:
        ret = sorted(ret, key=itemgetter(1), reverse=True)
    return ret
def get_person_id_from_canonical_id(canonical_id):
    '''
    Looks up the person id holding a canonical name (e.g. Ellis_J_R_1).
    @param canonical_id: the canonical ID
    @type canonical_id: string
    @return: sql result of the request
    @rtype: tuple of tuple
    '''
    query = ("SELECT personid FROM aidPERSONIDDATA WHERE "
             "tag='canonical_name' AND data = %s")
    return run_sql(query, (canonical_id,))
def get_person_names_count(pid):
    '''
    Returns the names on the non rejected papers of a person, each with
    the number of papers carrying it.
    @param pid: ID of the person
    @return: rows of (name, count)
    '''
    query = ("select name, count(name) from aidPERSONIDPAPERS where "
             "personid=%s and flag > -2 group by name")
    return run_sql(query, (pid,))
def get_person_db_names_set(pid):
    '''
    Returns the distinct db_name strings associated to a person id.
    @param pid: ID of the person
    @type pid: int
    @return: list of one-element tuples (name,), or an empty list
    '''
    names_and_counts = get_person_db_names_count(pid)
    if not names_and_counts:
        return []

    distinct_names = set(tuple(entry[0] for entry in names_and_counts))
    return [(name,) for name in distinct_names]
def get_personids_from_bibrec(bibrec):
    '''
    Returns the persons holding a non rejected paper on a record.
    @param bibrec: record id
    @type bibrec: int
    @return: tuple of person ids, or an empty list if there are none
    '''
    rows = run_sql("select distinct personid from aidPERSONIDPAPERS where bibrec=%s and flag > -2",
                   (bibrec,))
    if not rows:
        return []
    return tuple(row[0] for row in rows)
def get_personids_and_papers_from_bibrecs(bibrecs, limit_by_name=None):
    '''
    Gives back a list of tuples (personid, set_of_papers_owned_by) limited
    to the given list of bibrecs, persons owning more papers first.
    @param bibrecs: the records to look at
    @type bibrecs: iterable of record ids
    @param limit_by_name: if given, only papers whose stored name starts with
        the surname extracted from this name are considered
    @type limit_by_name: string
    @return: [(personid, set([bibrec, ...])), ...]
    '''
    if not bibrecs:
        return []
    bibrecs_sql = list_2_SQL_str(bibrecs)

    # The surname is extracted once; the original parsed the name a second
    # time right before running the query.
    surname = None
    if limit_by_name:
        try:
            surname = split_name_parts(limit_by_name)[0]
        except IndexError:
            surname = None

    if not surname:
        data = run_sql("select personid,bibrec from aidPERSONIDPAPERS where bibrec in %s "
                       % (bibrecs_sql,))
    else:
        data = run_sql(("select personid,bibrec from aidPERSONIDPAPERS where bibrec in %s "
                        "and name like " % bibrecs_sql) + ' %s ',
                       (surname + '%',))

    # groupby needs its input sorted by the grouping key
    by_pid = itemgetter(0)
    pidlist = [(pid, set([row[1] for row in rows]))
               for pid, rows in groupby(sorted(data, key=by_pid), key=by_pid)]
    pidlist = sorted(pidlist, key=lambda entry: len(entry[1]), reverse=True)
    return pidlist
def get_person_bibrecs(pid):
    '''
    Returns the distinct bibrecs of the non rejected papers of a person.
    @param pid: integer personid
    @return: [bibrec1,...,bibrecN]
    '''
    rows = run_sql("select bibrec from aidPERSONIDPAPERS where personid=%s and flag > -2",
                   (str(pid),))
    if not rows:
        return []
    return list(set(tuple(row[0] for row in rows)))
def get_person_papers(pid, flag,
                      show_author_name=False,
                      show_title=False,
                      show_rt_status=False,
                      show_affiliations=False,
                      show_date=False,
                      show_experiment=False):
    '''
    Gets all the papers of a person whose flag is greater than or equal to
    the given one. Each paper is described by a dictionary, for example:
    get_person_papers(16591,-2,True,True,True,True,True,True) returns
    [{'affiliation': ['Hong Kong U.'],
      'authorname': 'Wong, Yung Chow',
      'data': '100:1,1',
      'date': ('1961',),
      'experiment': [],
      'flag': 0,
      'rt_status': False,
      'title': ('Isoclinic N planes in Euclidean 2N space, Clifford parallels in elliptic (2N-1) space, and the Hurwitz matrix equations',)},
     ...]
    @param pid: id of the person
    @param flag: lowest flag value to include
    @param show_author_name: add 'authorname' (name stored with the paper)
    @param show_title: add 'title' (one-element tuple, or "" if unknown)
    @param show_rt_status: add 'rt_status' (True if a request ticket row
        refers to the paper)
    @param show_affiliations: add 'affiliation' (values of the <table>__u tag)
    @param show_date: add 'date' (values of tag 269__c, [] if none)
    @param show_experiment: add 'experiment' (values of tag 693__e, [] if none)
    @return: list of dictionaries, one per paper
    '''
    columns = "bibref_table, bibref_value, bibrec, flag"
    if show_author_name:
        columns += ", name"

    all_papers = run_sql("SELECT " + columns + " "
                         "FROM aidPERSONIDPAPERS "
                         "WHERE personid = %s "
                         "AND flag >= %s",
                         (pid, flag))

    def linked_values(recid, link_query, value_query, quoted_tag):
        # Two step lookup: ids linked to the record first, then the values
        # of those ids carrying the wanted tag.
        link_rows = run_sql(link_query, (recid,))
        if not link_rows:
            return []
        ids_sql = list_2_SQL_str(link_rows, itemgetter(0))
        value_rows = run_sql(value_query % (ids_sql, quoted_tag))
        if not value_rows:
            return []
        return tuple(row[0] for row in value_rows)

    def format_paper(paper):
        signature = paper[:3]
        bibrefrec = "%s:%d,%d" % signature
        described = {'data': bibrefrec, 'flag': paper[3]}

        if show_author_name:
            described['authorname'] = paper[4]

        if show_title:
            title = get_title_from_rec(paper[2])
            if title:
                described['title'] = (title,)
            else:
                described['title'] = ""

        if show_rt_status:
            ticket_rows = run_sql("SELECT count(personid) "
                                  "FROM aidPERSONIDDATA WHERE "
                                  "tag like 'rt_%%' and data = %s",
                                  (bibrefrec,))
            described['rt_status'] = (ticket_rows[0][0] > 0)

        if show_affiliations:
            tag = '%s__u' % paper[0]
            described['affiliation'] = get_grouped_records(signature, tag)[tag]

        if show_date:
            described['date'] = linked_values(paper[2],
                                              "SELECT id_bibxxx "
                                              "FROM bibrec_bib26x "
                                              "WHERE id_bibrec = %s",
                                              "SELECT value "
                                              "FROM bib26x "
                                              "WHERE id in %s "
                                              "AND tag = %s",
                                              "'269__c'")

        if show_experiment:
            described['experiment'] = linked_values(paper[2],
                                                    "SELECT id_bibxxx "
                                                    "FROM bibrec_bib69x "
                                                    "WHERE id_bibrec = %s",
                                                    "SELECT value "
                                                    "FROM bib69x "
                                                    "WHERE id in %s "
                                                    "AND tag = %s",
                                                    "'693__e'")

        return described

    return [format_paper(paper) for paper in all_papers]
def get_persons_with_open_tickets_list():
    '''
    Finds all the persons holding request ticket rows ('rt_%' tags) and
    counts their distinct tickets.
    @return: [[pid, ticket_count]]
    '''
    query = ("select personid, count(distinct opt1) from "
             "aidPERSONIDDATA where tag like 'rt_%' group by personid")
    return run_sql(query)
def get_request_ticket(person_id, ticket_id=None):
    '''
    Retrieves one or many request tickets of a person.
    @param person_id: person id integer
    @param ticket_id: ticket id (opt1 value); when missing all the tickets of
        the person are returned
    @return: [[[('tag', 'value')], ticket_id]], with the 'rt_' prefix
        stripped from the tags, for example
        [[[('a', 'va'), ('b', 'vb')], 1L], [[('b', 'daOEIaoe'), ('a', 'caaoOUIe')], 2L]]
    '''
    query = ("select tag,data,opt1 from aidPERSONIDDATA where personid=%s and "
             " tag like 'rt_%%' ")
    params = [person_id]

    if ticket_id:
        # Passed as a parameter: the original pasted the value into the
        # statement, which the driver then %-formatted a second time.
        query += " and opt1=%s "
        params.append(str(ticket_id))

    tickets = run_sql(query, tuple(params))

    # groupby needs its input sorted by the grouping key (the ticket id)
    by_ticket = itemgetter(2)
    return [[[(row[0][3:], row[1]) for row in rows], ticket]
            for ticket, rows in groupby(sorted(tickets, key=by_ticket), key=by_ticket)]
def insert_user_log(userinfo, personid, action, tag, value, comment='',
                    transactionid=0, timestamp=None, userid=''):
    '''
    Inserts an entry in the user log table (aidUSERINPUTLOG).
    For examples of entries look at the table generation script.
    @param userinfo: username or user identifier
    @type: string
    @param personid: personid involved in the transaction
    @type: longint
    @param action: action type
    @type: string
    @param tag: tag
    @type: string
    @param value: value for the transaction
    @type: string
    @param comment: optional comment for the transaction
    @type: string
    @param transactionid: optional id for the transaction
    @type: longint
    @param timestamp: time of the entry; the database time is used if missing
    @param userid: optional id of the user
    @return: the transactionid
    @rtype: longint
    '''
    when = timestamp
    if not when:
        when = run_sql('select now()')[0][0]

    entry = (transactionid, when, userinfo, userid, personid,
             action, tag, value, comment)
    run_sql('insert into aidUSERINPUTLOG '
            '(transactionid,timestamp,userinfo,userid,personid,action,tag,value,comment) values '
            '(%s,%s,%s,%s,%s,%s,%s,%s,%s)',
            entry)

    return transactionid
def person_bibref_is_touched_old(pid, bibref):
    '''
    Tells whether a paper attached to a person has been touched by a human,
    judging by its flag.
    @param pid: The Person ID of the person to check the assignment from
    @type pid: int
    @param bibref: The paper identifier to be checked (e.g. "100:12,144")
    @type bibref: string
    @return: True if the stored flag is <= -2 or >= 2; False otherwise,
        also when the paper is not found
    @rtype: bool
    '''
    table_and_ref, rec = bibref.split(",")
    table, ref = table_and_ref.split(":")

    rows = run_sql("SELECT flag "
                   "FROM aidPERSONIDPAPERS "
                   "WHERE personid = %s "
                   "AND bibref_table = %s "
                   "AND bibref_value = %s "
                   "AND bibrec = %s",
                   (pid, table, ref, rec))

    try:
        flag = rows[0][0]
    except (IndexError):
        return False

    if flag and not (-2 < flag < 2):
        return True
    return False
def confirm_papers_to_person(pid, papers, user_level=0):
    '''
    Confirms the relationship between pid and paper, as from user input.
    For every paper the signature is (re)assigned to pid with flag 2 and the
    given user level; other signatures pid holds on the same record are moved
    to freshly created persons. Canonical names of all the affected persons
    are updated at the end.
    @param pid: id of the person
    @type pid: integer
    @param papers: papers to confirm; each item is split on "," and ":" and
        is therefore expected to be a string like '100:7531,9024'
        NOTE(review): the original documentation gave the type as ((str,),);
        confirm against the callers.
    @param user_level: value written to the lcul column of the claimed paper
    @type user_level: integer
    @return: list of tuples: (status, message_key)
    @rtype: [(bool, str), ]
    '''
    pids_to_update = set([pid])
    res = []

    for p in papers:
        bibref, rec = p.split(",")
        rec = int(rec)
        table, ref = bibref.split(":")
        ref = int(ref)
        sig = (table, ref, rec)

        # Check the status of pid: the paper should be present, either assigned or rejected.
        # All the rows of the record are fetched once; the lists below are filtered
        # in memory from this result instead of running one dedicated query each.
        gen_papers = run_sql("select bibref_table, bibref_value, bibrec, personid, flag, name "
                             "from aidPERSONIDPAPERS "
                             "where bibrec=%s "
                             "and flag >= -2",
                             (rec,))

        # non rejected signatures of pid on this record
        paps = [el[0:3] for el in gen_papers if el[3] == pid and el[4] > -2]

        # non rejected signatures of other persons on this record
        other_paps = [el[0:3] for el in gen_papers if el[3] != pid and el[4] > -2]

        # signatures on this record which pid has rejected
        rej_paps = [el[0:3] for el in gen_papers if el[3] == pid and el[4] == -2]

        # non rejected rows carrying exactly the incoming bibref
        bibref_exists = [el[0:3] for el in gen_papers if el[0] == table and el[1] == ref and el[4] > -2]

        # All papers that are being claimed should be present in aidPERSONIDPAPERS, thus:
        # assert paps or rej_paps or other_paps, 'There should be at least something regarding this bibrec!'
        # should always be valid.
        # BUT, claims are often made out of a browser/session cache which is hours/days old,
        # hence it happens that papers are claimed which no longer exist in the system.
        # Instead of crashing, such cases are reported as failures and skipped.
        if not (paps or other_paps or rej_paps) or not bibref_exists:
            res.append((False, 'confirm_failure'))
            continue

        res.append((True, 'confirm_success'))

        # It should not happen that a paper is assigned more than once to the same person.
        # But sometimes it happens in rare unfortunate cases of bad concurrency circumstances,
        # so we try to fix it directly instead of crashing here.
        # Once a better solution for dealing with concurrency is found, the following asserts
        # shall be reenabled, to allow better control on what happens.
        # assert len(paps) < 2, "This paper should not be assigned to this person more then once! %s" % paps
        # assert len(other_paps) < 2, "There should not be more then one copy of this paper! %s" % other_paps

        # if the bibrec is present with a different bibref, the present one must be moved
        # somewhere else before we can claim the incoming one
        if paps:
            for pap in paps:
                # kick out all unwanted signatures, each to a new person of its own
                if sig != pap:
                    new_pid = get_new_personid()
                    pids_to_update.add(new_pid)
                    move_signature(pap, new_pid)

        # Make sure that the incoming claim is unique and get rid of all rejections, they are useless
        # from now on
        run_sql("delete from aidPERSONIDPAPERS where bibref_table like %s and "
                " bibref_value = %s and bibrec=%s",
                sig)

        # re-create the signature under pid, then mark it as claimed (flag 2)
        add_signature(sig, None, pid)
        run_sql("update aidPERSONIDPAPERS "
                "set personid = %s "
                ", flag = %s "
                ", lcul = %s "
                "where bibref_table = %s "
                "and bibref_value = %s "
                "and bibrec = %s",
                (pid, '2', user_level, table, ref, rec))

    update_personID_canonical_names(pids_to_update)

    return res
def reject_papers_from_person(pid, papers, user_level=0):
    '''
    Confirms the negative relationship between pid and paper, as from user input.
    A paper currently assigned to pid is moved to a new person and a
    rejection row (flag -2) is stored for pid.
    @param pid: id of the person
    @type pid: integer
    @param papers: papers to reject, each a string like '100:7531,9024'
    @param user_level: value written to the lcul column of the rejection
    @type user_level: integer
    @return: list of tuples: (status, message_key)
    @rtype: [(bool, str), ]
    '''
    # a single new person receives every signature rejected in this call
    receiver_pid = get_new_personid()
    touched_pids = set([pid])
    outcome = []

    for paper in papers:
        table_and_ref, rec = paper.split(",")
        table, ref = table_and_ref.split(':')
        sig = (table, ref, rec)

        # To be rejected, a record should be present! Papers which no longer
        # exist are reported as failures rather than crashing.
        owners = personid_name_from_signature(sig)
        if not owners:
            outcome.append((False, 'reject_failure'))
            continue
        outcome.append((True, 'reject_success'))

        owner_pid, name = owners[0]

        # If the record is assigned to a different person already, the rejection is meaningless.
        if owner_pid != pid:
            continue

        # Otherwise the paper goes to someone else (not important who, tortoise
        # will eventually move it) and the rejection is added to the current person.
        move_signature(sig, receiver_pid, force_claimed=True, unclaim=True)
        touched_pids.add(receiver_pid)
        run_sql("INSERT INTO aidPERSONIDPAPERS "
                "(personid, bibref_table, bibref_value, bibrec, name, flag, lcul) "
                "VALUES (%s, %s, %s, %s, %s, %s, %s)",
                (pid, table, ref, rec, name, -2, user_level))

    update_personID_canonical_names(touched_pids)

    return outcome
def reset_papers_flag(pid, papers):
    '''
    Resets the flag associated to the papers to its default, by deleting
    the signature and adding it again to the person.
    @param pid: id of the person
    @type pid: integer
    @param papers: papers to reset, each a string like '100:7531,9024'
    @return: list of tuples: (status, message_key)
    @rtype: [(bool, str), ]
    '''
    outcome = []

    for paper in papers:
        table_and_ref, rec = paper.split(",")
        table, ref = table_and_ref.split(":")
        ref = int(ref)
        sig = (table, ref, rec)

        rows = run_sql("select bibref_table, bibref_value, bibrec, flag "
                       "from aidPERSONIDPAPERS "
                       "where bibrec=%s "
                       "and personid=%s",
                       (rec, pid))

        # one pass over the rows of pid on this record
        held = []       # every signature
        rejected = []   # signatures with flag -2
        matching = []   # non rejected rows carrying the incoming bibref
        for row in rows:
            row_sig = row[0:3]
            held.append(row_sig)
            if row[3] == -2:
                rejected.append(row_sig)
            if row[0] == table and row[1] == ref and row[3] > -2:
                matching.append(row_sig)

        # As for confirmations, stale requests are reported, not asserted on.
        if rejected or not matching:
            outcome.append((False, 'reset_failure'))
            continue
        outcome.append((True, 'reset_success'))

        assert len(held) < 2

        run_sql("delete from aidPERSONIDPAPERS where bibref_table like %s and "
                "bibref_value = %s and bibrec = %s",
                (sig))
        add_signature(sig, None, pid)

    return outcome
def user_can_modify_data(uid, pid):
    '''
    Tell whether a user is allowed to modify the data of a person.

    If the 'uid' stored for the person equals uid, the "change own data"
    action is checked, otherwise the "change others' data" action.

    @param uid: the user id
    @type uid: int
    @param pid: the person id
    @type pid: int
    @return: True if the action is authorized, False otherwise
    @rtype: boolean
    '''
    owner_rows = run_sql("select data from aidPERSONIDDATA where tag = %s"
                         " and personid = %s", ('uid', str(pid)))

    is_owner = len(owner_rows) >= 1 and str(uid) == str(owner_rows[0][0])
    if is_owner:
        required_action = bconfig.CLAIMPAPER_CHANGE_OWN_DATA
    else:
        required_action = bconfig.CLAIMPAPER_CHANGE_OTHERS_DATA

    # acc_authorize_action returns a sequence whose first item is 0 on success.
    auth_code = acc_authorize_action(uid, required_action)[0]
    return auth_code == 0
def get_possible_bibrecref(names, bibrec, always_match=False):
    '''
    Returns the bibrefs of a record whose surname matches one of the names.

    @param names: list of names strings
    @param bibrec: bibrec number
    @param always_match: accept every author of the record, whatever the
        surname (note: candidates are still only examined once per given name)
    @return: list of ['<table>:<id>', name] pairs, 100 entries first, then 700
    '''
    wanted = [split_name_parts(name) for name in names]

    bibrec_names_100 = run_sql("select o.id, o.value from bib10x o, "
                               "(select i.id_bibxxx as iid from bibrec_bib10x i "
                               "where id_bibrec=%s) as dummy "
                               "where o.tag='100__a' AND o.id = dummy.iid",
                               (str(bibrec),))
    bibrec_names_700 = run_sql("select o.id, o.value from bib70x o, "
                               "(select i.id_bibxxx as iid from bibrec_bib70x i "
                               "where id_bibrec=%s) as dummy "
                               "where o.tag='700__a' AND o.id = dummy.iid",
                               (str(bibrec),))

    bibreflist = []
    for prefix, rows in (('100:', bibrec_names_100), ('700:', bibrec_names_700)):
        for row in rows:
            row_parts = split_name_parts(row[1])
            for parts in wanted:
                surname_matches = parts[0].lower() == row_parts[0].lower()
                if not (surname_matches or always_match):
                    continue
                candidate = [prefix + str(row[0]), row[1]]
                # keep the result free of duplicates
                if candidate not in bibreflist:
                    bibreflist.append(candidate)

    return bibreflist
def user_can_modify_paper(uid, paper):
    '''
    Return True if the uid can modify this paper, false otherwise.
    If the paper is assigned more than once (from algorithms), the most
    privileged assignment (highest lcul) is considered.

    @param uid: the user id
    @type uid: int
    @param paper: the paper bibref,bibrec pair x00:1234,4321
    @type paper: str
    @return: can the user modify the paper attribution?
    @rtype: boolean
    '''
    bibref, rec = paper.split(",")
    table, ref = bibref.split(":")
    # Only the assignment with the highest lcul is fetched.
    prow = run_sql("select personid, lcul from aidPERSONIDPAPERS "
                   "where bibref_table = %s and bibref_value = %s and bibrec = %s "
                   "order by lcul desc limit 0,1",
                   (table, ref, rec))

    if len(prow) == 0:
        # Unassigned signature: either claiming right is sufficient.
        return ((acc_authorize_action(uid, bconfig.CLAIMPAPER_CLAIM_OWN_PAPERS)[0] == 0) or
                (acc_authorize_action(uid, bconfig.CLAIMPAPER_CLAIM_OTHERS_PAPERS)[0] == 0))

    min_req_acc_n = int(prow[0][1])
    # Numeric level of the "claim own papers" action is the baseline.
    req_acc = resolve_paper_access_right(bconfig.CLAIMPAPER_CLAIM_OWN_PAPERS)
    pid_uid = run_sql("select data from aidPERSONIDDATA where tag = %s and personid = %s",
                      ('uid', str(prow[0][0])))

    if len(pid_uid) > 0:
        # The paper belongs to a person linked to another user and has a
        # positive lcul: the "claim others' papers" level is required.
        if (str(pid_uid[0][0]) != str(uid)) and min_req_acc_n > 0:
            req_acc = resolve_paper_access_right(bconfig.CLAIMPAPER_CLAIM_OTHERS_PAPERS)

    # The required level is the larger of the stored lcul and req_acc.
    if min_req_acc_n < req_acc:
        min_req_acc_n = req_acc

    # Map the numeric level back to an action name (rounded down to a known level).
    min_req_acc = resolve_paper_access_right(min_req_acc_n)

    # Second operand: the resolved action must not be weaker than the numeric
    # requirement, which fails when min_req_acc_n lies above a known level.
    return (acc_authorize_action(uid, min_req_acc)[0] == 0) and \
           (resolve_paper_access_right(min_req_acc) >= min_req_acc_n)
def resolve_paper_access_right(acc):
    '''
    Translate between access-right action names and their numeric levels.

    Given a string, returns the corresponding integer level (0 when the
    string is not a known action). Given a number, returns the name of the
    highest action whose level does not exceed it, falling back to the
    minimum privilege when there is none.

    @param acc: action name or numeric level
    @type acc: str or int
    @return: numeric level for a name, action name for a level
    @rtype: int or str
    '''
    access_dict = {bconfig.CLAIMPAPER_VIEW_PID_UNIVERSE: 0,
                   bconfig.CLAIMPAPER_CLAIM_OWN_PAPERS: 25,
                   bconfig.CLAIMPAPER_CLAIM_OTHERS_PAPERS: 50}

    if isinstance(acc, str):
        # Unknown names resolve to the lowest level instead of hiding every
        # possible error behind a bare except.
        return access_dict.get(acc, 0)

    inverse_dict = dict((level, action) for action, level in access_dict.items())
    lower_accs = [level for level in inverse_dict if level <= acc]
    if not lower_accs:
        # Nothing at or below the requested level: minimum privilege.
        return bconfig.CLAIMPAPER_VIEW_PID_UNIVERSE
    return inverse_dict[max(lower_accs)]
def get_recently_modified_record_ids(date):
    '''
    Returns the valid bibrecs whose modification date is more recent than date.

    @param date: lower bound (exclusive) for bibrec.modification_date
    @return: frozenset of record ids also present in get_all_valid_bibrecs()
    '''
    modified_rows = run_sql("select id from bibrec "
                            "where modification_date > %s",
                            (date,))
    touched_papers = frozenset(row[0] for row in modified_rows)
    valid_papers = frozenset(get_all_valid_bibrecs())
    return touched_papers & valid_papers
def filter_modified_record_ids(bibrecs, date):
    '''
    Lazily keeps the entries whose record was last modified before date.

    One query is issued per entry, when the result is iterated.

    @param bibrecs: iterable of sequences whose item [2] is a record id
    @param date: upper bound (exclusive) for bibrec.modification_date
    @return: iterator over the entries that pass the check
    '''
    return (entry for entry in bibrecs
            if run_sql("select count(*) from bibrec "
                       "where id = %s and "
                       "modification_date < %s",
                       (entry[2], date))[0][0])
def get_user_log(transactionid='', userinfo='', userid='', personID='', action='', tag='', value='', comment='', only_most_recent=False):
    '''
    Get user log table entries matching all the given parameters; all of
    them are optional. If no parameters are given, returns the complete
    log table.

    Filter values are passed to the database as query parameters, so they
    are escaped by the driver rather than spliced into the SQL text.

    @param transactionid: id of the transaction
    @param userinfo: user name or identifier
    @param userid: id of the user
    @param personID: id of the person involved
    @param action: action
    @param tag: tag
    @param value: value
    @param comment: comment
    @param only_most_recent: return only the newest matching entry
    @return: rows (id, transactionid, timestamp, userinfo, personid,
        action, tag, value, comment)
    '''
    sql_query = ('select id,transactionid,timestamp,userinfo,personid,action,tag,value,comment ' +
                 'from aidUSERINPUTLOG where 1 ')

    # (column name, requested value); falsy values mean "do not filter".
    filters = (('transactionid', transactionid),
               ('userinfo', userinfo),
               ('userid', userid),
               ('personid', personID),
               ('action', action),
               ('tag', tag),
               ('value', value),
               ('comment', comment))

    params = []
    for column, wanted in filters:
        if wanted:
            # Column names are fixed literals above; only the value is a parameter.
            sql_query += ' and %s=%%s' % column
            params.append(str(wanted))

    if only_most_recent:
        sql_query += ' order by timestamp desc limit 0,1'

    if params:
        return run_sql(sql_query, tuple(params))
    return run_sql(sql_query)
def list_2_SQL_str(items, f=lambda x: x):
    """
    Renders the items as a parenthesised, comma-separated SQL list.

    @param items: the items to render
    @type items: iterable of X
    @param f: transformation applied to each item before str() is called
    @type f: X -> anything printable
    @return: "(x1, x2, x3, ... xn)" for xi in items; "()" when items is empty
    @rtype: string
    """
    rendered = [str(f(item)) for item in items]
    return "(" + ", ".join(rendered) + ")"
def _get_authors_from_paper_from_db(paper):
    '''
    Selects from the database the 100__a (author) bibrefs of a paper.

    @param paper: record id
    @return: rows of (id,) from bib10x, or an empty tuple if the record
        has no bib10x links at all
    '''
    linked = run_sql("select id_bibxxx from bibrec_bib10x where id_bibrec=%s",
                     (paper,))
    if not len(linked) > 0:
        return tuple()

    id_list = list_2_SQL_str(linked, lambda row: str(row[0]))
    query = "select id from bib10x where tag='100__a' and id in %s" % (id_list,)
    return run_sql(query)
def _get_authors_from_paper_from_cache(paper):
    '''
    Selects from MARC_100_700_CACHE the 100__a (author) bibrefs of a paper.

    @param paper: record id
    @return: list of (id,) tuples, or an empty tuple when a cache lookup
        raises KeyError
    '''
    try:
        linked_ids = MARC_100_700_CACHE['brb100'][paper]['id'].keys()
        refs = []
        for ref in linked_ids:
            # entry [0] of a cached bib10x row is its tag
            if '100__a' in MARC_100_700_CACHE['b100'][ref][0]:
                refs.append(ref)
    except KeyError:
        return tuple()

    # same shape as database rows: one 1-tuple per id
    return [(ref,) for ref in refs]
def get_authors_from_paper(paper):
    '''
    Returns the 100__a (author) bibrefs of a paper, using the MARC cache
    when it is populated and the database otherwise.

    With bconfig.DEBUG_CHECKS enabled, the cached answer is verified
    against the database. The comparison is done on sets because the cache
    yields a list and the database a tuple, in no guaranteed order.

    @param paper: record id
    @return: sequence of (id,) tuples
    '''
    if MARC_100_700_CACHE:
        from_cache = _get_authors_from_paper_from_cache(paper)
        if bconfig.DEBUG_CHECKS:
            # The original compared the cache result with itself.
            assert set(from_cache) == set(_get_authors_from_paper_from_db(paper))
        return from_cache
    return _get_authors_from_paper_from_db(paper)
def _get_coauthors_from_paper_from_db(paper):
    '''
    Selects from the database the 700__a (coauthor) bibrefs of a paper.

    @param paper: record id
    @return: rows of (id,) from bib70x, or an empty tuple if the record
        has no bib70x links at all
    '''
    linked = run_sql("select id_bibxxx from bibrec_bib70x where id_bibrec=%s",
                     (paper,))
    if not len(linked) > 0:
        return tuple()

    id_list = list_2_SQL_str(linked, lambda row: str(row[0]))
    query = "select id from bib70x where tag='700__a' and id in %s" % (id_list,)
    return run_sql(query)
def _get_coauthors_from_paper_from_cache(paper):
    '''
    Selects from MARC_100_700_CACHE the 700__a (coauthor) bibrefs of a paper.

    @param paper: record id
    @return: list of (id,) tuples, or an empty tuple when a cache lookup
        raises KeyError
    '''
    try:
        linked_ids = MARC_100_700_CACHE['brb700'][paper]['id'].keys()
        refs = []
        for ref in linked_ids:
            # entry [0] of a cached bib70x row is its tag
            if '700__a' in MARC_100_700_CACHE['b700'][ref][0]:
                refs.append(ref)
    except KeyError:
        return tuple()

    # same shape as database rows: one 1-tuple per id
    return [(ref,) for ref in refs]
def get_coauthors_from_paper(paper):
    '''
    Returns the 700__a (coauthor) bibrefs of a paper, using the MARC cache
    when it is populated and the database otherwise.

    With bconfig.DEBUG_CHECKS enabled, the cached answer is verified
    against the database. The comparison is done on sets: the cache yields
    a list and the database a tuple, so a direct == could never succeed
    for a non-empty result.

    @param paper: record id
    @return: sequence of (id,) tuples
    '''
    if MARC_100_700_CACHE:
        from_cache = _get_coauthors_from_paper_from_cache(paper)
        if bconfig.DEBUG_CHECKS:
            assert set(from_cache) == set(_get_coauthors_from_paper_from_db(paper))
        return from_cache
    return _get_coauthors_from_paper_from_db(paper)
def get_bibrefrec_subset(table, papers, refs):
    '''
    Returns the (id_bibrec, id_bibxxx) pairs of a bibrec_bibNNx mapping
    table whose record is in papers and whose bibxxx id is in refs.

    The whole mapping table is read and filtered in Python.

    @param table: MARC mark such as 100 or 700; its last digit is replaced by 'x'
    @param papers: record ids to keep
    @param refs: bibxxx ids to keep
    @return: set of (id_bibrec, id_bibxxx) rows
    '''
    mapping_table = "bibrec_bib%sx" % str(table)[:-1]
    contents = run_sql("select id_bibrec, id_bibxxx from %s" % mapping_table)

    wanted_papers = set(papers)
    wanted_refs = set(refs)

    # yes, there are duplicates and we must set them
    return set(row for row in contents
               if row[0] in wanted_papers and row[1] in wanted_refs)
def get_deleted_papers():
    '''
    Returns the ids of the records linked to a 980__a field whose value
    is 'DELETED'.

    @return: rows of (id_bibrec,)
    '''
    query = ("select o.id_bibrec from bibrec_bib98x o, "
             "(select i.id as iid from bib98x i "
             "where value = 'DELETED' "
             "and tag like '980__a') as dummy "
             "where o.id_bibxxx = dummy.iid")
    return run_sql(query)
def add_personID_external_id(personid, external_id_str, value):
    '''
    Stores an external identifier for a person in aidPERSONIDDATA.

    @param personid: id of the person
    @param external_id_str: kind of identifier; stored as tag 'extid:<kind>'
    @param value: the identifier itself
    '''
    extid_tag = 'extid:%s' % external_id_str
    run_sql("insert into aidPERSONIDDATA (personid,tag,data) values (%s,%s,%s)",
            (personid, extid_tag, value))
def remove_personID_external_id(personid, external_id_str, value=False):
    '''
    Deletes external identifiers of a person from aidPERSONIDDATA.

    @param personid: id of the person
    @param external_id_str: kind of identifier (tag 'extid:<kind>')
    @param value: if truthy, only the identifier with this value is removed;
        otherwise all identifiers of that kind are removed
    '''
    extid_tag = 'extid:%s' % external_id_str
    if value:
        run_sql("delete from aidPERSONIDDATA where personid=%s and tag=%s and data=%s",
                (personid, extid_tag, value))
    else:
        run_sql("delete from aidPERSONIDDATA where personid=%s and tag=%s",
                (personid, extid_tag))
def get_personiID_external_ids(personid):
    '''
    Collects the external identifiers stored for a person.

    @param personid: id of the person
    @return: dictionary {identifier kind: [values]}, where the kind is the
        part of the tag following 'extid:'
    @rtype: dict
    '''
    rows = run_sql("select tag,data from aidPERSONIDDATA where personid=%s and tag like 'extid:%%'",
                   (personid,))

    extids = {}
    for row in rows:
        kind = row[0].split(':')[1]
        extids.setdefault(kind, []).append(row[1])
    return extids
#bibauthorid_maintenance personid update private methods
def update_personID_canonical_names(persons_list=None, overwrite=False, suggested='', overwrite_not_claimed_only=False):
    '''
    Updates the personID table creating or updating canonical names for persons

    @param persons_list: persons to consider for the update (('1'),); when
        empty, all persons in aidPERSONIDPAPERS are considered (minus those
        already having a canonical name, unless overwrite is set)
    @param overwrite: if to touch already existing canonical names
    @param suggested: string to suggest a canonical name for the person
    @param overwrite_not_claimed_only: skip persons having at least one
        paper with flag = 2
    '''
    if not persons_list and overwrite:
        persons_list = set([x[0] for x in run_sql('select personid from aidPERSONIDPAPERS')])
    elif not persons_list:
        persons_list = set([x[0] for x in run_sql('select personid from aidPERSONIDPAPERS')])
        # Without overwrite there is no point in visiting persons that
        # already have a canonical name.
        existing_cnamed_pids = set(
            [x[0] for x in run_sql('select personid from aidPERSONIDDATA where tag=%s',
             ('canonical_name',))])
        persons_list = persons_list - existing_cnamed_pids

    for idx, pid in enumerate(persons_list):
        update_status(float(idx) / float(len(persons_list)), "Updating canonical_names...")
        if overwrite_not_claimed_only:
            has_claims = run_sql("select personid from aidPERSONIDPAPERS where personid = %s and flag = 2", (pid,))
            if has_claims:
                continue
        current_canonical = run_sql("select data from aidPERSONIDDATA where "
                                    "personid=%s and tag=%s", (pid, 'canonical_name'))

        if overwrite or len(current_canonical) == 0:
            # Drop the old name (if any) before computing the new one.
            run_sql("delete from aidPERSONIDDATA where personid=%s and tag=%s",
                    (pid, 'canonical_name'))

            # Names sorted by item [1], largest first; names[0][0] is used
            # as the base when no suggestion is given.
            names = get_person_names_count(pid)
            names = sorted(names, key=lambda k: k[1], reverse=True)
            if len(names) < 1 and not suggested:
                continue
            else:
                if suggested:
                    canonical_name = suggested
                else:
                    canonical_name = create_canonical_name(names[0][0])

                # Canonical names already starting with this base, lower-cased
                # so the uniqueness test below is case-insensitive.
                existing_cnames = run_sql("select data from aidPERSONIDDATA "
                                          "where tag=%s and data like %s",
                                          ('canonical_name', str(canonical_name) + '%'))
                existing_cnames = set(name[0].lower() for name in existing_cnames)
                # Append the first free numeric suffix: base.1, base.2, ...
                for i in count(1):
                    cur_try = canonical_name + '.' + str(i)
                    if cur_try.lower() not in existing_cnames:
                        canonical_name = cur_try
                        break

                run_sql("insert into aidPERSONIDDATA (personid, tag, data) values (%s, %s, %s) ",
                        (pid, 'canonical_name', canonical_name))

    update_status_final("Updating canonical_names finished.")
def personid_get_recids_affected_since(last_timestamp):
    '''
    Returns the recids touched by manual actions logged after a timestamp.

    These are the records named in logged values of the form
    '<table>:<ref>,<rec>', plus every paper of each person mentioned in
    the log.

    @param last_timestamp: last update, datetime.datetime
    @return: list of record ids
    '''
    value_rows = run_sql("select distinct value from aidUSERINPUTLOG "
                         "where timestamp > %s", (last_timestamp,))
    vset = set(int(row[0].split(',')[1])
               for row in value_rows
               if ',' in row[0] and ':' in row[0])

    pid_rows = run_sql("select distinct personid from aidUSERINPUTLOG "
                       "where timestamp > %s", (last_timestamp,))
    pids = set(int(row[0]) for row in pid_rows if row[0] > 0)

    if pids:
        pids_s = list_2_SQL_str(pids)
        paper_rows = run_sql("select bibrec from aidPERSONIDPAPERS "
                             "where personid in %s" % pids_s)
        vset |= set(int(row[0]) for row in paper_rows)

    # I'm not sure about this cast. It might work without it.
    return list(vset)
def get_all_paper_records(pid, claimed_only=False):
    '''
    Returns the distinct records attached to a person.

    @param pid: id of the person
    @param claimed_only: if True, only records with a manual decision
        (flag 2 or flag -2) for this person are returned
    @return: rows of (bibrec,)
    '''
    if not claimed_only:
        return run_sql("SELECT distinct bibrec FROM aidPERSONIDPAPERS WHERE personid = %s", (str(pid),))

    # AND binds tighter than OR in SQL: without the parentheses, flag=-2
    # rows of every person were returned.
    return run_sql("SELECT distinct bibrec FROM aidPERSONIDPAPERS WHERE "
                   "personid = %s and (flag=2 or flag=-2)", (str(pid),))
def get_all_modified_names_from_personid(since=None):
    '''
    Lazily yields, for each person with non-rejected papers, the names
    that person carries.

    @param since: if given, only persons with a paper row updated after
        this moment are considered. It is passed as a query parameter; a
        string that arrives already wrapped in quotes (as the former string
        interpolation required) has the quotes stripped first.
    @return: generator of (personid, set of names, number of paper rows);
        one query per person is issued while iterating
    '''
    if since:
        if isinstance(since, str) and len(since) >= 2 \
                and since[0] == since[-1] and since[0] in "'\"":
            # backward compatibility with pre-quoted values
            since = since[1:-1]
        all_pids = run_sql("SELECT DISTINCT personid "
                           "FROM aidPERSONIDPAPERS "
                           "WHERE flag > -2 "
                           "AND last_updated > %s",
                           (since,))
    else:
        all_pids = run_sql("SELECT DISTINCT personid "
                           "FROM aidPERSONIDPAPERS "
                           "WHERE flag > -2 ")

    # Each p is a one-column row (personid,), used directly as the
    # parameter tuple of the per-person query.
    per_person_rows = (run_sql("SELECT personid, name "
                               "FROM aidPERSONIDPAPERS "
                               "WHERE personid = %s "
                               "AND flag > -2", p)
                       for p in all_pids)

    return ((rows[0][0], set(row[1] for row in rows), len(rows))
            for rows in per_person_rows)
def destroy_partial_marc_caches():
    '''
    Drops the module-level MARC_100_700_CACHE and triggers a garbage
    collection run.
    '''
    global MARC_100_700_CACHE

    MARC_100_700_CACHE = None
    gc.collect()
def populate_partial_marc_caches():
    '''
    Fills the module-level MARC_100_700_CACHE with the content of the
    bibrec_bib10x, bibrec_bib70x, bib10x and bib70x tables. Does nothing
    if the cache is already populated.

    Resulting layout:
      'brb100' / 'brb700': {id_bibrec: {'id': {id_bibxxx: [field_number]},
                                        'fn': {field_number: [id_bibxxx]}}}
      'b100' / 'b700':     {id: (tag, value)}
    '''
    global MARC_100_700_CACHE

    if MARC_100_700_CACHE:
        return

    def br_dictionarize(maptable):
        # maptable: rows (id_bibrec, id_bibxxx, field_number) sorted by
        # record, as required by groupby.
        # Automatic garbage collection is switched off while the dictionaries
        # are built and re-enabled afterwards; explicit collections are
        # triggered every 1000000 groups instead.
        gc.disable()
        md = defaultdict(dict)
        maxiters = len(set(map(itemgetter(0), maptable)))
        for i, v in enumerate(groupby(maptable, itemgetter(0))):
            if i % 1000 == 0:
                update_status(float(i) / maxiters, 'br_dictionarizing...')
            if i % 1000000 == 0:
                update_status(float(i) / maxiters, 'br_dictionarizing...GC')
                gc.collect()
            # v is (id_bibrec, iterator over that record's rows)
            idx = defaultdict(list)
            fn = defaultdict(list)
            for _, k, z in v[1]:
                idx[k].append(z)
                fn[z].append(k)
            md[v[0]]['id'] = idx
            md[v[0]]['fn'] = fn
        update_status_final('br_dictionarizieng done')
        gc.enable()
        return md

    def bib_dictionarize(bibtable):
        # bibtable: rows (id, tag, value) -> {id: (tag, value)}
        return dict((i[0], (i[1], i[2])) for i in bibtable)

    # Each source table is deleted as soon as its dictionary exists, to
    # limit peak memory.
    update_status(.0, 'Populating get_grouped_records_table_cache')
    bibrec_bib10x = sorted(run_sql("select id_bibrec,id_bibxxx,field_number from bibrec_bib10x"))
    update_status(.125, 'Populating get_grouped_records_table_cache')
    brd_b10x = br_dictionarize(bibrec_bib10x)
    del bibrec_bib10x

    update_status(.25, 'Populating get_grouped_records_table_cache')
    bibrec_bib70x = sorted(run_sql("select id_bibrec,id_bibxxx,field_number from bibrec_bib70x"))
    update_status(.375, 'Populating get_grouped_records_table_cache')
    brd_b70x = br_dictionarize(bibrec_bib70x)
    del bibrec_bib70x

    update_status(.5, 'Populating get_grouped_records_table_cache')
    bib10x = (run_sql("select id,tag,value from bib10x"))
    update_status(.625, 'Populating get_grouped_records_table_cache')
    bibd_10x = bib_dictionarize(bib10x)
    del bib10x

    update_status(.75, 'Populating get_grouped_records_table_cache')
    bib70x = (run_sql("select id,tag,value from bib70x"))
    update_status(.875, 'Populating get_grouped_records_table_cache')
    bibd_70x = bib_dictionarize(bib70x)
    del bib70x

    update_status_final('Finished populating get_grouped_records_table_cache')
    MARC_100_700_CACHE = {'brb100': brd_b10x, 'brb700': brd_b70x, 'b100': bibd_10x, 'b700': bibd_70x}
def _get_grouped_records_using_caches(brr, *args):
    '''
    Cache-based counterpart of _get_grouped_records_from_db.

    @param brr: signature (mark, ref, rec)
    @param args: tags whose values are wanted
    @return: dictionary {tag: [values]} holding every requested tag; all
        lists are empty if the signature cannot be resolved in the cache
    '''
    try:
        # c: cached mapping of the record; fn: field numbers under which
        # the bibref appears on that record.
        c = MARC_100_700_CACHE['brb%s' % str(brr[0])][brr[2]]
        fn = c['id'][brr[1]]
    except KeyError:
        return dict((arg, []) for arg in args)

    if not fn or len(fn) > 1:
        #if len fn > 1 it's BAD: the same signature is at least twice on the same paper.
        #Let's default to nothing, to be on the safe side.
        return dict((arg, []) for arg in args)

    # All bibxxx ids sharing the field number(s) with the signature.
    ids = set(chain(*(c['fn'][i] for i in fn)))
    # (tag, value) pairs for those ids.
    tuples = [MARC_100_700_CACHE['b%s' % str(brr[0])][i] for i in ids]
    results = {}
    for t in tuples:
        # Requested tags occurring as a substring of this row's tag.
        present = [x for x in args if x in t[0]]
        assert len(present) <= 1
        if present:
            arg = present[0]
            try:
                results[arg].append(t[1])
            except KeyError:
                results[arg] = [t[1]]

    # Make sure every requested tag is present in the answer.
    for arg in args:
        if arg not in results.keys():
            results[arg] = []
    return results
def _get_grouped_records_from_db(bibrefrec, *args):
    '''
    For a given bibrefrec (mark:ref,rec) this function scans the
    bibmarkx table and extracts all values whose tag matches one of args
    and which are grouped together (same field number) with this bibref.

    Returns a dictionary { tag : [extracted_values] }; the list is empty
    if no value is found.

    @type bibrefrec: (mark(int), ref(int), rec(int))
    '''
    table, ref, rec = bibrefrec

    # e.g. mark 100 -> "bib10x" and "bibrec_bib10x"
    target_table = "bib%sx" % (str(table)[:-1])
    mapping_table = "bibrec_%s" % target_table

    # rec and ref are formatted with %d, so they must be numeric here.
    group_id = run_sql("SELECT field_number "
                       "FROM %s "
                       "WHERE id_bibrec = %d "
                       "AND id_bibxxx = %d" %
                       (mapping_table, rec, ref))

    if len(group_id) == 0:
        # unfortunately the mapping is not found, so
        # we cannot find anything
        return dict((arg, []) for arg in args)
    elif len(group_id) == 1:
        # All is fine
        field_number = group_id[0][0]
    else:
        # sounds bad, but ignore the error
        field_number = min(x[0] for x in group_id)

    grouped = run_sql("SELECT id_bibxxx "
                      "FROM %s "
                      "WHERE id_bibrec = %d "
                      "AND field_number = %d" %
                      (mapping_table, rec, int(field_number)))

    # NOTE(review): the condition checks for at least one row, while the
    # message talks about "at most one" -- confirm which one is intended.
    assert len(grouped) > 0, "There should be a most one grouped value per tag."
    grouped_s = list_2_SQL_str(grouped, lambda x: str(x[0]))

    ret = {}
    for arg in args:
        # Substring match on the tag: '%%' becomes a literal '%' wildcard
        # after the string formatting below.
        qry = run_sql("SELECT value "
                      "FROM %s "
                      "WHERE tag LIKE '%%%s%%' "
                      "AND id IN %s" %
                      (target_table, arg, grouped_s))
        ret[arg] = [q[0] for q in qry]

    return ret
def get_grouped_records(bibrefrec, *args):
    '''
    Returns the values of the given tags grouped with a signature, using
    the MARC cache when it is populated and the database otherwise.

    With bconfig.DEBUG_CHECKS enabled, the cached answer is asserted to
    equal the database answer.

    @param bibrefrec: signature (mark, ref, rec)
    @param args: tags whose values are wanted
    @return: dictionary {tag: [values]}
    '''
    if not MARC_100_700_CACHE:
        return _get_grouped_records_from_db(bibrefrec, *args)

    if bconfig.DEBUG_CHECKS:
        cached = _get_grouped_records_using_caches(bibrefrec, *args)
        stored = _get_grouped_records_from_db(bibrefrec, *args)
        assert cached == stored
    return _get_grouped_records_using_caches(bibrefrec, *args)
def get_person_with_extid(idd, match_tag=False):
    '''
    Finds the persons carrying a given external identifier.

    @param idd: value of the external identifier
    @param match_tag: if given, the kind of identifier: only rows tagged
        'extid:<match_tag>' are considered; otherwise any row of
        aidPERSONIDDATA whose data equals idd matches
    @return: set of rows (personid,)
    '''
    query = "select personid from aidPERSONIDDATA where data=%s"
    params = [idd]

    if match_tag:
        # The former string formatting applied % before +, producing
        # "tag = 'extid:'<tag>", which is not valid SQL; bind the full
        # tag as a parameter instead.
        query += " and tag = %s"
        params.append('extid:' + match_tag)

    pids = run_sql(query, tuple(params))
    return set(pids)
def get_inspire_id(p):
    '''
    Gets the inspire id values for a signature (bibref_table, bibref_value, bibrec)

    @param p: the signature
    @return: list of values found under the '<bibref_table>__i' tag
        grouped with the signature
    '''
    mark = str(p[0])
    inspire_tag = mark + '__i'
    grouped = get_grouped_records((mark, p[1], p[2]), inspire_tag)
    # a single tag was requested, so it is the only key of the answer
    return grouped[inspire_tag]
def get_claimed_papers_from_papers(papers):
    '''
    Given a set of papers it returns the subset having a row with
    flag = 1 in aidPERSONIDPAPERS.

    @param papers: set of papers
    @type papers: frozenset
    @return: rows of (bibrec,); an empty tuple if papers is empty
    @rtype: tuple
    '''
    if not papers:
        # "bibrec in ()" is a SQL syntax error; nothing can match anyway.
        return ()

    papers_s = list_2_SQL_str(papers)
    claimed_papers = run_sql("select bibrec from aidPERSONIDPAPERS "
                             "where bibrec in %s and flag = 1" % papers_s)
    return claimed_papers
def collect_personID_external_ids_from_papers(personid, limit_to_claimed_papers=False):
    '''
    Gathers the external identifiers found on the papers of a person.

    @param personid: id of the person
    @param limit_to_claimed_papers: consider only papers with flag > 1
        instead of all papers with flag > -2
    @return: dictionary {identifier kind: set of values}; it contains
        'INSPIREID' only when COLLECT_INSPIRE_ID is set
    '''
    gathered_ids = {}

    flag = 1 if limit_to_claimed_papers else -2
    person_papers = run_sql("select bibref_table,bibref_value,bibrec from aidPERSONIDPAPERS where "
                            "personid=%s and flag > %s", (personid, flag))

    if COLLECT_INSPIRE_ID:
        found = [get_inspire_id(paper) for paper in person_papers]
        # only the first value of each non-empty result is kept
        gathered_ids['INSPIREID'] = set(extid[0] for extid in found if extid)

    # ORCID and arXiv ids (COLLECT_ORCID / COLLECT_ARXIV_ID with get_orcid /
    # get_arxiv_id) would be gathered the same way under 'ORCID' and
    # 'ARXIVID'; that code is disabled.
    return gathered_ids
def update_personID_external_ids(persons_list=None, overwrite=False,
                                 limit_to_claimed_papers=False, force_cache_tables=False):
    '''
    Stores for each person the external identifiers found on its papers.

    @param persons_list: persons to process; all persons of
        aidPERSONIDPAPERS when empty
    @param overwrite: remove the identifiers currently stored before
        adding the collected ones
    @param limit_to_claimed_papers: passed on to
        collect_personID_external_ids_from_papers
    @param force_cache_tables: populate the partial MARC caches before the
        run and destroy them afterwards
    '''
    if force_cache_tables:
        populate_partial_marc_caches()

    if not persons_list:
        persons_list = set([row[0] for row in run_sql('select personid from aidPERSONIDPAPERS')])

    for position, pid in enumerate(persons_list):
        progress = float(position) / float(len(persons_list))
        update_status(progress, "Updating external ids...")

        collected = collect_personID_external_ids_from_papers(
            pid, limit_to_claimed_papers=limit_to_claimed_papers)
        present = get_personiID_external_ids(pid)

        if overwrite:
            for kind in present.keys():
                for stored_value in present[kind]:
                    remove_personID_external_id(pid, kind, value=stored_value)
            present = {}

        for kind in collected.keys():
            for new_value in collected[kind]:
                already_stored = kind in present and new_value in present[kind]
                if not already_stored:
                    add_personID_external_id(pid, kind, new_value)

    if force_cache_tables:
        destroy_partial_marc_caches()

    update_status_final("Updating external ids finished.")
def _get_name_by_bibrecref_from_db(bib):
    '''
    Reads from the database the author name a bibref points to.

    @param bib: bibrefrec or bibref
    @type bib: (mark, bibref, bibrec) OR (mark, bibref)
    @return: the value of the '<mark>__a' field with the given id
    @raise AssertionError: if there is not exactly one such field
    '''
    # e.g. mark 100 -> table "bib10x", tag "100__a"
    table = "bib%sx" % str(bib[0])[:-1]
    refid = bib[1]
    tag = "%s__a" % str(bib[0])

    # A table name cannot be bound as a parameter; id and tag can, which
    # leaves their quoting to the database driver.
    ret = run_sql("select value from %s where id = %%s and tag = %%s" % table,
                  (refid, tag))

    assert len(ret) == 1, "A bibrefrec must have exactly one name(%s)" % str(bib)
    return ret[0][0]
def _get_name_by_bibrecref_from_cache(bib):
    '''
    Reads from MARC_100_700_CACHE the author name a bibref points to.

    @param bib: bibrefrec or bibref
    @type bib: (mark, bibref, bibrec) OR (mark, bibref)
    @return: the cached value, or None if the cached tag does not contain
        '<mark>__a'
    @raise Exception: if the bibref is missing from the cache
    '''
    cache_key = "b%s" % bib[0]
    refid = bib[1]
    wanted_tag = "%s__a" % str(bib[0])

    name = None
    try:
        cached = MARC_100_700_CACHE[cache_key][refid]
        if wanted_tag in cached[0]:
            name = cached[1]
    except (KeyError, IndexError) as error:
        #The GC did run and the table is not clean?
        #We might want to allow empty response here
        raise Exception(str(bib) + str(error))

    if bconfig.DEBUG_CHECKS:
        assert name == _get_name_by_bibrecref_from_db(bib)
    return name
def get_name_by_bibrecref(bib):
    '''
    Returns the author name a bibref points to, using the MARC cache when
    it is populated and the database otherwise.

    With bconfig.DEBUG_CHECKS enabled, the cached answer is asserted to
    equal the database answer.

    @param bib: bibrefrec or bibref
    @type bib: (mark, bibref, bibrec) OR (mark, bibref)
    '''
    if not MARC_100_700_CACHE:
        return _get_name_by_bibrecref_from_db(bib)

    if bconfig.DEBUG_CHECKS:
        cached = _get_name_by_bibrecref_from_cache(bib)
        stored = _get_name_by_bibrecref_from_db(bib)
        assert cached == stored
    return _get_name_by_bibrecref_from_cache(bib)
def get_collaboration(bibrec):
    '''
    Returns the collaborations (710__g values) of a record.

    @param bibrec: record id
    @return: list of values; an empty tuple if the record has no bib71x
        fields at all
    '''
    linked = run_sql("select id_bibxxx from bibrec_bib71x where id_bibrec = %s",
                     (str(bibrec),))
    if len(linked) == 0:
        return ()

    id_list = list_2_SQL_str(linked, lambda row: str(row[0]))
    rows = run_sql("select value from bib71x where id in %s and tag like '%s'"
                   % (id_list, "710__g"))
    return [row[0] for row in rows]
def get_key_words(bibrec):
    '''
    Returns the keywords of a record.

    On ADS sites (bconfig.CFG_ADS_SITE) keywords are read from the 6531_a
    fields, elsewhere from the 695__a fields.

    @param bibrec: record id
    @return: list of values; an empty tuple if the record has no fields in
        the relevant table at all
    '''
    # The ids come from bibrec_bib65x on ADS sites, so the values must be
    # read from bib65x as well; they used to be looked up in bib69x.
    if bconfig.CFG_ADS_SITE:
        table, tag = "bib65x", "6531_a"
    else:
        table, tag = "bib69x", "695__a"

    bibxxx = run_sql("select id_bibxxx from bibrec_%s where id_bibrec = %%s" % table,
                     (str(bibrec),))
    if len(bibxxx) == 0:
        return ()

    bibxxx = list_2_SQL_str(bibxxx, lambda x: str(x[0]))
    ret = run_sql("select value from %s where id in %s and tag like '%s'"
                  % (table, bibxxx, tag))
    return [r[0] for r in ret]
def get_all_authors(bibrec):
    '''
    Returns the names of all authors of a record: the 100__a values
    followed by the 700__a values.

    @param bibrec: record id
    @return: list of name strings
    '''
    linked_100 = run_sql("select id_bibxxx from bibrec_bib10x where id_bibrec = %s",
                         (str(bibrec),))
    linked_700 = run_sql("select id_bibxxx from bibrec_bib70x where id_bibrec = %s",
                         (str(bibrec),))

    first_authors = []
    if linked_100:
        ids_100 = list_2_SQL_str(linked_100, lambda row: str(row[0]))
        first_authors = run_sql("select value from bib10x where tag = '%s' and id in %s"
                                % ('100__a', ids_100,))

    other_authors = []
    if linked_700:
        ids_700 = list_2_SQL_str(linked_700, lambda row: str(row[0]))
        other_authors = run_sql("select value from bib70x where tag = '%s' and id in %s"
                                % ('700__a', ids_700,))

    names = [row[0] for row in first_authors]
    names.extend(row[0] for row in other_authors)
    return names
def get_title_from_rec(rec):
    """
    Returns the title (first 245__a value) of a record.

    @param rec: record id
    @return: the title string, or None if the record has none
    """
    linked = run_sql("SELECT id_bibxxx "
                     "FROM bibrec_bib24x "
                     "WHERE id_bibrec = %s",
                     (rec,))
    if not linked:
        return None

    id_list = list_2_SQL_str(linked, lambda row: row[0])
    titles = run_sql("SELECT value "
                     "FROM bib24x "
                     "WHERE id in %s "
                     "AND tag = '245__a'"
                     % id_list)
    if not titles:
        return None
    return titles[0][0]
def get_bib10x():
    '''
    Returns all the 100__a fields.

    @return: rows of (id, value) from bib10x
    '''
    query = "select id, value from bib10x where tag like %s"
    return run_sql(query, ("100__a",))
def get_bib70x():
    '''
    Returns all the 700__a fields.

    @return: rows of (id, value) from bib70x
    '''
    query = "select id, value from bib70x where tag like %s"
    return run_sql(query, ("700__a",))
class Bib_matrix(object):
    '''
    This small class contains the sparse matrix
    and encapsulates it.

    Values are stored for unordered pairs of bibs. Each bib is mapped to
    an index by _bibmap, and the pair (i, j) with i < j lives in row
    i + j * (j - 1) / 2 of a two-column float array. The special values
    None, '+' and '-' are encoded as the negative numbers listed in
    special_symbols.
    '''
    # please increment this value every time you
    # change the output of the comparison functions
    current_comparison_version = 10

    __special_items = ((None, -3.), ('+', -2.), ('-', -1.))
    # symbol -> number stored in the matrix, and the reverse mapping
    special_symbols = dict((x[0], x[1]) for x in __special_items)
    special_numbers = dict((x[1], x[0]) for x in __special_items)

    def __init__(self, cluster_set=None):
        '''
        @param cluster_set: if given, an object whose all_bibs() yields the
            bibs to index; an empty matrix for all their pairs is allocated.
            Without it only an empty bib map is created (no matrix).
        '''
        if cluster_set:
            self._bibmap = dict((b[1], b[0]) for b in enumerate(cluster_set.all_bibs()))
            width = len(self._bibmap)
            # number of unordered pairs; floor division keeps it an integer
            size = ((width - 1) * width) // 2
            self._matrix = Bib_matrix.create_empty_matrix(size)
        else:
            self._bibmap = dict()

        self.creation_time = get_sql_time()

    @staticmethod
    def create_empty_matrix(lenght):
        '''
        Returns a (lenght, 2) float array filled with the code of None.
        '''
        ret = numpy.ndarray(shape=(lenght, 2), dtype=float, order='C')
        ret.fill(Bib_matrix.special_symbols[None])
        return ret

    def _resolve_entry(self, bibs):
        '''
        Returns the matrix row of a pair of two distinct, known bibs.
        '''
        assert len(bibs) == 2
        first = self._bibmap[bibs[0]]
        second = self._bibmap[bibs[1]]
        if first > second:
            first, second = second, first
        assert first < second
        # floor division: the result is used as an array index
        return first + ((second - 1) * second) // 2

    def __setitem__(self, bibs, val):
        '''
        Stores val for the pair; special symbols are stored as their code.
        '''
        entry = self._resolve_entry(bibs)
        self._matrix[entry] = Bib_matrix.special_symbols.get(val, val)

    def __getitem__(self, bibs):
        '''
        Returns the special symbol stored for the pair, or the stored
        2-tuple of floats.
        '''
        entry = self._resolve_entry(bibs)
        ret = tuple(self._matrix[entry])
        return Bib_matrix.special_numbers.get(ret[0], ret)

    def __contains__(self, bib):
        return bib in self._bibmap

    def get_keys(self):
        '''
        Returns the bibs known to the matrix.
        '''
        return self._bibmap.keys()

    @staticmethod
    def get_file_dir(name):
        '''
        Returns the directory holding the files of the matrix called name:
        a subdirectory of bconfig.TORTOISE_FILES_PATH named after the
        first two characters of name.
        '''
        sub_dir = name[:2]
        if not sub_dir:
            sub_dir = "empty_last_name"
        return "%s%s/" % (bconfig.TORTOISE_FILES_PATH, sub_dir)

    @staticmethod
    def get_map_path(dir_path, name):
        return "%s%s.bibmap" % (dir_path, name)

    @staticmethod
    def get_matrix_path(dir_path, name):
        return "%s%s.npy" % (dir_path, name)

    def load(self, name, load_map=True, load_matrix=True):
        '''
        Loads the bib map and/or the matrix stored under name.

        @return: True on success. False if the files are missing or
            unreadable, or if the stored map was produced by another
            comparison version; the bib map is then reset to empty.
        '''
        files_dir = Bib_matrix.get_file_dir(name)

        if not os.path.isdir(files_dir):
            self._bibmap = dict()
            return False

        try:
            if load_map:
                # NOTE: pickle must only be used on trusted files; these are
                # expected to be written by store() below.
                with open(Bib_matrix.get_map_path(files_dir, name), 'rb') as map_file:
                    bibmap_v = cPickle.load(map_file)
                rec_v, self.creation_time, self._bibmap = bibmap_v
                if (rec_v != Bib_matrix.current_comparison_version or
                    Bib_matrix.current_comparison_version < 0):
                    # you can use negative
                    # version to recalculate
                    self._bibmap = dict()
                    return False

            if load_matrix:
                self._matrix = numpy.load(Bib_matrix.get_matrix_path(files_dir, name))

        except (IOError, UnpicklingError):
            if load_map:
                self._bibmap = dict()
                self.creation_time = get_sql_time()
            return False

        return True

    def store(self, name):
        '''
        Writes the bib map (with comparison version and creation time) and
        the matrix to the files belonging to name, creating the directory
        if needed.
        '''
        files_dir = Bib_matrix.get_file_dir(name)
        if not os.path.isdir(files_dir):
            try:
                os.mkdir(files_dir)
            except OSError as e:
                # Another process may have created the directory meanwhile
                # (errno 17 is EEXIST); anything else is a real error.
                if not (e.errno == 17 or 'file exists' in str(e.strerror).lower()):
                    raise

        bibmap_v = (Bib_matrix.current_comparison_version, self.creation_time, self._bibmap)
        # Files are opened in binary mode and closed deterministically.
        with open(Bib_matrix.get_map_path(files_dir, name), 'wb') as map_file:
            cPickle.dump(bibmap_v, map_file)

        with open(Bib_matrix.get_matrix_path(files_dir, name), "wb") as matrix_file:
            numpy.save(matrix_file, self._matrix)
def delete_paper_from_personid(rec):
    '''
    Removes from aidPERSONIDPAPERS every row referring to a given paper.

    @param rec: record id
    '''
    query = "delete from aidPERSONIDPAPERS where bibrec = %s"
    run_sql(query, (rec,))
def get_signatures_from_rec(bibrec):
    '''
    Retrieves the rows of aidPERSONIDPAPERS referring to a given bibrec.

    @param bibrec: record id
    @return: rows of (personid, bibref_table, bibref_value, bibrec, name)
    '''
    query = ("select personid, bibref_table, bibref_value, bibrec, name "
             "from aidPERSONIDPAPERS where bibrec = %s")
    return run_sql(query, (bibrec,))
def modify_signature(oldref, oldrec, newref, newname):
    '''
    Points an existing signature of aidPERSONIDPAPERS to another bibref
    and name.

    @param oldref: current (bibref_table, bibref_value)
    @param oldrec: record of the signature
    @param newref: new (bibref_table, bibref_value)
    @param newname: new name
    @return: whatever run_sql returns for the UPDATE
    '''
    new_table, new_value = str(newref[0]), newref[1]
    old_table, old_value = str(oldref[0]), oldref[1]

    return run_sql("UPDATE aidPERSONIDPAPERS "
                   "SET bibref_table = %s, bibref_value = %s, name = %s "
                   "WHERE bibref_table = %s AND bibref_value = %s AND bibrec = %s",
                   (new_table, new_value, newname, old_table, old_value, oldrec))
def find_pids_by_name(name):
    '''
    Finds the persons having a name which starts with '<name>,'.

    @param name: the prefix, matched up to a comma
    @return: set of (personid, name) rows
    '''
    pattern = name + ',%'
    rows = run_sql("SELECT personid, name "
                   "FROM aidPERSONIDPAPERS "
                   "WHERE name like %s",
                   (pattern,))
    return set(rows)
def find_pids_by_exact_name(name):
    """
    Finds the persons having exactly the given name.

    @param name: the full name string
    @return: set of (personid,) rows
    """
    rows = run_sql("SELECT personid "
                   "FROM aidPERSONIDPAPERS "
                   "WHERE name = %s",
                   (name,))
    return set(rows)
def remove_sigs(signatures):
    '''
    Removes the given signatures from aidPERSONIDPAPERS, one query each.

    @param signatures: iterable of (bibref_table, bibref_value, bibrec)
    '''
    query = ("DELETE FROM aidPERSONIDPAPERS "
             "WHERE bibref_table like %s AND bibref_value = %s AND bibrec = %s")
    for signature in signatures:
        table, ref, rec = str(signature[0]), signature[1], signature[2]
        run_sql(query, (table, ref, rec))
def remove_personid_papers(pids):
    '''
    Removes from aidPERSONIDPAPERS all signatures of the given persons.
    Does nothing when pids is empty.

    @param pids: collection of person ids
    '''
    if not pids:
        return

    pid_list = list_2_SQL_str(pids)
    run_sql("delete from aidPERSONIDPAPERS where personid in %s" % pid_list)
def get_full_personid_papers(table_name="`aidPERSONIDPAPERS`"):
    '''
    Reads every row of aidPERSONIDPAPERS, or of another table with the
    same structure.

    @param table_name: table to read; inserted verbatim into the query
    @return: rows of (personid, bibref_table, bibref_value, bibrec, name,
        flag, lcul)
    '''
    query = ("select personid, bibref_table, "
             "bibref_value, bibrec, name, flag, "
             "lcul from %s" % table_name)
    return run_sql(query)
def get_full_results():
    '''
    Deprecated, should be removed soon.
    Reads every row of aidRESULTS.

    @return: rows of (personid, bibref_table, bibref_value, bibrec)
    '''
    query = ("select personid, bibref_table, bibref_value, bibrec "
             "from aidRESULTS")
    return run_sql(query)
def get_lastname_results(last_name):
    '''
    Returns rows from aidRESULTS which share a common last name, i.e.
    whose personid starts with '<last_name>.'.

    The pattern is passed as a query parameter, so quotes in the last name
    can neither break nor alter the statement.

    @param last_name: the last name used as personid prefix
    @return: rows of (personid, bibref_table, bibref_value, bibrec)
    '''
    return run_sql("select personid, bibref_table, bibref_value, bibrec "
                   "from aidRESULTS "
                   "where personid like %s",
                   (last_name + ".%",))
def get_full_personid_data(table_name="`aidPERSONIDDATA`"):
    '''
    Reads every row of aidPERSONIDDATA, or of another table with the
    same structure.

    @param table_name: table to read; inserted verbatim into the query
    @return: rows of (personid, tag, data, opt1, opt2, opt3)
    '''
    query = ("select personid, tag, data, "
             "opt1, opt2, opt3 from %s" % table_name)
    return run_sql(query)
def get_specific_personid_full_data(pid):
    '''
    Reads all the aidPERSONIDDATA rows of one person.

    @param pid: id of the person
    @return: rows of (personid, tag, data, opt1, opt2, opt3)
    '''
    query = ("select personid, tag, data, "
             "opt1, opt2, opt3 from aidPERSONIDDATA where personid=%s")
    return run_sql(query, (pid,))
def get_canonical_names_by_pid(pid):
    '''
    Reads the aidPERSONIDDATA entries of a person tagged 'canonical_name'.

    @param pid: id of the person
    @return: rows of (data,)
    '''
    query = ("select data "
             "from aidPERSONIDDATA where personid=%s and tag=%s")
    return run_sql(query, (pid, "canonical_name"))
def get_orcids_by_pids(pid):
    '''
    Reads the aidPERSONIDDATA entries of a person tagged 'extid:ORCID'.

    @param pid: id of the person
    @return: rows of (data,)
    '''
    query = ("select data "
             "from aidPERSONIDDATA where personid=%s and tag=%s")
    return run_sql(query, (pid, "extid:ORCID"))
def get_inspire_ids_by_pids(pid):
    '''
    Reads the aidPERSONIDDATA entries of a person tagged 'extid:INSPIREID'.

    @param pid: id of the person
    @return: rows of (data,)
    '''
    query = ("select data "
             "from aidPERSONIDDATA where personid=%s and tag=%s")
    return run_sql(query, (pid, "extid:INSPIREID"))
def get_uids_by_pids(pid):
    '''
    Reads the aidPERSONIDDATA entries of a person tagged 'uid'.

    @param pid: id of the person
    @return: rows of (data,)
    '''
    query = ("select data "
             "from aidPERSONIDDATA where personid=%s and tag=%s")
    return run_sql(query, (pid, "uid"))
def get_name_string_to_pid_dictionary():
    '''
    Builds a dictionary which maps each name string of aidPERSONIDPAPERS
    to the set of person ids carrying it.

    @return: {name: set of personids}
    '''
    pids_by_name = {}
    for personid, name in run_sql("select personid,name from aidPERSONIDPAPERS"):
        if name not in pids_by_name:
            pids_by_name[name] = set()
        pids_by_name[name].add(personid)
    return pids_by_name
#could be useful to optimize rabbit, still unused and untested, watch out.
def get_bibrecref_to_pid_dictuonary():
    '''
    Build a dictionary which maps every signature found in
    aidPERSONIDPAPERS to the set of person ids attached to it.

    @return: {(bibref_table, bibref_value, bibrec): set([personid, ...])}
    '''
    brr2pid = {}
    # The column list used to read "bibref_value.bibrec" (a period instead of
    # a comma), which asks for column `bibrec` of a table `bibref_value`
    # instead of selecting the two columns.
    all_brr = run_sql("select personid,bibref_table,bibref_value,bibrec "
                      "from aidPERSONIDPAPERS")
    for row in all_brr:
        # row[0] is the personid, the rest of the row is the signature.
        brr2pid.setdefault(tuple(row[1:]), set()).add(row[0])
    return brr2pid
def check_personid_papers(output_file=None):
    '''
    Checks all invariants of personid.

    Messages are written to output_file when one is given, otherwise
    they are reported through bibauthor_print.

    @param output_file: path of the file receiving the report, or None
    @type output_file: str or None

    @return: True if every checker reported a consistent state
    '''
    fp = None
    if output_file:
        fp = open(output_file, "w")
        printer = lambda x: fp.write(x + '\n')
    else:
        printer = bibauthor_print

    checkers = (check_wrong_names,
                check_duplicated_papers,
                check_duplicated_signatures,
                check_wrong_rejection,
                check_canonical_names,
                check_empty_personids
                #check_claim_inspireid_contradiction
                )

    try:
        # Every checker must run: the results are collected in a list first,
        # because chaining the calls with and/or would short-circuit and
        # skip some of them.
        return all([check(printer) for check in checkers])
    finally:
        # The report file used to be left open.
        if fp is not None:
            fp.close()
def repair_personid(output_file=None):
    '''
    This should make check_personid_papers() to return true.

    Runs all the checkers three times: a plain check, a pass with
    repair=True and a final plain check whose outcome is returned.

    @param output_file: path of the file receiving the report, or None
        to report through bibauthor_print
    @type output_file: str or None

    @return: True if the final check found no problem
    '''
    fp = None
    if output_file:
        fp = open(output_file, "w")
        printer = lambda x: fp.write(x + '\n')
    else:
        printer = bibauthor_print

    checkers = (check_wrong_names,
                check_duplicated_papers,
                check_duplicated_signatures,
                check_wrong_rejection,
                check_canonical_names,
                check_empty_personids
                #check_claim_inspireid_contradiction
                )

    try:
        first_check = [check(printer) for check in checkers]
        repair_pass = [check(printer, repair=True) for check in checkers]
        last_check = [check(printer) for check in checkers]
    finally:
        # The report file used to be left open.
        if fp is not None:
            fp.close()

    if not all(first_check):
        # Sanity checks: the repair pass must have seen the same problems
        # and the final pass must be clean.
        assert not (all(repair_pass))
        assert all(last_check)

    return all(last_check)
def check_duplicated_papers(printer, repair=False):
    '''
    Check that no person holds the same bibrec more than once among the
    aidPERSONIDPAPERS rows whose flag differs from -2.

    @param printer: callable receiving the report messages
    @param repair: when True, remove the duplicates and hand the bibrecs
        which were completely deleted over to rabbit
    @type repair: bool

    @return: True if no duplicate was found
    '''
    all_ok = True
    bibrecs_to_reassign = []

    recs = run_sql("select personid,bibrec from aidPERSONIDPAPERS where flag <> %s", (-2,))
    d = {}
    for x, y in recs:
        d.setdefault(x, []).append(y)

    for pid, bibrec in d.iteritems():
        if len(bibrec) == len(set(bibrec)):
            continue

        all_ok = False
        dups = sorted(bibrec)
        # Keep every element equal to its successor: a bibrec held n times
        # shows up n - 1 times in the report.
        dups = [x for i, x in enumerate(dups[0:len(dups) - 1]) if x == dups[i + 1]]
        printer("Person %d has duplicated papers: %s" % (pid, dups))

        if not repair:
            continue

        # Repair each bibrec once only (as check_duplicated_signatures does):
        # a bibrec held three or more times used to be processed and queued
        # for reassignment several times.
        for dupbibrec in sorted(set(dups)):
            printer("Repairing duplicated bibrec %s" % str(dupbibrec))
            involved_claimed = run_sql("select personid,bibref_table,bibref_value,bibrec,flag "
                                       "from aidPERSONIDPAPERS where personid=%s and bibrec=%s "
                                       "and flag >= 2", (pid, dupbibrec))
            if len(involved_claimed) != 1:
                # No unique row with flag >= 2: drop everything and let
                # rabbit assign the paper again.
                bibrecs_to_reassign.append(dupbibrec)
                run_sql("delete from aidPERSONIDPAPERS where personid=%s and bibrec=%s",
                        (pid, dupbibrec))
            else:
                # Exactly one row with flag >= 2: keep it and delete the
                # rows with a lower flag.
                involved_not_claimed = run_sql("select personid,bibref_table,bibref_value,bibrec,flag "
                                               "from aidPERSONIDPAPERS where personid=%s and bibrec=%s "
                                               "and flag < 2", (pid, dupbibrec))
                for v in involved_not_claimed:
                    run_sql("delete from aidPERSONIDPAPERS where personid=%s and bibref_table=%s "
                            "and bibref_value=%s and bibrec=%s and flag=%s", (v))

    if repair and bibrecs_to_reassign:
        printer("Reassigning deleted bibrecs %s" % str(bibrecs_to_reassign))
        from invenio.bibauthorid_rabbit import rabbit
        rabbit(bibrecs_to_reassign)

    return all_ok
def check_duplicated_signatures(printer, repair=False):
    '''
    Check that no signature (bibref_table, bibref_value, bibrec) appears
    more than once among the aidPERSONIDPAPERS rows with flag > -2.

    @param printer: callable receiving the report messages
    @param repair: when True, delete the duplicated rows and hand the
        bibrecs which lost all their rows over to rabbit
    @type repair: bool

    @return: True if no duplicated signature was found
    '''
    all_ok = True
    signatures = run_sql("select bibref_table, bibref_value, bibrec "
                         "from aidPERSONIDPAPERS where flag > %s", ("-2",))
    bibrecs_to_reassign = []

    # bibrec -> list of (bibref_table, bibref_value)
    refs_by_rec = {}
    for table, ref, rec in signatures:
        refs_by_rec.setdefault(rec, []).append((table, ref))

    for bibrec, bibrefs in refs_by_rec.iteritems():
        if len(bibrefs) == len(set(bibrefs)):
            continue

        all_ok = False
        ordered = sorted(bibrefs)
        # Every element equal to its successor is a duplicate.
        dups = [ref for pos, ref in enumerate(ordered[:-1])
                if ref == ordered[pos + 1]]
        printer("Paper %d has duplicated signatures: %s" % (bibrec, dups))

        if not repair:
            continue

        for dup in set(dups):
            printer("Repairing duplicate %s" % str(dup))
            sig = (dup[0], dup[1], bibrec)
            claimed = run_sql("select personid,bibref_table,bibref_value,bibrec from "
                              "aidPERSONIDPAPERS where bibref_table=%s and bibref_value=%s "
                              "and bibrec=%s and flag=2", sig)
            if len(claimed) == 1:
                # A single row with flag 2 exists: only the rows with a
                # lower flag are removed.
                run_sql("delete from aidPERSONIDPAPERS where bibref_table=%s and bibref_value=%s "
                        "and bibrec=%s and flag<2", sig)
            else:
                bibrecs_to_reassign.append(bibrec)
                run_sql("delete from aidPERSONIDPAPERS where bibref_table=%s and "
                        "bibref_value = %s and bibrec = %s", sig)

    if repair and bibrecs_to_reassign:
        printer("Reassigning deleted duplicates %s" % str(bibrecs_to_reassign))
        from invenio.bibauthorid_rabbit import rabbit
        rabbit(bibrecs_to_reassign)

    return all_ok
def get_wrong_names():
    '''
    Find the names stored in aidPERSONIDPAPERS which differ from the
    normalized name computed from the rows of get_bib10x()/get_bib70x().

    @return: a pair (iterator, total); the iterator yields
        (table, ref, correct_name) tuples, where correct_name is None when
        the ref is unknown to the bibliographic table, and total is the
        number of tuples
    '''
    def normalized_names(rows):
        # ref -> normalized name
        return dict((row[0], create_normalized_name(split_name_parts(row[1])))
                    for row in rows)

    def mismatching(table, stored, expected):
        return set((table, ref, expected.get(ref, None))
                   for ref, name in stored
                   if name != expected.get(ref, None))

    bib100 = normalized_names(get_bib10x())
    bib700 = normalized_names(get_bib70x())

    pidnames100 = set(run_sql("select bibref_value, name from aidPERSONIDPAPERS "
                              " where bibref_table='100'"))
    pidnames700 = set(run_sql("select bibref_value, name from aidPERSONIDPAPERS "
                              " where bibref_table='700'"))

    wrong100 = mismatching('100', pidnames100, bib100)
    wrong700 = mismatching('700', pidnames700, bib700)

    return chain(wrong100, wrong700), len(wrong100) + len(wrong700)
def check_wrong_names(printer, repair=False):
    '''
    Report the names of aidPERSONIDPAPERS returned by get_wrong_names().

    @param printer: callable receiving the report messages
    @param repair: when True, update the outdated names and delete the
        rows whose ref has no correct name
    @type repair: bool

    @return: True if no wrong name was found
    '''
    wrong_names, number = get_wrong_names()
    if number <= 0:
        return True

    printer("%d corrupted names in aidPERSONIDPAPERS." % number)
    for wrong_name in wrong_names:
        table, ref, correct = wrong_name[0], wrong_name[1], wrong_name[2]

        if correct:
            printer("Outdated name, '%s'(%s:%d)." % (correct, table, ref))
        else:
            printer("Invalid id(%s:%d)." % (table, ref))

        if not repair:
            continue

        printer("Fixing wrong name: %s" % str(wrong_name))
        if correct:
            run_sql("update aidPERSONIDPAPERS set name=%s where bibref_table=%s and bibref_value=%s",
                    (correct, table, ref))
        else:
            run_sql("delete from aidPERSONIDPAPERS where bibref_table=%s and bibref_value=%s",
                    (table, ref))

    return False
def check_canonical_names(printer, repair=False):
    '''
    Check that every person returned by get_existing_personids() which
    holds papers has exactly one canonical_name row in aidPERSONIDDATA.

    @param printer: callable receiving the report messages
    @param repair: when True, regenerate the canonical names of the
        faulty persons through update_personID_canonical_names
    @type repair: bool

    @return: True if no problem was found
    '''
    ret = True

    pid_cn = run_sql("select personid, data from aidPERSONIDDATA where tag = %s",
                     ('canonical_name',))
    # personid -> number of canonical names
    pid_2_cn = dict((k, len(list(d))) for k, d in
                    groupby(sorted(pid_cn, key=itemgetter(0)), key=itemgetter(0)))

    pid_to_repair = []
    for pid in get_existing_personids():
        canon = pid_2_cn.get(pid, 0)
        if canon == 1:
            continue

        if canon == 0:
            # A missing canonical name is only a problem for persons
            # which hold papers.
            papers = run_sql("select count(*) from aidPERSONIDPAPERS where personid = %s",
                             (pid,))[0][0]
            if papers != 0:
                printer("Personid %d does not have a canonical name, but have %d papers."
                        % (pid, papers))
                ret = False
                pid_to_repair.append(pid)
        else:
            # The arguments used to be passed after a comma instead of being
            # interpolated with %, so printer received two arguments.
            printer("Personid %d has %d canonical names." % (pid, canon))
            pid_to_repair.append(pid)
            ret = False

    if repair and not ret:
        printer("Repairing canonical names for pids: %s" % str(pid_to_repair))
        update_personID_canonical_names(pid_to_repair, overwrite=True)

    return ret
def check_empty_personids(printer, repair=False):
    '''
    Look for persons which appear in aidPERSONIDDATA only, with no row
    in aidPERSONIDPAPERS and no data other than canonical_name.

    @param printer: callable receiving the report messages
    @param repair: when True, delete the aidPERSONIDDATA rows of the
        empty persons
    @type repair: bool

    @return: True if no empty person was found
    '''
    ret = True

    paper_rows = run_sql("select personid from aidPERSONIDPAPERS")
    paper_pids = set(row[0] for row in paper_rows)
    data_rows = run_sql("select personid from aidPERSONIDDATA")
    data_pids = set(row[0] for row in data_rows)

    for p in data_pids - paper_pids:
        counted = run_sql("select count(*) from aidPERSONIDDATA where personid = %s and tag <> %s",
                          (p, "canonical_name",))
        fields = counted[0][0]
        if fields != 0:
            continue

        printer("Personid %d has no papers and nothing else than canonical_name." % p)
        if repair:
            printer("Deleting empty person %s" % str(p))
            run_sql("delete from aidPERSONIDDATA where personid=%s", (p,))
        ret = False

    return ret
def check_wrong_rejection(printer, repair=False):
    '''
    Look for two kinds of inconsistencies in aidPERSONIDPAPERS:
    signatures which only exist as rejections (flag -2) and signatures
    which the same person holds both with flag -2 and with flag > -2.

    @param printer: callable receiving the report messages
    @param repair: when True, delete the offending rows and hand the
        bibrecs over to rabbit
    @type repair: bool

    @return: True if no inconsistency was found
    '''
    all_ok = True
    to_reassign = []
    to_deal_with = []

    rejected_sigs = set(run_sql("select bibref_table, bibref_value, bibrec "
                                "from aidPERSONIDPAPERS "
                                "where flag = %s",
                                ('-2',)))
    assigned_sigs = set(run_sql("select bibref_table, bibref_value, bibrec "
                                "from aidPERSONIDPAPERS "
                                "where flag > %s",
                                ('-2',)))

    for sig in rejected_sigs - assigned_sigs:
        all_ok = False
        printer('Paper (%s:%s,%s) was rejected but never reassigned' % sig)
        to_reassign.append(sig)

    # Same comparison, this time with the person in the key.
    rejected_by_pid = set(run_sql("select personid, bibref_table, bibref_value, bibrec "
                                  "from aidPERSONIDPAPERS "
                                  "where flag = %s",
                                  ('-2',)))
    assigned_by_pid = set(run_sql("select personid, bibref_table, bibref_value, bibrec "
                                  "from aidPERSONIDPAPERS "
                                  "where flag > %s",
                                  ('-2',)))

    for conflict in rejected_by_pid & assigned_by_pid:
        all_ok = False
        printer("Conflicting assignment/rejection: %s" % str(conflict))
        to_deal_with.append(conflict)

    if not repair or not (to_reassign or to_deal_with):
        return all_ok

    from invenio.bibauthorid_rabbit import rabbit

    if to_reassign:
        #Rabbit is not designed to reassign signatures which are rejected but not assigned:
        #All signatures should stay assigned, if a rejection occurs the signature should get
        #moved to a new place and the rejection entry added, but never exist as a rejection only.
        #Hence, to force rabbit to reassign it, we have to delete the rejection.
        printer("Reassigning bibrecs with missing entries: %s" % str(to_reassign))
        for sig in to_reassign:
            run_sql("delete from aidPERSONIDPAPERS where bibref_table=%s and "
                    "bibref_value=%s and bibrec = %s and flag=-2", (sig))
        rabbit([sig[2] for sig in to_reassign])

    if to_deal_with:
        #We got claims and rejections on the same person for the same paper. Let's forget about
        #it and reassign it automatically, they'll make up their minds sooner or later.
        printer("Deleting and reassigning bibrefrecs with conflicts %s" % str(to_deal_with))
        for sig in to_deal_with:
            run_sql("delete from aidPERSONIDPAPERS where personid=%s and bibref_table=%s and "
                    "bibref_value=%s and bibrec = %s", (sig))
        rabbit([sig[3] for sig in to_deal_with])

    return all_ok
def check_merger():
    '''
    Compare aidPERSONIDPAPERS with aidPERSONIDPAPERS_copy and report, through
    bibauthor_print, the claims/rejections (flag 2 / -2) and the signatures
    which were lost or which appeared.

    This function presumes that copy_personid was
    called before the merger.

    @return: True if both tables hold the same claims and signatures
    '''
    is_ok = True
    act = { -2: 'Rejection', 2: 'Claim' }

    def claim_line(cl):
        return ("%s: personid %d %d:%d, %d\n"
                % (act[cl[4]], cl[0], int(cl[1]), cl[2], cl[3]))

    def signature_line(sig):
        return "%s:%d, %d\n" % sig

    old_claims = set(run_sql("select personid, bibref_table, bibref_value, bibrec, flag "
                             "from aidPERSONIDPAPERS_copy "
                             "where flag = -2 or flag = 2"))
    cur_claims = set(run_sql("select personid, bibref_table, bibref_value, bibrec, flag "
                             "from aidPERSONIDPAPERS "
                             "where flag = -2 or flag = 2"))

    claim_errors = ((old_claims - cur_claims, "Some claims were lost during the merge."),
                    (cur_claims - old_claims, "Some new claims appeared after the merge."))
    for err_set, err_msg in claim_errors:
        if not err_set:
            continue
        is_ok = False
        bibauthor_print(err_msg)
        bibauthor_print("".join(claim_line(cl) for cl in err_set))

    old_assigned = set(run_sql("select bibref_table, bibref_value, bibrec "
                               "from aidPERSONIDPAPERS_copy"))
    cur_assigned = set(run_sql("select bibref_table, bibref_value, bibrec "
                               "from aidPERSONIDPAPERS"))

    signature_errors = ((old_assigned - cur_assigned, "Some signatures were lost during the merge."),
                        (cur_assigned - old_assigned, "Some new signatures appeared after the merge."))
    for err_sig, err_msg in signature_errors:
        if not err_sig:
            continue
        is_ok = False
        bibauthor_print(err_msg)
        bibauthor_print("".join(signature_line(sig) for sig in err_sig))

    return is_ok
def check_results():
    '''
    Check the consistency of aidRESULTS and print the problems found:
    signatures which appear in more than one row, and clusters which hold
    the same bibrec more than once.

    @return: True if no problem was found
    '''
    is_ok = True

    all_result_rows = run_sql("select personid,bibref_table,bibref_value,bibrec from aidRESULTS")

    # Rows are grouped by signature, i.e. by everything but the personid.
    keyfunc = lambda x: x[1:]
    duplicated = (d for d in (list(d) for k, d in groupby(sorted(all_result_rows, key=keyfunc), key=keyfunc)) if len(d) > 1)

    for dd in duplicated:
        is_ok = False
        for d in dd:
            print("Duplicated row in aidRESULTS")
            print("%s %s %s %s" % d)
        print("")

    # personid (cluster name) -> list of bibrecs.  The lists are appended to
    # in place; rebuilding them with "+" for each row copied the whole list
    # every time.
    clusters = {}
    for rr in all_result_rows:
        clusters.setdefault(rr[0], []).append(rr[3])

    # cluster name -> number of superfluous bibrecs
    faulty_clusters = dict((cid, len(recs) - len(set(recs)))
                           for cid, recs in clusters.items()
                           if not len(recs) == len(set(recs)))

    if faulty_clusters:
        is_ok = False
        print("Recids NOT unique in clusters!")
        print("A total of %s clusters hold an average of %.2f duplicates"
              % (len(faulty_clusters),
                 (sum(faulty_clusters.values()) / float(len(faulty_clusters)))))

        for c in faulty_clusters:
            print("Name: %-20s Size: %4d Faulty: %2d"
                  % (c, len(clusters[c]), faulty_clusters[c]))

    return is_ok
def check_claim_inspireid_contradiction():
    '''
    Print the contradictions found between the ids stored in the 100__i /
    700__i subfields and the rows of aidPERSONIDPAPERS with flag 2.

    Two situations are printed: an id whose signatures have flag-2 rows
    for more than one personid, and a person holding a flag-2 row on a
    paper of the cluster through a signature other than the one carrying
    the id.

    NOTE(review): the name and the messages suggest that 100__i / 700__i hold
    INSPIRE ids - confirm. The function returns nothing and it is commented
    out in the checker lists of this file.
    '''
    # ids of the bibxxx rows holding an id subfield
    iids10x = run_sql("select id from bib10x where tag = '100__i'")
    iids70x = run_sql("select id from bib70x where tag = '700__i'")

    # ids of the bibxxx rows holding a name subfield
    refs10x = set(x[0] for x in run_sql("select id from bib10x where tag = '100__a'"))
    refs70x = set(x[0] for x in run_sql("select id from bib70x where tag = '700__a'"))

    if iids10x:
        iids10x = list_2_SQL_str(iids10x, lambda x: str(x[0]))

        # (id_bibxxx, id_bibrec, field_number) of every use of an id subfield
        iids10x = run_sql("select id_bibxxx, id_bibrec, field_number "
                          "from bibrec_bib10x "
                          "where id_bibxxx in %s"
                          % iids10x)

        # Lazy sequence of (id, [(ref, rec)]): the name refs which share
        # record and field number with the id subfield.
        iids10x = ((row[0], [(ref, rec) for ref, rec in run_sql(
                            "select id_bibxxx, id_bibrec "
                            "from bibrec_bib10x "
                            "where id_bibrec = '%s' "
                            "and field_number = '%s'"
                            % row[1:])
                            if ref in refs10x])
                   for row in iids10x)
    else:
        iids10x = ()

    if iids70x:
        iids70x = list_2_SQL_str(iids70x, lambda x: str(x[0]))

        iids70x = run_sql("select id_bibxxx, id_bibrec, field_number "
                          "from bibrec_bib70x "
                          "where id_bibxxx in %s"
                          % iids70x)

        # Same as above, for the 700 table.
        iids70x = ((row[0], [(ref, rec) for ref, rec in run_sql(
                            "select id_bibxxx, id_bibrec "
                            "from bibrec_bib70x "
                            "where id_bibrec = '%s' "
                            "and field_number = '%s'"
                            % (row[1:]))
                            if ref in refs70x])
                   for row in iids70x)
    else:
        iids70x = ()

    # [(iids, [bibs])]
    # Each bib becomes a full signature (table, ref, rec); the set removes
    # repeated signatures.
    inspired = list(chain(((iid, list(set(('100',) + bib for bib in bibs))) for iid, bibs in iids10x),
                          ((iid, list(set(('700',) + bib for bib in bibs))) for iid, bibs in iids70x)))

    # Every id subfield is expected to sit next to exactly one name.
    assert all(len(x[1]) == 1 for x in inspired)

    # Group by id, keeping the single signature of each entry.
    inspired = ((k, map(itemgetter(0), map(itemgetter(1), d)))
                for k, d in groupby(sorted(inspired, key=itemgetter(0)), key=itemgetter(0)))

    # [(inspireid, [bibs])]
    # For every signature, fetch the personids of its flag-2 rows.
    inspired = [([(run_sql("select personid "
                           "from aidPERSONIDPAPERS "
                           "where bibref_table = %s "
                           "and bibref_value = %s "
                           "and bibrec = %s "
                           "and flag = '2'"
                           , bib), bib)
                  for bib in cluster[1]], cluster[0])
                for cluster in inspired]

    # [([([pid], bibs)], inspireid)]
    for cluster, iid in inspired:
        # Set of the (personid,) rows found for the signatures of the cluster.
        pids = set(chain.from_iterable(imap(itemgetter(0), cluster)))

        if len(pids) > 1:
            print "InspireID: %s links the following papers:" % iid
            print map(itemgetter(1), cluster)
            print "More than one personid claimed them:"
            print list(pids)
            print
            continue

        if len(pids) == 0:
            # not even one paper with this inspireid has been
            # claimed, screw it
            continue

        # pids holds one-element rows, hence the double index.
        pid = list(pids)[0][0]

        # The last step is to check all non-claimed papers for being
        # claimed by the person on some different signature.
        problem = (run_sql("select bibref_table, bibref_value, bibrec "
                           "from aidPERSONIDPAPERS "
                           "where bibrec = %s "
                           "and personid = %s "
                           "and flag = %s"
                           , (bib[2], pid, 2))
                   for bib in (bib for lpid, bib in cluster if not lpid))
        problem = list(chain.from_iterable(problem))

        if problem:
            print "A personid has claimed a paper from an inspireid cluster and a contradictory paper."
            print "Personid %d" % pid
            print "Inspireid cluster %s" % str(map(itemgetter(1), cluster))
            print "Contradicting claims: %s" % str(problem)
            print
def get_all_bibrecs():
    '''
    Collect the distinct record ids present in aidPERSONIDPAPERS.

    @return: set of bibrecs
    '''
    rows = run_sql("select bibrec from aidPERSONIDPAPERS")
    return set(row[0] for row in rows)
def get_bibrefrec_to_pid_flag_mapping():
    '''
    Create a map between the signatures of aidPERSONIDPAPERS and the
    (personid, flag) pairs attached to them.

    @return: {(bibref_table, bibref_value, bibrec): [(personid, flag), ...]}
    '''
    whole_table = run_sql("select bibref_table,bibref_value,bibrec,personid,flag from aidPERSONIDPAPERS")

    ret = {}
    # The garbage collector is switched off while the dictionary is built.
    gc.disable()
    try:
        for x in whole_table:
            sig = (x[0], x[1], x[2])
            pid_flag = (x[3], x[4])
            # Append in place instead of rebuilding the list for every row.
            ret.setdefault(sig, []).append(pid_flag)
    finally:
        # Switch the collector back on even if the loop raised.
        gc.collect()
        gc.enable()

    return ret
def remove_all_bibrecs(bibrecs):
    '''
    Remove the given record ids from the aidPERSONIDPAPERS table.

    @param bibrecs: record ids whose rows have to be deleted
    @type bibrecs: iterable of int
    '''
    bibrecs_s = list_2_SQL_str(bibrecs)
    # Nothing to delete: "in ()" is not valid SQL. The empty rendering is
    # '()', as get_signatures_from_bibrefs also relies on.
    if bibrecs_s == '()':
        return
    run_sql("delete from aidPERSONIDPAPERS where bibrec in %s" % bibrecs_s)
def empty_results_table():
    '''
    Empty the aidRESULTS table, dropping all the stored tortoise results.
    '''
    statement = "TRUNCATE aidRESULTS"
    run_sql(statement)
def save_cluster(named_cluster):
    '''
    Store a cluster in aidRESULTS, one row per signature.

    @param named_cluster: pair (name, cluster); the name is stored in the
        personid column and the cluster exposes its signatures as
        cluster.bibs
    @type named_cluster: tuple
    '''
    name, cluster = named_cluster
    insert = ("INSERT INTO aidRESULTS "
              "(personid, bibref_table, bibref_value, bibrec) "
              "VALUES (%s, %s, %s, %s) ")
    for bib in cluster.bibs:
        table = str(bib[0])
        run_sql(insert, (name, table, bib[1], bib[2]))
def remove_result_cluster(name):
    '''
    Remove from aidRESULTS the rows whose personid matches "<name>.%".

    @param name: name string of the cluster(s) to remove
    @type name: str
    '''
    # The LIKE pattern is passed as a bound parameter instead of being
    # formatted into the statement, so that quotes in the name can neither
    # break the query nor inject SQL.
    run_sql("DELETE FROM aidRESULTS "
            "WHERE personid like %s",
            ("%s.%%" % name,))
def personid_name_from_signature(sig):
    '''
    Find the personid and the name string of a signature, looking at the
    rows with flag > -2 only.

    @param sig: signature (bibref_table, bibref_value, bibrec)
    @type sig: tuple

    @return: at most one row (personid, name); more rows trigger an
        AssertionError
    '''
    query = ("select personid, name "
             "from aidPERSONIDPAPERS "
             "where bibref_table = %s and bibref_value = %s and bibrec = %s "
             "and flag > '-2'")
    rows = run_sql(query, sig)
    assert len(rows) < 2, rows
    return rows
def personid_from_signature(sig):
    '''
    Find the personid holding a signature, together with the flag,
    looking at the rows with flag > -2 only.

    @param sig: signature (bibref_table, bibref_value, bibrec)
    @type sig: tuple

    @return: at most one row (personid, flag); more rows trigger an
        AssertionError
    '''
    query = ("select personid, flag "
             "from aidPERSONIDPAPERS "
             "where bibref_table = %s and bibref_value = %s and bibrec = %s "
             "and flag > '-2'")
    rows = run_sql(query, sig)
    assert len(rows) < 2, rows
    return rows
def get_signature_info(sig):
    '''
    Get all the (personid, flag) rows of a signature, whatever the flag,
    ordered by flag.

    @param sig: signature (bibref_table, bibref_value, bibrec)
    @type sig: tuple

    @return: rows of (personid, flag)
    '''
    query = ("select personid, flag "
             "from aidPERSONIDPAPERS "
             "where bibref_table = %s and bibref_value = %s and bibrec = %s "
             "order by flag")
    return run_sql(query, sig)
def get_claimed_papers(pid):
    '''
    Find the signatures a person holds with a flag greater than 1,
    i.e. the manually claimed ones.

    @param pid: person id
    @return: rows of (bibref_table, bibref_value, bibrec)
    '''
    query = ("select bibref_table, bibref_value, bibrec "
             "from aidPERSONIDPAPERS "
             "where personid = %s "
             "and flag > %s")
    params = (pid, 1)
    return run_sql(query, params)
def copy_personids():
    '''
    Make a copy of aidPERSONID tables to aidPERSONID*_copy tables for later comparison/restore

    Each *_copy table is dropped if it exists, created again and filled
    with all the rows of the original table.
    '''
    run_sql("DROP TABLE IF EXISTS `aidPERSONIDDATA_copy`")
    run_sql("CREATE TABLE `aidPERSONIDDATA_copy` ( "
            "`personid` BIGINT( 8 ) UNSIGNED NOT NULL , "
            "`tag` VARCHAR( 64 ) NOT NULL , "
            "`data` VARCHAR( 256 ) NOT NULL , "
            "`opt1` MEDIUMINT( 8 ) DEFAULT NULL , "
            "`opt2` MEDIUMINT( 8 ) DEFAULT NULL , "
            "`opt3` VARCHAR( 256 ) DEFAULT NULL , "
            "KEY `personid-b` ( `personid` ) , "
            "KEY `tag-b` ( `tag` ) , "
            "KEY `data-b` ( `data` ) , "
            "KEY `opt1` ( `opt1` ) "
            ") ENGINE = MYISAM DEFAULT CHARSET = utf8")
    run_sql("INSERT INTO `aidPERSONIDDATA_copy` "
            "SELECT * "
            "FROM `aidPERSONIDDATA`")

    run_sql("DROP TABLE IF EXISTS `aidPERSONIDPAPERS_copy`")
    run_sql("CREATE TABLE `aidPERSONIDPAPERS_copy` ( "
            "`personid` bigint( 8 ) unsigned NOT NULL , "
            "`bibref_table` enum( '100', '700' ) NOT NULL , "
            "`bibref_value` mediumint( 8 ) unsigned NOT NULL , "
            "`bibrec` mediumint( 8 ) unsigned NOT NULL , "
            "`name` varchar( 256 ) NOT NULL , "
            "`flag` smallint( 2 ) NOT NULL DEFAULT '0', "
            "`lcul` smallint( 2 ) NOT NULL DEFAULT '0', "
            "`last_updated` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP , "
            "KEY `personid-b` ( `personid` ) , "
            "KEY `reftable-b` ( `bibref_table` ) , "
            "KEY `refvalue-b` ( `bibref_value` ) , "
            "KEY `rec-b` ( `bibrec` ) , "
            "KEY `name-b` ( `name` ) , "
            "KEY `timestamp-b` ( `last_updated` ) , "
            "KEY `ptvrf-b` ( `personid` , `bibref_table` , `bibref_value` , `bibrec` , `flag` ) "
            ") ENGINE = MyISAM DEFAULT CHARSET = utf8")
    # The closing backtick of the table name was missing, which left the
    # quoted identifier unterminated.
    run_sql("INSERT INTO `aidPERSONIDPAPERS_copy` "
            "SELECT * "
            "FROM `aidPERSONIDPAPERS`")
def delete_empty_persons():
    '''
    Find and eliminate the empty persons, i.e. those which hold no paper
    and no information other than canonical_name.
    '''
    def person_ids(query):
        return set(row[0] for row in run_sql(query))

    with_papers = person_ids("select personid from aidPERSONIDPAPERS")
    with_data = person_ids("select personid from aidPERSONIDDATA")
    with_other_data = person_ids("select personid from aidPERSONIDDATA where tag <> 'canonical_name'")

    to_delete = with_data - (with_papers | with_other_data)
    if not to_delete:
        return

    run_sql("delete from aidPERSONIDDATA where personid in %s"
            % list_2_SQL_str(to_delete))
def restore_personids():
    '''
    Restore personid tables from last copy saved with copy_personids

    Both aidPERSONIDDATA and aidPERSONIDPAPERS are truncated and filled
    again with the rows of their *_copy table.
    '''
    run_sql("TRUNCATE `aidPERSONIDDATA`")
    run_sql("INSERT INTO `aidPERSONIDDATA` "
            "SELECT * "
            "FROM `aidPERSONIDDATA_copy`")

    run_sql("TRUNCATE `aidPERSONIDPAPERS`")
    # The closing backtick of the table name was missing, which left the
    # quoted identifier unterminated.
    run_sql("INSERT INTO `aidPERSONIDPAPERS` "
            "SELECT * "
            "FROM `aidPERSONIDPAPERS_copy`")
def resolve_affiliation(ambiguous_aff_string):
    """
    Look an affiliation string up in the ads_affiliations table.

    This is a method available in the context of author disambiguation in ADS
    only. No other platform provides the db table used by this function.

    @warning: to be used in an ADS context only.

    @param ambiguous_aff_string: Ambiguous affiliation string
    @type ambiguous_aff_string: str

    @return: the aff_id of the first matching row; the string "None" when
        the argument is empty, when bconfig.CFG_ADS_SITE is not set or
        when nothing matches
    """
    not_found = "None"

    if not (ambiguous_aff_string and bconfig.CFG_ADS_SITE):
        return not_found

    rows = run_sql("select aff_id from ads_affiliations where affstring=%s",
                   (ambiguous_aff_string,))
    if not rows:
        return not_found

    return rows[0][0]
def get_free_pids():
    '''
    Returns an endless iterator over the free personids, in increasing
    order starting from 0; the holes between used ids are yielded too.
    '''
    paper_rows = run_sql("select personid from aidPERSONIDPAPERS")
    data_rows = run_sql("select personid from aidPERSONIDDATA")
    used_pids = frozenset(row[0] for row in chain(paper_rows, data_rows))

    return (candidate for candidate in count() if candidate not in used_pids)
def remove_results_outside(many_names):
    '''
    Delete from aidRESULTS the clusters whose name (the part of personid
    before the first dot) is not in many_names.

    @param many_names: names of the clusters to keep
    @type many_names: iterable of str
    '''
    many_names = frozenset(many_names)
    res_names = frozenset(x[0].split(".")[0] for x in run_sql("select personid from aidRESULTS"))

    for name in res_names - many_names:
        # The LIKE pattern is passed as a bound parameter instead of being
        # formatted into the statement: the names come from stored data and
        # may contain quotes.
        run_sql("delete from aidRESULTS where personid like %s",
                ("%s.%%" % name,))
def get_signatures_from_bibrefs(bibrefs):
    '''
    Expand bibrefs into the signatures which use them, keeping only the
    signatures whose record is returned by get_all_valid_bibrecs().

    NOTE(review): the table part of a bibref is compared with the integers
    100 and 700 - confirm that callers do not pass the strings '100'/'700'.

    @param bibrefs: pairs (table, ref)
    @type bibrefs: iterable of tuple

    @return: iterator over (table, id_bibxxx, id_bibrec) rows
    '''
    second = lambda x: x[1]

    bib10x_s = list_2_SQL_str((ref for ref in bibrefs if ref[0] == 100), second)
    bib70x_s = list_2_SQL_str((ref for ref in bibrefs if ref[0] == 700), second)

    valid_recs = set(get_all_valid_bibrecs())

    # An empty selection is rendered as '()', which cannot be used
    # in an "in" clause.
    sig10x = ()
    if bib10x_s != '()':
        sig10x = run_sql("select 100, id_bibxxx, id_bibrec "
                         "from bibrec_bib10x "
                         "where id_bibxxx in %s"
                         % (bib10x_s,))

    sig70x = ()
    if bib70x_s != '()':
        sig70x = run_sql("select 700, id_bibxxx, id_bibrec "
                         "from bibrec_bib70x "
                         "where id_bibxxx in %s"
                         % (bib70x_s,))

    return (sig for sig in chain(set(sig10x), set(sig70x))
            if sig[2] in valid_recs)
def get_all_valid_bibrecs():
    '''
    Search for the records which belong to at least one of the
    collections listed in bconfig.LIMIT_TO_COLLECTIONS.

    @return: the result of perform_request_search for the pattern
        980__a:"<collection>" or 980__a:"<collection>" ...
    '''
    clauses = ['980__a:"%s"' % collection
               for collection in bconfig.LIMIT_TO_COLLECTIONS]
    collection_restriction_pattern = " or ".join(clauses)
    return perform_request_search(p=collection_restriction_pattern, rg=0)
def get_coauthor_pids(pid, exclude_bibrecs=None):
    '''
    Find the personids which share bibrecs with the given pid, optionally
    leaving a given set of bibrecs out of the comparison.

    @param pid: person id
    @param exclude_bibrecs: bibrecs to ignore
    @type exclude_bibrecs: iterable or None

    @return: list of (personid, number of shared bibrecs), the persons
        sharing the most bibrecs first
    '''
    papers = get_person_bibrecs(pid)
    if exclude_bibrecs:
        papers = set(papers) - set(exclude_bibrecs)

    if not papers:
        return []

    rows = run_sql("select personid,bibrec from aidPERSONIDPAPERS "
                   "where bibrec in %s and flag > -2"
                   % list_2_SQL_str(papers))

    # A person is counted once per paper, whatever the number of
    # signatures held on it.
    pid_rec_pairs = set((int(row[0]), int(row[1])) for row in rows)

    shared = {}
    for coauthor, _ in pid_rec_pairs:
        shared[coauthor] = shared.get(coauthor, 0) + 1

    # Ascending personid first; the stable sort on the counter then keeps
    # that order among the persons sharing the same number of papers.
    ranking = [(coauthor, shared[coauthor])
               for coauthor in sorted(shared) if coauthor != pid]
    ranking.sort(key=itemgetter(1), reverse=True)
    return ranking
def get_doi_from_rec(recid):
    """
    Returns the doi of the paper like str if found.
    Otherwise returns None.

    0247 $2 DOI $a id

    A field qualifies when it holds both a 0247_2 subfield with value DOI
    and a non empty 0247_a subfield; the value of the latter is returned.

    @param recid: record id
    @return: the doi, or None
    """
    idx = run_sql("SELECT id_bibxxx, field_number FROM bibrec_bib02x WHERE id_bibrec = %s",
                  (recid,))
    if not idx:
        return None

    doi_id_s = list_2_SQL_str(idx, lambda x: x[0])
    doi = run_sql("SELECT id, tag, value FROM bib02x WHERE id in %s " % doi_id_s)
    if not doi:
        return None

    # id -> (tag, value)
    doi_dict = dict((x[0], x[1:]) for x in doi)

    # groupby only merges adjacent rows: the rows have to be sorted by field
    # number first, otherwise the subfields of one field may end up in
    # different groups and the DOI is missed.
    by_field = groupby(sorted(idx, key=itemgetter(1)), itemgetter(1))
    for _, rows in by_field:
        found = False
        code = None
        for el in [row[0] for row in rows]:
            tag, value = doi_dict[el][0], doi_dict[el][1]
            if tag == '0247_2' and value == 'DOI':
                found = True
            elif tag == '0247_a':
                code = value
        if found and code:
            return code

    return None
def export_person(person_id):
    '''
    Collect the information known about a person in a structure of nested
    dictionaries whose leaves are tuples of strings (or tuples of
    dictionaries for repeatable entries), like:

    {'names': {'full_name': ('namestring',), ...},
     'records': {'record': ({'INSPIRE-record-id': ('1',), ...}, ...)},
     'identifiers': {'INSPIRE_person_ID': ('1',), ...},
     'profile_page': {'INSPIRE_profile_page': ('url',)}}

    @param person_id: person id
    @return: defaultdict of defaultdicts
    '''
    person_info = defaultdict(defaultdict)

    # Names: the variant with the highest number of first names is
    # exported as the full name.
    full_names = get_person_db_names_set(person_id)
    if full_names:
        splitted_names = []
        for db_name in full_names:
            parts = split_name_parts(db_name[0])
            # The number of first names is appended as fifth element.
            splitted_names.append(parts + [len(parts[2])])

        max_first_names = max([parts[4] for parts in splitted_names])
        best = [parts for parts in splitted_names
                if parts[4] == max_first_names][0]

        names = person_info['names']
        names['full_name'] = (create_normalized_name(best),)
        names['surname'] = (best[0],)
        if best[2]:
            names['first_names'] = (' '.join(best[2]),)
        names['name_variants'] = ('; '.join([create_normalized_name(parts)
                                             for parts in splitted_names]),)

    # Records
    recids_data = []
    for recid in get_person_bibrecs(person_id):
        recid_dict = defaultdict(defaultdict)
        recid_dict['INSPIRE-record-id'] = (str(recid),)
        recid_dict['INSPIRE-record-url'] = ('%s/record/%s' % (CFG_SITE_URL, str(recid)),)
        rec_doi = get_doi_from_rec(recid)
        if rec_doi:
            recid_dict['DOI'] = (str(rec_doi),)
        recids_data.append(recid_dict)
    person_info['records']['record'] = tuple(recids_data)

    # Identifiers and profile page
    identifiers = person_info['identifiers']
    identifiers['INSPIRE_person_ID'] = (str(person_id),)

    canonical_names = get_canonical_names_by_pid(person_id)
    if canonical_names:
        canonical_name = canonical_names[0][0]
        identifiers['INSPIRE_canonical_name'] = (str(canonical_name),)
        profile_key = canonical_name
    else:
        # No canonical name: the profile page is addressed by person id.
        profile_key = str(person_id)
    person_info['profile_page']['INSPIRE_profile_page'] = ('%s/author/%s' % (CFG_SITE_URL, profile_key),)

    orcids = get_orcids_by_pids(person_id)
    if orcids:
        identifiers['ORCID'] = tuple(str(row[0]) for row in orcids)

    inspire_ids = get_inspire_ids_by_pids(person_id)
    if inspire_ids:
        identifiers['INSPIREID'] = tuple(str(row[0]) for row in inspire_ids)

    return person_info
def export_person_to_foaf(person_id):
    '''
    Exports to foaf xml the structure of dictionaries and tuples of
    strings returned by export_person.

    @param person_id: person id
    @return: the xml built with X, rooted in a person element
    '''
    def render(val, indent=0):
        # Strings are the leaves of the structure.
        if isinstance(val, str):
            return str(X.escaper(val))

        if not isinstance(val, dict):
            raise Exception('WHAT THE HELL DID WE GET HERE? %s' % str(val))

        pieces = []
        for key, item in val.iteritems():
            tag = str(key)
            if isinstance(item, tuple):
                # One element per member of the tuple.
                for member in item:
                    pieces.append(X[tag](indent=indent, body=render(member)))
            else:
                pieces.append(X[tag](indent=indent,
                                     body=render(item, indent=indent + 1)))
        return ''.join(pieces)

    infodict = export_person(person_id)
    return X['person'](body=render(infodict, indent=1))
Event Timeline
Log In to Comment