diff --git a/modules/bibauthorid/lib/bibauthorid_dbinterface.py b/modules/bibauthorid/lib/bibauthorid_dbinterface.py index 9ac2b1396..020430db8 100644 --- a/modules/bibauthorid/lib/bibauthorid_dbinterface.py +++ b/modules/bibauthorid/lib/bibauthorid_dbinterface.py @@ -1,2570 +1,2590 @@ # -*- coding: utf-8 -*- ## ## This file is part of Invenio. ## Copyright (C) 2011, 2012 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. ''' bibauthorid_bdinterface This is the only file in bibauthorid which should use the data base. It should have an interface for all other files in the module. ''' import bibauthorid_config as bconfig import numpy import cPickle from cPickle import UnpicklingError import os import gc from itertools import groupby, count, ifilter, chain, imap from operator import itemgetter from search_engine import perform_request_search from access_control_engine import acc_authorize_action from bibauthorid_name_utils import split_name_parts from bibauthorid_name_utils import create_canonical_name from bibauthorid_name_utils import create_normalized_name from bibauthorid_general_utils import bibauthor_print from bibauthorid_general_utils import update_status \ , update_status_final from dbquery import run_sql -from collections import defaultdict +try: + from collections import defaultdict +except ImportError: + class defaultdict(dict): + def __init__(self, default_factory, *args, **kwargs): + super(defaultdict, self).__init__(*args, **kwargs) + self.default_factory = default_factory + + def __missing__(self, key): + try: + self[key] = self.default_factory() + except TypeError: + raise KeyError("Missing key %s" % (key,)) + else: + return self[key] + + def __getitem__(self, key): + try: + return super(defaultdict, self).__getitem__(key) + except KeyError: + return self.__missing__(key) MARC_100_700_CACHE = None COLLECT_INSPIRE_ID = bconfig.COLLECT_EXTERNAL_ID_INSPIREID def get_sql_time(): ''' Returns the time acoarding to the database. The type is datetime.datetime. ''' return run_sql("select now()")[0][0] def set_personid_row(person_id, tag, value, opt1=0, opt2=0, opt3=""): ''' Inserts data and the additional options of a person by a given personid and tag. ''' run_sql("INSERT INTO aidPERSONIDDATA " "(`personid`, `tag`, `data`, `opt1`, `opt2`, `opt3`) " "VALUES (%s, %s, %s, %s, %s, %s)", (person_id, tag, value, opt1, opt2, opt3)) def get_personid_row(person_id, tag): ''' Returns all the records associated to a person and a tag. @param person_id: id of the person to read the attribute from @type person_id: int @param tag: the tag to read. 
    @type tag: string
    @return: the data associated with the person
    @rtype: tuple of tuples
    '''
    return run_sql("SELECT data, opt1, opt2, opt3 "
                   "FROM aidPERSONIDDATA "
                   "WHERE personid = %s AND tag = %s",
                   (person_id, tag))

def del_personid_row(tag, person_id=None, value=None):
    '''
    Deletes the rows matching the given tag, restricted to a certain
    person and/or value whenever those are given.
    @param person_id: ID of the person
    @type person_id: int
    @param tag: tag to be deleted
    @type tag: string
    @param value: value of the rows to be deleted
    @type value: string
    '''
    if person_id:
        if value:
            run_sql("delete from aidPERSONIDDATA where personid=%s and tag=%s and data=%s",
                    (person_id, tag, value,))
        else:
            run_sql("delete from aidPERSONIDDATA where personid=%s and tag=%s",
                    (person_id, tag,))
    else:
        if value:
            run_sql("delete from aidPERSONIDDATA where tag=%s and data=%s",
                    (tag, value,))
        else:
            run_sql("delete from aidPERSONIDDATA where tag=%s",
                    (tag,))

def get_all_papers_of_pids(personid_list):
    '''
    Gets all papers of the authors in a given list and sorts the results
    by bibrefrec.
    @param personid_list: list with the authors.
    @type personid_list: iterable of integers.
    '''
    if personid_list:
        plist = list_2_SQL_str(personid_list)
        paps = run_sql("select personid, bibref_table, bibref_value, bibrec, flag "
                       "from aidPERSONIDPAPERS "
                       "where personid in %s " % plist)
        inner = set(row[1:4] for row in paps if row[4] > -2)
        return (x for x in paps if x[1:4] in inner)
    return ()

def del_person_not_manually_claimed_papers(pid):
    '''
    Deletes papers from a person which have not been manually claimed.
    '''
    run_sql("delete from aidPERSONIDPAPERS "
            "where (flag <> '-2' and flag <> '2') and personid=%s", (pid,))

def get_personid_from_uid(uid):
    '''
    Returns the personID associated with the provided uid.
    The second member of the returned tuple is True if the personID is
    already associated with the user, False otherwise.
    @param uid: userID
    @type uid: ((int,),)
    '''
    pid = run_sql("select personid from aidPERSONIDDATA where tag=%s and data=%s",
                  ('uid', str(uid[0][0])))
    if len(pid) == 1:
        return (pid[0], True)
    else:
        return ([-1], False)

def get_uid_from_personid(pid):
    uid = run_sql("select data from aidPERSONIDDATA where tag='uid' and personid = %s", (pid,))
    if uid:
        return uid[0][0]
    else:
        return None

def get_new_personid():
    pids = (run_sql("select max(personid) from aidPERSONIDDATA")[0][0],
            run_sql("select max(personid) from aidPERSONIDPAPERS")[0][0])
    pids = tuple(int(p) for p in pids if p is not None)
    if len(pids) == 2:
        return max(*pids) + 1
    elif len(pids) == 1:
        return pids[0] + 1
    else:
        return 0

def get_existing_personids(with_papers_only=False):
    if not with_papers_only:
        try:
            pids_data = set(map(int, zip(*run_sql("select distinct personid from aidPERSONIDDATA"))[0]))
        except IndexError:
            pids_data = set()
    else:
        pids_data = set()
    try:
        pids_pap = set(map(int, zip(*run_sql("select distinct personid from aidPERSONIDPAPERS"))[0]))
    except IndexError:
        pids_pap = set()
    return pids_data | pids_pap

def get_existing_result_clusters():
    return run_sql("select distinct personid from aidRESULTS")

def create_new_person(uid= -1, uid_is_owner=False):
    '''
    Create a new person. Set the uid as owner if requested.
    '''
    pid = get_new_personid()
    if uid_is_owner:
        set_personid_row(pid, 'uid', str(uid))
    else:
        set_personid_row(pid, 'user-created', str(uid))
    return pid

def create_new_person_from_uid(uid):
    return create_new_person(uid, uid_is_owner=True)
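# --- Illustrative sketch (editor's addition, not part of the original
# module; the helper name is hypothetical and nothing below uses it).
# Signatures travel through this file in two shapes: as triples such as
# ('100', 12, 144), meaning (bibref_table, bibref_value, bibrec), and as
# strings such as "100:12,144" coming from the claim interface. The module
# splits the string form inline; this just documents the convention.
def _example_parse_bibrefrec(bibrefrec):
    '''
    Parses an illustrative "table:ref,rec" string, e.g. "100:12,144",
    into the ('100', 12, 144) triple used throughout this module.
    '''
    head, rec = bibrefrec.split(',')
    table, ref = head.split(':')
    return (table, int(ref), int(rec))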
def new_person_from_signature(sig, name=None):
    '''
    Creates a new person from a signature.
    '''
    pid = get_new_personid()
    add_signature(sig, name, pid)
    return pid

def add_signature(sig, name, pid):
    '''
    Inserts a signature in personid.
    '''
    if not name:
        name = get_name_by_bibrecref(sig)
    name = create_normalized_name(split_name_parts(name))
    run_sql("INSERT INTO aidPERSONIDPAPERS "
            "(personid, bibref_table, bibref_value, bibrec, name) "
            "VALUES (%s, %s, %s, %s, %s)"
            , (pid, str(sig[0]), sig[1], sig[2], name))

def move_signature(sig, pid):
    '''
    Reassigns a signature to the given person, unless the signature has
    been manually claimed or rejected (flag 2 or -2).
    '''
    run_sql("update aidPERSONIDPAPERS set personid=%s "
            "where bibref_table like %s and bibref_value=%s "
            "and bibrec=%s and flag <> 2 and flag <> -2",
            (pid,) + sig)

def find_conflicts(sig, pid):
    """
    Returns all non-rejected signatures of the given person which belong
    to the same paper as sig.
    """
    return run_sql("select bibref_table, bibref_value, bibrec, flag "
                   "from aidPERSONIDPAPERS where "
                   "personid = %s and "
                   "bibrec = %s and "
                   "flag <> -2"
                   , (pid, sig[2]))

def update_request_ticket(person_id, tag_data_tuple, ticket_id=None):
    '''
    Creates / updates a request ticket for a personID
    @param: personid int
    @param: tag_data_tuples 'image' of the ticket:
        (('paper', '700:316,10'), ('owner', 'admin'), ('external_id', 'ticket_18'))
    @return: ticketid
    '''
    #tags: rt_owner (the owner of the ticket, associating the rt_number to the transaction)
    #      rt_external_id
    #      rt_paper_confirm, rt_paper_reject, rt_paper_forget, rt_name, rt_email, rt_whatever
    #flag: rt_number
    if not ticket_id:
        last_id = run_sql("select max(opt1) from aidPERSONIDDATA where personid=%s and tag like %s",
                          (str(person_id), 'rt_%'))[0][0]
        if last_id:
            ticket_id = last_id + 1
        else:
            ticket_id = 1
    else:
        delete_request_ticket(person_id, ticket_id)
    for d in tag_data_tuple:
        run_sql("insert into aidPERSONIDDATA (personid, tag, data, opt1) "
                "values (%s,%s,%s,%s)",
                (str(person_id), 'rt_' + str(d[0]), str(d[1]), str(ticket_id)))
    return ticket_id

def delete_request_ticket(person_id, ticket_id=None):
    '''
    Removes a ticket from a person_id.
    If ticket_id is not provided, removes all the tickets pending on a person.
    '''
    if ticket_id:
        run_sql("delete from aidPERSONIDDATA where personid=%s and tag like %s and opt1 =%s",
                (str(person_id), 'rt_%', str(ticket_id)))
    else:
        run_sql("delete from aidPERSONIDDATA where personid=%s and tag like %s",
                (str(person_id), 'rt_%'))

def get_all_personids_by_name(regexpr):
    return run_sql("select personid, name "
                   "from aidPERSONIDPAPERS "
                   "where name like %s", (regexpr,))

def get_personids_by_canonical_name(target):
    pid = run_sql("select personid from aidPERSONIDDATA where "
                  "tag='canonical_name' and data like %s", (target,))
    if pid:
        return run_sql("select personid, name from aidPERSONIDPAPERS "
                       "where personid=%s", (pid[0][0],))
    else:
        return []

def get_bibref_modification_status(bibref):
    '''
    Determines if a record attached to a person has been touched by a human
    by checking the flag.
    @param bibref: The paper identifier to be checked (e.g. "100:12,144")
    @type bibref: string
    returns [bool:human_modified, int:lcul]
    '''
    if not bibref:
        raise ValueError("A bibref is expected!")
    head, rec = bibref.split(',')
    table, ref = head.split(':')
    flags = run_sql("SELECT flag, lcul FROM aidPERSONIDPAPERS WHERE "
                    "bibref_table = %s and bibref_value = %s and bibrec = %s"
                    , (table, ref, rec))
    if flags:
        return flags[0]
    else:
        return (False, 0)
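# Usage sketch (editor's addition; the identifier is made up): the
# (flag, lcul) pair returned by get_bibref_modification_status encodes
# how a signature was last touched:
#
#   flag, lcul = get_bibref_modification_status("100:12,144")
#   # flag == 2  -> manually claimed, flag == -2 -> manually rejected,
#   # -2 < flag < 2 -> not touched by a human; lcul is the user level
#   # of the user who made the claim.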
def get_canonical_id_from_personid(pid):
    '''
    Finds the person id canonical name (e.g. Ellis_J_R_1)
    @param pid
    @type int
    @return: sql result of the request
    @rtype: tuple of tuple
    '''
    return run_sql("SELECT data FROM aidPERSONIDDATA WHERE "
                   "tag = %s AND personid = %s", ('canonical_name', str(pid)))

def get_papers_status(paper):
    '''
    Gets the personID and flag associated to papers
    @param papers: list of papers
    @type papers: '100:7531,9024'
    @return: (('data','personID','flag',),)
    @rtype: tuple of tuples
    '''
    head, bibrec = paper.split(',')
    table, bibref = head.split(':')
    rets = run_sql("select PersonID, flag "
                   "from aidPERSONIDPAPERS "
                   "where bibref_table = %s "
                   "and bibref_value = %s "
                   "and bibrec = %s",
                   (table, bibref, bibrec))
    return [[paper] + list(x) for x in rets]

def get_persons_from_recids(recids, return_alt_names=False,
                            return_all_person_papers=False):
    rec_2_pid = dict()
    pid_2_data = dict()
    all_pids = set()

    def get_canonical_name(pid):
        return run_sql("SELECT data "
                       "FROM aidPERSONIDDATA "
                       "WHERE tag = %s "
                       "AND personid = %s",
                       ('canonical_name', pid))

    for rec in recids:
        pids = run_sql("SELECT personid "
                       "FROM aidPERSONIDPAPERS "
                       "WHERE bibrec = %s "
                       " and flag > -2 ",
                       (rec,))
        # for some reason python's set is faster than a mysql distinct
        pids = set(p[0] for p in pids)
        all_pids |= pids
        rec_2_pid[rec] = list(pids)

    for pid in all_pids:
        pid_data = {}
        canonical = get_canonical_name(pid)
        #We can suppose that this person didn't have a chance to get a
        #canonical name yet because it was not fully processed by its
        #creator. Anyway it's safe to try to create one before failing
        #miserably.
        if not canonical:
            update_personID_canonical_names([pid])
            canonical = get_canonical_name(pid)
        #assert len(canonical) == 1
        #This condition cannot hold in case claims or update daemons are run
        #in parallel with this, as it can happen that a person with papers
        #exists for which a canonical name has not been computed yet. Hence,
        #it will be indexed next time, so it learns.
        #Each person should have at most one canonical name, so:
        assert len(canonical) <= 1, "A person cannot have more than one canonical name"

        if len(canonical) == 1:
            pid_data = {'canonical_id' : canonical[0][0]}

        if return_alt_names:
            names = run_sql("SELECT name "
                            "FROM aidPERSONIDPAPERS "
                            "WHERE personid = %s "
                            " and flag > -2 ",
                            (pid,))
            names = set(n[0] for n in names)
            pid_data['alternatative_names'] = list(names)

        if return_all_person_papers:
            recs = run_sql("SELECT bibrec "
                           "FROM aidPERSONIDPAPERS "
                           "WHERE personid = %s "
                           " and flag > -2 ",
                           (pid,))
            recs = set(r[0] for r in recs)
            pid_data['person_records'] = list(recs)

        pid_2_data[pid] = pid_data

    return (rec_2_pid, pid_2_data)

def get_person_db_names_count(pid, sort_by_count=True):
    '''
    Returns the set of name strings and count associated to a person id.
    The name strings are as found in the database.
@param pid: ID of the person @type pid: ('2',) ''' id_2_count = run_sql("select bibref_table, bibref_value " "from aidPERSONIDPAPERS " "where personid = %s " "and flag > -2", (pid,)) ref100 = [refid[1] for refid in id_2_count if refid[0] == '100'] ref700 = [refid[1] for refid in id_2_count if refid[0] == '700'] ref100_count = dict((key, len(list(data))) for key, data in groupby(sorted(ref100))) ref700_count = dict((key, len(list(data))) for key, data in groupby(sorted(ref700))) if ref100: ref100_s = list_2_SQL_str(ref100, str) id100_2_str = run_sql("select id, value " "from bib10x " "where id in %s" % ref100_s) else: id100_2_str = tuple() if ref700: ref700_s = list_2_SQL_str(ref700, str) id700_2_str = run_sql("select id, value " "from bib70x " "where id in %s" % ref700_s) else: id700_2_str = tuple() ret100 = [(name, ref100_count[refid]) for refid, name in id100_2_str] ret700 = [(name, ref700_count[refid]) for refid, name in id700_2_str] ret = ret100 + ret700 if sort_by_count: ret = sorted(ret, key=itemgetter(1), reverse=True) return ret def get_person_id_from_canonical_id(canonical_id): ''' Finds the person id from a canonical name (e.g. Ellis_J_R_1) @param canonical_id: the canonical ID @type canonical_id: string @return: sql result of the request @rtype: tuple of tuple ''' return run_sql("SELECT personid FROM aidPERSONIDDATA WHERE " "tag='canonical_name' AND data = %s", (canonical_id,)) def get_person_names_count(pid): ''' Returns the set of name strings and count associated to a person id @param pid: ID of the person @type pid: ('2',) @param value: value to be written for the tag @type value: string ''' return run_sql("select name, count(name) from aidPERSONIDPAPERS where " "personid=%s and flag > -2 group by name", (pid,)) def get_person_db_names_set(pid): ''' Returns the set of db_name strings associated to a person id @param pid: ID of the person @type pid: 2 ''' names = get_person_db_names_count(pid) if names: return zip(set(zip(*names)[0])) else: return [] def get_personids_from_bibrec(bibrec): ''' Returns all the personids associated to a bibrec. 
''' pids = run_sql("select distinct personid from aidPERSONIDPAPERS where bibrec=%s and flag > -2", (bibrec,)) if pids: return zip(*pids)[0] else: return [] def get_personids_and_papers_from_bibrecs(bibrecs, limit_by_name=None): ''' ''' if not bibrecs: return [] else: bibrecs = list_2_SQL_str(bibrecs) if limit_by_name: try: surname = split_name_parts(limit_by_name)[0] except IndexError: surname = None else: surname = None if not surname: data = run_sql("select personid,bibrec from aidPERSONIDPAPERS where bibrec in %s" % (bibrecs,)) else: surname = split_name_parts(limit_by_name)[0] data = run_sql(("select personid,bibrec from aidPERSONIDPAPERS where bibrec in %s " "and name like " % bibrecs) + ' %s ', (surname + '%',)) pidlist = [(k, set([s[1] for s in d])) for k, d in groupby(sorted(data, key=lambda x:x[0]), key=lambda x:x[0])] pidlist = sorted(pidlist, key=lambda x:len(x[1]), reverse=True) return pidlist def get_person_bibrecs(pid): ''' Returns bibrecs associated with a personid @param pid: integer personid @return [bibrec1,...,bibrecN] ''' papers = run_sql("select bibrec from aidPERSONIDPAPERS where personid=%s and flag > -2", (str(pid),)) if papers: return list(set(zip(*papers)[0])) else: return [] def get_person_papers(pid, flag, show_author_name=False, show_title=False, show_rt_status=False, show_affiliations=False, show_date=False, show_experiment=False): query = "bibref_table, bibref_value, bibrec, flag" if show_author_name: query += ", name" all_papers = run_sql("SELECT " + query + " " "FROM aidPERSONIDPAPERS " "WHERE personid = %s " "AND flag >= %s", (pid, flag)) def format_paper(paper): bibrefrec = "%s:%d,%d" % paper[:3] ret = {'data' : bibrefrec, 'flag' : paper[3] } if show_author_name: ret['authorname'] = paper[4] if show_title: ret['title'] = "" title = get_title_from_rec(paper[2]) if title: ret['title'] = (title,) if show_rt_status: rt_count = run_sql("SELECT count(personid) " "FROM aidPERSONIDDATA WHERE " "tag like 'rt_%%' and data = %s" , (bibrefrec,)) ret['rt_status'] = (rt_count[0][0] > 0) if show_affiliations: tag = '%s__u' % paper[0] ret['affiliation'] = get_grouped_records(paper[:3], tag)[tag] if show_date: ret['date'] = [] date_id = run_sql("SELECT id_bibxxx " "FROM bibrec_bib26x " "WHERE id_bibrec = %s " , (paper[2],)) if date_id: date_id_s = list_2_SQL_str(date_id, lambda x: x[0]) date = run_sql("SELECT value " "FROM bib26x " "WHERE id in %s " "AND tag = %s" % (date_id_s, "'269__c'")) if date: ret['date'] = zip(*date)[0] if show_experiment: ret['experiment'] = [] experiment_id = run_sql("SELECT id_bibxxx " "FROM bibrec_bib69x " "WHERE id_bibrec = %s " , (paper[2],)) if experiment_id: experiment_id_s = list_2_SQL_str(experiment_id, lambda x: x[0]) experiment = run_sql("SELECT value " "FROM bib69x " "WHERE id in %s " "AND tag = %s" % (experiment_id_s, "'693__e'")) if experiment: ret['experiment'] = zip(*experiment)[0] return ret return [format_paper(paper) for paper in all_papers] def get_persons_with_open_tickets_list(): ''' Finds all the persons with open tickets and returns pids and count of tickets @return: [[pid, ticket_count]] ''' return run_sql("select personid, count(distinct opt1) from " "aidPERSONIDDATA where tag like 'rt_%' group by personid") def get_request_ticket(person_id, ticket_id=None): ''' Retrieves one or many requests tickets from a person @param: person_id: person id integer @param: matching: couple of values to match ('tag', 'value') @param: ticket_id: ticket id (flag) value @returns: [[[('tag', 'value')], ticket_id]] [[[('a', 'va'), ('b', 
'vb')], 1L], [[('b', 'daOEIaoe'), ('a', 'caaoOUIe')], 2L]] ''' if ticket_id: tstr = " and opt1='%s' " % ticket_id else: tstr = " " tickets = run_sql("select tag,data,opt1 from aidPERSONIDDATA where personid=%s and " " tag like 'rt_%%' " + tstr , (person_id,)) return [[[(s[0][3:], s[1]) for s in d], k] for k, d in groupby(sorted(tickets, key=lambda k: k[2]), key=lambda k: k[2])] def insert_user_log(userinfo, personid, action, tag, value, comment='', transactionid=0, timestamp=None, userid=''): ''' Instert log entries in the user log table. For example of entres look at the table generation script. @param userinfo: username or user identifier @type: string @param personid: personid involved in the transaction @type: longint @param action: action type @type: string @param tag: tag @type: string @param value: value for the transaction @type: string @param comment: optional comment for the transaction @type: string @param transactionid: optional id for the transaction @type: longint @return: the transactionid @rtype: longint ''' if not timestamp: timestamp = run_sql('select now()')[0][0] run_sql('insert into aidUSERINPUTLOG ' '(transactionid,timestamp,userinfo,userid,personid,action,tag,value,comment) values ' '(%s,%s,%s,%s,%s,%s,%s,%s,%s)', (transactionid, timestamp, userinfo, userid, personid, action, tag, value, comment)) return transactionid def person_bibref_is_touched_old(pid, bibref): ''' Determines if a record attached to a person has been touched by a human by checking the flag. @param pid: The Person ID of the person to check the assignment from @type pid: int @param bibref: The paper identifier to be checked (e.g. "100:12,144") @type bibref: string ''' bibref, rec = bibref.split(",") table, ref = bibref.split(":") flag = run_sql("SELECT flag " "FROM aidPERSONIDPAPERS " "WHERE personid = %s " "AND bibref_table = %s " "AND bibref_value = %s " "AND bibrec = %s" , (pid, table, ref, rec)) try: flag = flag[0][0] except (IndexError): return False if not flag: return False elif -2 < flag < 2: return False else: return True def confirm_papers_to_person(pid, papers, user_level=0): ''' Confirms the relationship between pid and paper, as from user input. @param pid: id of the person @type pid: ('2',) @param papers: list of papers to confirm @type papers: (('100:7531,9024',),) ''' new_pid = get_new_personid() pids_to_update = set(pid) for p in papers: bibref, rec = p[0].split(",") rec = int(rec) table, ref = bibref.split(":") ref = int(ref) sig = (table, ref, rec) paps = run_sql("select bibref_table, bibref_value, bibrec " "from aidPERSONIDPAPERS " "where personid=%s " "and bibrec=%s " "and flag > -2" , (pid[0], rec)) rej_paps = run_sql("select bibref_table, bibref_value, bibrec " "from aidPERSONIDPAPERS " "where personid=%s " "and bibrec=%s " "and flag = -2" , (pid[0], rec)) assert paps or rej_paps assert len(paps) < 2 if paps and sig != paps[0]: pids_to_update.add(new_pid) move_signature(paps[0], new_pid) run_sql("delete from aidPERSONIDPAPERS where bibref_table like %s and " " bibref_value = %s and bibrec=%s" , sig) add_signature(sig, None, pid[0]) run_sql("update aidPERSONIDPAPERS " "set personid = %s " ", flag = %s " ", lcul = %s " "where bibref_table = %s " "and bibref_value = %s " "and bibrec = %s" , (pid[0], '2', user_level, table, ref, rec)) update_personID_canonical_names(pids_to_update) def reject_papers_from_person(pid, papers, user_level=0): ''' Confirms the negative relationship between pid and paper, as from user input. 
@param pid: id of the person @type pid: integer @param papers: list of papers to confirm @type papers: ('100:7531,9024',) ''' new_pid = get_new_personid() pids_to_update = set([pid]) for p in papers: brr, rec = p.split(",") table, ref = brr.split(':') sig = (table, ref, rec) records = personid_name_from_signature(sig) assert(records) fpid, name = records[0] if fpid == pid: run_sql("INSERT INTO aidPERSONIDPAPERS " "(personid, bibref_table, bibref_value, bibrec, name, flag, lcul) " "VALUES (%s, %s, %s, %s, %s, %s, %s)" , (pid, table, ref, rec, name, -2, user_level)) move_signature(sig, new_pid) pids_to_update.add(new_pid) update_personID_canonical_names(pids_to_update) def reset_papers_flag(pid, papers): ''' Resets the flag associated to the papers to '0' @param papers: list of papers to confirm @type papers: (('100:7531,9024',),) ''' for p in papers: bibref, rec = p[0].split(",") table, ref = bibref.split(":") sig = (table, ref, rec) paps = run_sql("select bibref_table, bibref_value, bibrec " "from aidPERSONIDPAPERS " "where personid=%s " "and bibrec=%s " , (pid[0], rec)) rej_paps = run_sql("select bibref_table, bibref_value, bibrec " "from aidPERSONIDPAPERS " "where personid=%s " "and bibrec=%s " "and flag = -2" , (pid[0], rec)) assert paps or rej_paps assert len(paps) < 2 run_sql("delete from aidPERSONIDPAPERS where bibref_table like %s and " "bibref_value = %s and bibrec = %s", (sig)) add_signature(sig, None, pid[0]) def user_can_modify_data(uid, pid): ''' Return True if the uid can modify data of this personID, false otherwise. @param uid: the user id @type: int @param pid: the person id @type: int @return: can user mofidfy data? @rtype: boolean ''' pid_uid = run_sql("select data from aidPERSONIDDATA where tag = %s" " and personid = %s", ('uid', str(pid))) if len(pid_uid) >= 1 and str(uid) == str(pid_uid[0][0]): rights = bconfig.CLAIMPAPER_CHANGE_OWN_DATA else: rights = bconfig.CLAIMPAPER_CHANGE_OTHERS_DATA return acc_authorize_action(uid, rights)[0] == 0 def get_possible_bibrecref(names, bibrec, always_match=False): ''' Returns a list of bibrefs for which the surname is matching @param names: list of names strings @param bibrec: bibrec number @param always_match: match with all the names (full bibrefs list) ''' splitted_names = [split_name_parts(n) for n in names] bibrec_names_100 = run_sql("select o.id, o.value from bib10x o, " "(select i.id_bibxxx as iid from bibrec_bib10x i " "where id_bibrec=%s) as dummy " "where o.tag='100__a' AND o.id = dummy.iid", (str(bibrec),)) bibrec_names_700 = run_sql("select o.id, o.value from bib70x o, " "(select i.id_bibxxx as iid from bibrec_bib70x i " "where id_bibrec=%s) as dummy " "where o.tag='700__a' AND o.id = dummy.iid", (str(bibrec),)) # bibrec_names_100 = run_sql("select id,value from bib10x where tag='100__a' and id in " # "(select id_bibxxx from bibrec_bib10x where id_bibrec=%s)", # (str(bibrec),)) # bibrec_names_700 = run_sql("select id,value from bib70x where tag='700__a' and id in " # "(select id_bibxxx from bibrec_bib70x where id_bibrec=%s)", # (str(bibrec),)) bibreflist = [] for b in bibrec_names_100: spb = split_name_parts(b[1]) for n in splitted_names: if (n[0].lower() == spb[0].lower()) or always_match: if ['100:' + str(b[0]), b[1]] not in bibreflist: bibreflist.append(['100:' + str(b[0]), b[1]]) for b in bibrec_names_700: spb = split_name_parts(b[1]) for n in splitted_names: if (n[0].lower() == spb[0].lower()) or always_match: if ['700:' + str(b[0]), b[1]] not in bibreflist: bibreflist.append(['700:' + str(b[0]), b[1]]) return 
bibreflist def user_can_modify_paper(uid, paper): ''' Return True if the uid can modify this paper, false otherwise. If the paper is assigned more then one time (from algorithms) consider the most privileged assignment. @param uid: the user id @type: int @param paper: the paper bibref,bibrec pair x00:1234,4321 @type: str @return: can user mofidfy paper attribution? @rtype: boolean ''' bibref, rec = paper.split(",") table, ref = bibref.split(":") prow = run_sql("select personid, lcul from aidPERSONIDPAPERS " "where bibref_table = %s and bibref_value = %s and bibrec = %s " "order by lcul desc limit 0,1", (table, ref, rec)) if len(prow) == 0: return ((acc_authorize_action(uid, bconfig.CLAIMPAPER_CLAIM_OWN_PAPERS)[0] == 0) or (acc_authorize_action(uid, bconfig.CLAIMPAPER_CLAIM_OTHERS_PAPERS)[0] == 0)) min_req_acc_n = int(prow[0][1]) req_acc = resolve_paper_access_right(bconfig.CLAIMPAPER_CLAIM_OWN_PAPERS) pid_uid = run_sql("select data from aidPERSONIDDATA where tag = %s and personid = %s", ('uid', str(prow[0][0]))) if len(pid_uid) > 0: if (str(pid_uid[0][0]) != str(uid)) and min_req_acc_n > 0: req_acc = resolve_paper_access_right(bconfig.CLAIMPAPER_CLAIM_OTHERS_PAPERS) if min_req_acc_n < req_acc: min_req_acc_n = req_acc min_req_acc = resolve_paper_access_right(min_req_acc_n) return (acc_authorize_action(uid, min_req_acc)[0] == 0) and (resolve_paper_access_right(min_req_acc) >= min_req_acc_n) def resolve_paper_access_right(acc): ''' Given a string or an integer, resolves to the corresponding integer or string If asked for a wrong/not present parameter falls back to the minimum privilege. ''' access_dict = {bconfig.CLAIMPAPER_VIEW_PID_UNIVERSE: 0, bconfig.CLAIMPAPER_CLAIM_OWN_PAPERS: 25, bconfig.CLAIMPAPER_CLAIM_OTHERS_PAPERS: 50} if isinstance(acc, str): try: return access_dict[acc] except: return 0 inverse_dict = dict([[v, k] for k, v in access_dict.items()]) lower_accs = [a for a in inverse_dict.keys() if a <= acc] try: return inverse_dict[max(lower_accs)] except: return bconfig.CLAIMPAPER_VIEW_PID_UNIVERSE def get_recently_modified_record_ids(date): ''' Returns the bibrecs with modification date more recent then date. @param date: date ''' touched_papers = frozenset(p[0] for p in run_sql( "select id from bibrec " "where modification_date > %s" , (date,))) return touched_papers & frozenset(get_all_valid_bibrecs()) def filter_modified_record_ids(bibrecs, date): ''' Returns the bibrecs with modification date before the date. 
@param date: date ''' return ifilter( lambda x: run_sql("select count(*) from bibrec " "where id = %s and " "modification_date < %s" , (x[2], date))[0][0] , bibrecs) def get_cached_author_page(pageparam): ''' Return cached authorpage @param: pageparam (int personid) @return (id, 'authorpage_cache', personid, authorpage_html, date_cached) ''' #TABLE: id, tag, identifier, data, date caches = run_sql("select id, object_name, object_key, object_value, last_updated \ from aidCACHE \ where object_name='authorpage_cache' and object_key=%s", (str(pageparam),)) if len(caches) >= 1: return caches[0] else: return [] def delete_cached_author_page(personid): ''' Deletes from the author page cache the page concerning one person ''' run_sql("delete from aidCACHE where object_name='authorpage_cache' and object_key=%s", (str(personid),)) def update_cached_author_page_timestamp(pageparam): ''' Updates cached author page timestamp @param pageparam: int personid ''' #TABLE: id, tag, identifier, data, date run_sql("update aidCACHE set last_updated=now() where object_name='authorpage_cache' and object_key=%s", (str(pageparam),)) def update_cached_author_page(pageparam, page): ''' Updates cached author page, deleting old caches for same pageparam @param pageparam: int personid @param page: string html authorpage ''' #TABLE: id, tag, identifier, data, date run_sql("delete from aidCACHE where object_name='authorpage_cache' and object_key=%s", (str(pageparam),)) run_sql("insert into aidCACHE values (Null,'authorpage_cache',%s,%s,now())", (str(pageparam), str(page))) def get_user_log(transactionid='', userinfo='', userid='', personID='', action='', tag='', value='', comment='', only_most_recent=False): ''' Get user log table entry matching all the given parameters; all of them are optional. IF no parameters are given retuns the complete log table @param transactionid: id of the transaction @param userinfo: user name or identifier @param personid: id of the person involved @param action: action @param tag: tag @param value: value @param comment: comment ''' sql_query = ('select id,transactionid,timestamp,userinfo,personid,action,tag,value,comment ' + 'from aidUSERINPUTLOG where 1 ') if transactionid: sql_query += ' and transactionid=\'' + str(transactionid) + '\'' if userinfo: sql_query += ' and userinfo=\'' + str(userinfo) + '\'' if userid: sql_query += ' and userid=\'' + str(userid) + '\'' if personID: sql_query += ' and personid=\'' + str(personID) + '\'' if action: sql_query += ' and action=\'' + str(action) + '\'' if tag: sql_query += ' and tag=\'' + str(tag) + '\'' if value: sql_query += ' and value=\'' + str(value) + '\'' if comment: sql_query += ' and comment=\'' + str(comment) + '\'' if only_most_recent: sql_query += ' order by timestamp desc limit 0,1' return run_sql(sql_query) def list_2_SQL_str(items, f=lambda x: x): """ Concatenates all items in items to a sql string using f. @param items: a set of items @param type items: X @param f: a function which transforms each item from items to string @param type f: X:->str @return: "(x1, x2, x3, ... 
xn)" for xi in items @return type: string """ strs = (str(f(x)) for x in items) return "(%s)" % ", ".join(strs) def _get_authors_from_paper_from_db(paper): ''' selects all author bibrefs by a given papers ''' fullbibrefs100 = run_sql("select id_bibxxx from bibrec_bib10x where id_bibrec=%s", (paper,)) if len(fullbibrefs100) > 0: fullbibrefs100str = list_2_SQL_str(fullbibrefs100, lambda x: str(x[0])) return run_sql("select id from bib10x where tag='100__a' and id in %s" % (fullbibrefs100str,)) return tuple() def _get_authors_from_paper_from_cache(paper): ''' selects all author bibrefs by a given papers ''' try: ids = MARC_100_700_CACHE['brb100'][paper]['id'].keys() refs = [i for i in ids if '100__a' in MARC_100_700_CACHE['b100'][i][0]] except KeyError: return tuple() return zip(refs) def get_authors_from_paper(paper): if MARC_100_700_CACHE: if bconfig.DEBUG_CHECKS: assert _get_authors_from_paper_from_cache(paper) == _get_authors_from_paper_from_cache(paper) return _get_authors_from_paper_from_cache(paper) else: return _get_authors_from_paper_from_db(paper) def _get_coauthors_from_paper_from_db(paper): ''' selects all coauthor bibrefs by a given papers ''' fullbibrefs700 = run_sql("select id_bibxxx from bibrec_bib70x where id_bibrec=%s", (paper,)) if len(fullbibrefs700) > 0: fullbibrefs700str = list_2_SQL_str(fullbibrefs700, lambda x: str(x[0])) return run_sql("select id from bib70x where tag='700__a' and id in %s" % (fullbibrefs700str,)) return tuple() def _get_coauthors_from_paper_from_cache(paper): ''' selects all author bibrefs by a given papers ''' try: ids = MARC_100_700_CACHE['brb700'][paper]['id'].keys() refs = [i for i in ids if '700__a' in MARC_100_700_CACHE['b700'][i][0]] except KeyError: return tuple() return zip(refs) def get_coauthors_from_paper(paper): if MARC_100_700_CACHE: if bconfig.DEBUG_CHECKS: assert _get_coauthors_from_paper_from_cache(paper) == _get_coauthors_from_paper_from_db(paper) return _get_coauthors_from_paper_from_cache(paper) else: return _get_coauthors_from_paper_from_db(paper) def get_bibrefrec_subset(table, papers, refs): table = "bibrec_bib%sx" % str(table)[:-1] contents = run_sql("select id_bibrec, id_bibxxx from %s" % table) papers = set(papers) refs = set(refs) # yes, there are duplicates and we must set them return set(ifilter(lambda x: x[0] in papers and x[1] in refs, contents)) def get_deleted_papers(): return run_sql("select o.id_bibrec from bibrec_bib98x o, " "(select i.id as iid from bib98x i " "where value = 'DELETED' " "and tag like '980__a') as dummy " "where o.id_bibxxx = dummy.iid") def add_personID_external_id(personid, external_id_str, value): run_sql("insert into aidPERSONIDDATA (personid,tag,data) values (%s,%s,%s)", (personid, 'extid:%s' % external_id_str, value)) def remove_personID_external_id(personid, external_id_str, value=False): if not value: run_sql("delete from aidPERSONIDDATA where personid=%s and tag=%s", (personid, 'extid:%s' % external_id_str)) else: run_sql("delete from aidPERSONIDDATA where personid=%s and tag=%s and data=%s", (personid, 'extid:%s' % external_id_str, value)) def get_personiID_external_ids(personid): ids = run_sql("select tag,data from aidPERSONIDDATA where personid=%s and tag like 'extid:%%'", (personid,)) extids = {} for i in ids: id_str = i[0].split(':')[1] idd = i[1] try: extids[id_str].append(idd) except KeyError: extids[id_str] = [idd] return extids #bibauthorid_maintenance personid update private methods def update_personID_canonical_names(persons_list=None, overwrite=False, suggested=''): ''' Updates 
#bibauthorid_maintenance personid update private methods
def update_personID_canonical_names(persons_list=None, overwrite=False, suggested=''):
    '''
    Updates the personID table, creating or updating canonical names
    for persons.
    @param: persons_list: persons to consider for the update  (('1'),)
    @param: overwrite: whether to touch already existing canonical names
    @param: suggested: string to suggest a canonical name for the person
    '''
    if not persons_list:
        persons_list = set([x[0] for x in run_sql('select personid from aidPERSONIDPAPERS')])

    if not overwrite:
        existing_cnamed_pids = set([x[0] for x in run_sql('select personid from aidPERSONIDDATA '
                                                          'where tag=%s', ('canonical_name',))])
        persons_list = persons_list - existing_cnamed_pids

    for idx, pid in enumerate(persons_list):
        update_status(float(idx) / float(len(persons_list)), "Updating canonical_names...")
        current_canonical = run_sql("select data from aidPERSONIDDATA where "
                                    "personid=%s and tag=%s", (pid, 'canonical_name'))

        if overwrite or len(current_canonical) == 0:
            run_sql("delete from aidPERSONIDDATA where personid=%s and tag=%s",
                    (pid, 'canonical_name'))
            names = get_person_names_count(pid)
            names = sorted(names, key=lambda k: k[1], reverse=True)
            if len(names) < 1 and not suggested:
                continue
            else:
                if suggested:
                    canonical_name = suggested
                else:
                    canonical_name = create_canonical_name(names[0][0])

                existing_cnames = run_sql("select data from aidPERSONIDDATA "
                                          "where tag=%s and data like %s",
                                          ('canonical_name', str(canonical_name) + '%'))
                existing_cnames = set(name[0].lower() for name in existing_cnames)
                for i in count(1):
                    cur_try = canonical_name + '.' + str(i)
                    if cur_try.lower() not in existing_cnames:
                        canonical_name = cur_try
                        break
                run_sql("insert into aidPERSONIDDATA (personid, tag, data) values (%s,%s,%s) ",
                        (pid, 'canonical_name', canonical_name))
    update_status_final("Updating canonical_names finished.")

def personid_get_recids_affected_since(last_timestamp):
    '''
    Returns a list of recids which have been manually changed since timestamp
    @TODO: extend the system to track and signal even automatic updates
    (unless a full reindex is acceptable in case of magic automatic update)
    @param: last_timestamp: last update, datetime.datetime
    '''
    vset = set(int(v[0].split(',')[1]) for v in run_sql(
        "select distinct value from aidUSERINPUTLOG "
        "where timestamp > %s", (last_timestamp,))
        if ',' in v[0] and ':' in v[0])

    pids = set(int(p[0]) for p in run_sql(
        "select distinct personid from aidUSERINPUTLOG "
        "where timestamp > %s", (last_timestamp,))
        if p[0] > 0)

    if pids:
        pids_s = list_2_SQL_str(pids)
        vset |= set(int(b[0]) for b in run_sql(
            "select bibrec from aidPERSONIDPAPERS "
            "where personid in %s" % pids_s))

    return list(vset) # I'm not sure about this cast. It might work without it.
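# Editor's note on the suffix loop above: canonical names are made unique
# by appending the first free '.<n>' counter to the base name produced by
# create_canonical_name. A self-contained illustration (names made up):
#
#   existing_cnames = set(['ellis_j_r.1', 'ellis_j_r.2'])
#   for i in count(1):
#       cur_try = 'Ellis_J_R' + '.' + str(i)
#       if cur_try.lower() not in existing_cnames:
#           break
#   # cur_try == 'Ellis_J_R.3'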
def get_all_paper_records(pid, claimed_only=False):
    if not claimed_only:
        return run_sql("SELECT distinct bibrec FROM aidPERSONIDPAPERS WHERE personid = %s", (str(pid),))
    else:
        return run_sql("SELECT distinct bibrec FROM aidPERSONIDPAPERS WHERE "
                       "personid = %s and (flag = 2 or flag = -2)", (str(pid),))

def get_all_modified_names_from_personid(since=None):
    if since:
        all_pids = run_sql("SELECT DISTINCT personid "
                           "FROM aidPERSONIDPAPERS "
                           "WHERE flag > -2 "
                           "AND last_updated > %s" % since)
    else:
        all_pids = run_sql("SELECT DISTINCT personid "
                           "FROM aidPERSONIDPAPERS "
                           "WHERE flag > -2 ")
    return ((name[0][0], set(n[1] for n in name), len(name))
            for name in (run_sql(
                "SELECT personid, name "
                "FROM aidPERSONIDPAPERS "
                "WHERE personid = %s "
                "AND flag > -2", p)
                for p in all_pids))

def destroy_partial_marc_caches():
    global MARC_100_700_CACHE
    MARC_100_700_CACHE = None
    gc.collect()

def populate_partial_marc_caches():
    global MARC_100_700_CACHE

    if MARC_100_700_CACHE:
        return

    def br_dictionarize(maptable):
        gc.collect()
        gc.disable()
        md = defaultdict(dict)
        maxiters = len(set(map(itemgetter(0), maptable)))
        for i, v in enumerate(groupby(maptable, itemgetter(0))):
            if i % 1000 == 0:
                update_status(float(i) / maxiters, 'br_dictionarizing...')
            if i % 500000 == 0:
                update_status(float(i) / maxiters, 'br_dictionarizing...GC')
                gc.collect()
-            id = defaultdict(list)
+            idx = defaultdict(list)
            fn = defaultdict(list)
            for _, k, z in v[1]:
-                id[k].append(z)
+                idx[k].append(z)
                fn[z].append(k)
-            md[v[0]]['id'] = id
+            md[v[0]]['id'] = idx
            md[v[0]]['fn'] = fn
        update_status_final('br_dictionarizing done')
        gc.enable()
        return md

    def bib_dictionarize(bibtable):
        return dict((i[0], (i[1], i[2])) for i in bibtable)

    update_status(.0, 'Populating get_grouped_records_table_cache')
    bibrec_bib10x = sorted(run_sql("select * from bibrec_bib10x"))
    update_status(.125, 'Populating get_grouped_records_table_cache')
    brd_b10x = br_dictionarize(bibrec_bib10x)
    del bibrec_bib10x

    update_status(.25, 'Populating get_grouped_records_table_cache')
    bibrec_bib70x = sorted(run_sql("select * from bibrec_bib70x"))
    update_status(.375, 'Populating get_grouped_records_table_cache')
    brd_b70x = br_dictionarize(bibrec_bib70x)
    del bibrec_bib70x

    update_status(.5, 'Populating get_grouped_records_table_cache')
    bib10x = (run_sql("select * from bib10x"))
    update_status(.625, 'Populating get_grouped_records_table_cache')
    bibd_10x = bib_dictionarize(bib10x)
    del bib10x

    update_status(.75, 'Populating get_grouped_records_table_cache')
    bib70x = (run_sql("select * from bib70x"))
    update_status(.875, 'Populating get_grouped_records_table_cache')
    bibd_70x = bib_dictionarize(bib70x)
    del bib70x

    update_status_final('Finished populating get_grouped_records_table_cache')
    MARC_100_700_CACHE = {'brb100':brd_b10x, 'brb700':brd_b70x,
                          'b100':bibd_10x, 'b700':bibd_70x}

def _get_grouped_records_using_caches(brr, *args):
    try:
        c = MARC_100_700_CACHE['brb%s' % str(brr[0])][brr[2]]
        fn = c['id'][brr[1]]
    except KeyError:
        return dict((arg, []) for arg in args)
    if not fn or len(fn) > 1:
        #if len(fn) > 1 it's BAD: the same signature is at least twice on the same paper.
        #Let's default to nothing, to be on the safe side.
return dict((arg, []) for arg in args) ids = set(chain(*(c['fn'][i] for i in fn))) tuples = [MARC_100_700_CACHE['b%s' % str(brr[0])][i] for i in ids] results = {} for t in tuples: if t[0] in args: try: results[t[0]].append(t[1]) except KeyError: results[t[0]] = [t[1]] for arg in args: if arg not in results.keys(): results[arg] = [] return results def _get_grouped_records_from_db(bibrefrec, *args): ''' By a given bibrefrec: mark:ref,rec this function will scan bibmarkx table and extract all records with tag in argc, which are grouped togerther with this bibrec. Returns a dictionary with { tag : [extracted_values] } if the values is not found. @type bibrefrec: (mark(int), ref(int), rec(int)) ''' table, ref, rec = bibrefrec target_table = "bib%sx" % (str(table)[:-1]) mapping_table = "bibrec_%s" % target_table group_id = run_sql("SELECT field_number " "FROM %s " "WHERE id_bibrec = %d " "AND id_bibxxx = %d" % (mapping_table, rec, ref)) if len(group_id) == 0: # unfortunately the mapping is not found, so # we cannot find anything return dict((arg, []) for arg in args) elif len(group_id) == 1: # All is fine field_number = group_id[0][0] else: # sounds bad, but ignore the error field_number = min(x[0] for x in group_id) grouped = run_sql("SELECT id_bibxxx " "FROM %s " "WHERE id_bibrec = %d " "AND field_number = %d" % (mapping_table, rec, int(field_number))) assert len(grouped) > 0, "There should be a most one grouped value per tag." grouped_s = list_2_SQL_str(grouped, lambda x: str(x[0])) ret = {} for arg in args: qry = run_sql("SELECT value " "FROM %s " "WHERE tag LIKE '%s' " "AND id IN %s" % (target_table, arg, grouped_s)) ret[arg] = [q[0] for q in qry] return ret def get_grouped_records(bibrefrec, *args): if MARC_100_700_CACHE: if bconfig.DEBUG_CHECKS: assert _get_grouped_records_using_caches(bibrefrec, *args) == _get_grouped_records_from_db(bibrefrec, *args) return _get_grouped_records_using_caches(bibrefrec, *args) else: return _get_grouped_records_from_db(bibrefrec, *args) def get_person_with_extid(idd, match_tag=False): if match_tag: mtag = " and tag = '%s'" % 'extid:' + match_tag else: mtag = '' pids = run_sql("select personid from aidPERSONIDDATA where data=%s" % '%s' + mtag, (idd,)) return set(pids) def get_inspire_id(p): ''' Gets inspire id for a signature (bibref_table,bibref_value.bibrec) ''' return get_grouped_records((str(p[0]), p[1], p[2]), str(p[0]) + '__i').values()[0] def collect_personID_external_ids_from_papers(personid, limit_to_claimed_papers=False): gathered_ids = {} if limit_to_claimed_papers: flag = 1 else: flag = -2 person_papers = run_sql("select bibref_table,bibref_value,bibrec from aidPERSONIDPAPERS where " "personid=%s and flag > %s", (personid, flag)) if COLLECT_INSPIRE_ID: inspireids = [] for p in person_papers: extid = get_inspire_id(p) if extid: inspireids.append(extid) inspireids = set((i[0] for i in inspireids)) gathered_ids['INSPIREID'] = inspireids return gathered_ids def update_personID_external_ids(persons_list=None, overwrite=False, limit_to_claimed_papers=False, force_cache_tables=False): if force_cache_tables: populate_partial_marc_caches() if not persons_list: persons_list = set([x[0] for x in run_sql('select personid from aidPERSONIDPAPERS')]) for idx, pid in enumerate(persons_list): update_status(float(idx) / float(len(persons_list)), "Updating external ids...") collected = collect_personID_external_ids_from_papers(pid, limit_to_claimed_papers=limit_to_claimed_papers) present = get_personiID_external_ids(pid) if overwrite: for idd in present.keys(): 
for k in present[idd]: remove_personID_external_id(pid, idd, value=k) present = {} for idd in collected.keys(): for k in collected[idd]: if idd not in present or k not in present[idd]: add_personID_external_id(pid, idd, k) if force_cache_tables: destroy_partial_marc_caches() update_status_final("Updating external ids finished.") def _get_name_by_bibrecref_from_db(bib): ''' @param bib: bibrefrec or bibref @type bib: (mark, bibref, bibrec) OR (mark, bibref) ''' table = "bib%sx" % str(bib[0])[:-1] refid = bib[1] tag = "%s__a" % str(bib[0]) ret = run_sql("select value from %s where id = '%s' and tag = '%s'" % (table, refid, tag)) assert len(ret) == 1, "A bibrefrec must have exactly one name(%s)" % str(bib) return ret[0][0] def _get_name_by_bibrecref_from_cache(bib): ''' @param bib: bibrefrec or bibref @type bib: (mark, bibref, bibrec) OR (mark, bibref) ''' table = "b%s" % bib[0] refid = bib[1] tag = "%s__a" % str(bib[0]) ret = None try: if tag in MARC_100_700_CACHE[table][refid][0]: ret = MARC_100_700_CACHE[table][refid][1] except (KeyError, IndexError), e: #The GC did run and the table is not clean? #We might want to allow empty response here raise Exception(str(bib) + str(e)) if bconfig.DEBUG_CHECKS: assert ret == _get_name_by_bibrecref_from_db(bib) return ret def get_name_by_bibrecref(bib): if MARC_100_700_CACHE: if bconfig.DEBUG_CHECKS: assert _get_name_by_bibrecref_from_cache(bib) == _get_name_by_bibrecref_from_db(bib) return _get_name_by_bibrecref_from_cache(bib) else: return _get_name_by_bibrecref_from_db(bib) def get_collaboration(bibrec): bibxxx = run_sql("select id_bibxxx from bibrec_bib71x where id_bibrec = %s", (str(bibrec),)) if len(bibxxx) == 0: return () bibxxx = list_2_SQL_str(bibxxx, lambda x: str(x[0])) ret = run_sql("select value from bib71x where id in %s and tag like '%s'" % (bibxxx, "710__g")) return [r[0] for r in ret] def get_key_words(bibrec): if bconfig.CFG_ADS_SITE: bibxxx = run_sql("select id_bibxxx from bibrec_bib65x where id_bibrec = %s", (str(bibrec),)) else: bibxxx = run_sql("select id_bibxxx from bibrec_bib69x where id_bibrec = %s", (str(bibrec),)) if len(bibxxx) == 0: return () bibxxx = list_2_SQL_str(bibxxx, lambda x: str(x[0])) if bconfig.CFG_ADS_SITE: ret = run_sql("select value from bib69x where id in %s and tag like '%s'" % (bibxxx, "6531_a")) else: ret = run_sql("select value from bib69x where id in %s and tag like '%s'" % (bibxxx, "695__a")) return [r[0] for r in ret] def get_all_authors(bibrec): bibxxx_1 = run_sql("select id_bibxxx from bibrec_bib10x where id_bibrec = %s", (str(bibrec),)) bibxxx_7 = run_sql("select id_bibxxx from bibrec_bib70x where id_bibrec = %s", (str(bibrec),)) if bibxxx_1: bibxxxs_1 = list_2_SQL_str(bibxxx_1, lambda x: str(x[0])) authors_1 = run_sql("select value from bib10x where tag = '%s' and id in %s" % ('100__a', bibxxxs_1,)) else: authors_1 = [] if bibxxx_7: bibxxxs_7 = list_2_SQL_str(bibxxx_7, lambda x: str(x[0])) authors_7 = run_sql("select value from bib70x where tag = '%s' and id in %s" % ('700__a', bibxxxs_7,)) else: authors_7 = [] return [a[0] for a in authors_1] + [a[0] for a in authors_7] def get_title_from_rec(rec): """ Returns the name of the paper like str if found. Otherwise returns None. 
""" title_id = run_sql("SELECT id_bibxxx " "FROM bibrec_bib24x " "WHERE id_bibrec = %s", (rec,)) if title_id: title_id_s = list_2_SQL_str(title_id, lambda x: x[0]) title = run_sql("SELECT value " "FROM bib24x " "WHERE id in %s " "AND tag = '245__a'" % title_id_s) if title: return title[0][0] def get_bib10x(): return run_sql("select id, value from bib10x where tag like %s", ("100__a",)) def get_bib70x(): return run_sql("select id, value from bib70x where tag like %s", ("700__a",)) class Bib_matrix(object): ''' This small class contains the sparse matrix and encapsulates it. ''' # please increment this value every time you # change the output of the comparison functions current_comparison_version = 10 __special_items = ((None, -3.), ('+', -2.), ('-', -1.)) special_symbols = dict((x[0], x[1]) for x in __special_items) special_numbers = dict((x[1], x[0]) for x in __special_items) def __init__(self, cluster_set=None): if cluster_set: self._bibmap = dict((b[1], b[0]) for b in enumerate(cluster_set.all_bibs())) width = len(self._bibmap) size = ((width - 1) * width) / 2 self._matrix = Bib_matrix.create_empty_matrix(size) else: self._bibmap = dict() self.creation_time = get_sql_time() @staticmethod def create_empty_matrix(lenght): ret = numpy.ndarray(shape=(lenght, 2), dtype=float, order='C') ret.fill(Bib_matrix.special_symbols[None]) return ret def _resolve_entry(self, bibs): entry = sorted(self._bibmap[bib] for bib in bibs) assert entry[0] < entry[1] return entry[0] + ((entry[1] - 1) * entry[1]) / 2 def __setitem__(self, bibs, val): entry = self._resolve_entry(bibs) self._matrix[entry] = Bib_matrix.special_symbols.get(val, val) def __getitem__(self, bibs): entry = self._resolve_entry(bibs) ret = tuple(self._matrix[entry]) return Bib_matrix.special_numbers.get(ret[0], ret) def __contains__(self, bib): return bib in self._bibmap def get_keys(self): return self._bibmap.keys() @staticmethod def get_file_dir(name): sub_dir = name[:2] if not sub_dir: sub_dir = "empty_last_name" return "%s%s/" % (bconfig.TORTOISE_FILES_PATH, sub_dir) @staticmethod def get_map_path(dir_path, name): return "%s%s.bibmap" % (dir_path, name) @staticmethod def get_matrix_path(dir_path, name): return "%s%s.npy" % (dir_path, name) def load(self, name, load_map=True, load_matrix=True): files_dir = Bib_matrix.get_file_dir(name) if not os.path.isdir(files_dir): self._bibmap = dict() return False try: if load_map: bibmap_v = cPickle.load(open(Bib_matrix.get_map_path(files_dir, name), 'r')) rec_v, self.creation_time, self._bibmap = bibmap_v if (rec_v != Bib_matrix.current_comparison_version or Bib_matrix.current_comparison_version < 0): # you can use negative # version to recalculate self._bibmap = dict() return False if load_matrix: self._matrix = numpy.load(Bib_matrix.get_matrix_path(files_dir, name)) except (IOError, UnpicklingError): if load_map: self._bibmap = dict() self.creation_time = get_sql_time() return False return True def store(self, name): files_dir = Bib_matrix.get_file_dir(name) if not os.path.isdir(files_dir): try: os.mkdir(files_dir) except OSError, e: if e.errno == 17 or 'file exists' in str(e.strerror).lower(): pass else: raise e bibmap_v = (Bib_matrix.current_comparison_version, self.creation_time, self._bibmap) cPickle.dump(bibmap_v, open(Bib_matrix.get_map_path(files_dir, name), 'w')) numpy.save(open(Bib_matrix.get_matrix_path(files_dir, name), "w"), self._matrix) def delete_paper_from_personid(rec): ''' Deletes all information in PERSONID about a given paper ''' run_sql("delete from aidPERSONIDPAPERS where 
bibrec = %s", (rec,)) def get_signatures_from_rec(bibrec): ''' Retrieves all information in PERSONID about a given bibrec. ''' return run_sql("select personid, bibref_table, bibref_value, bibrec, name " "from aidPERSONIDPAPERS where bibrec = %s" , (bibrec,)) def modify_signature(oldref, oldrec, newref, newname): ''' Modifies a signature in aidPERSONIDpapers. ''' return run_sql("UPDATE aidPERSONIDPAPERS " "SET bibref_table = %s, bibref_value = %s, name = %s " "WHERE bibref_table = %s AND bibref_value = %s AND bibrec = %s" , (str(newref[0]), newref[1], newname, str(oldref[0]), oldref[1], oldrec)) def find_pids_by_name(name): ''' Finds names and personids by a prefix name. ''' return set(run_sql("SELECT personid, name " "FROM aidPERSONIDPAPERS " "WHERE name like %s" , (name + ',%',))) def find_pids_by_exact_name(name): """ Finds names and personids by a name. """ return set(run_sql("SELECT personid " "FROM aidPERSONIDPAPERS " "WHERE name = %s" , (name,))) def remove_sigs(signatures): ''' Removes records from aidPERSONIDPAPERS ''' for sig in signatures: run_sql("DELETE FROM aidPERSONIDPAPERS " "WHERE bibref_table like %s AND bibref_value = %s AND bibrec = %s" , (str(sig[0]), sig[1], sig[2])) def remove_personid_papers(pids): ''' Removes all signatures from aidPERSONIDPAPERS with pid in pids ''' if pids: run_sql("delete from aidPERSONIDPAPERS where personid in %s" % list_2_SQL_str(pids)) def get_full_personid_papers(table_name="`aidPERSONIDPAPERS`"): ''' Get all columns and rows from aidPERSONIDPAPERS or any other table with the same structure. ''' return run_sql("select personid, bibref_table, " "bibref_value, bibrec, name, flag, " "lcul from %s" % table_name) def get_full_results(): ''' Depricated. Should be removed soon. ''' return run_sql("select personid, bibref_table, bibref_value, bibrec " "from aidRESULTS") def get_lastname_results(last_name): ''' Returns rows from aidRESULTS which share a common last name. ''' return run_sql("select personid, bibref_table, bibref_value, bibrec " "from aidRESULTS " "where personid like '" + last_name + ".%'") def get_full_personid_data(table_name="`aidPERSONIDDATA`"): ''' Get all columns and rows from aidPERSONIDDATA or any other table with the same structure. ''' return run_sql("select personid, tag, data, " "opt1, opt2, opt3 from %s" % table_name) def get_name_string_to_pid_dictionary(): namesdict = {} all_names = run_sql("select personid,name from aidPERSONIDPAPERS") for x in all_names: namesdict.setdefault(x[1], set()).add(x[0]) return namesdict #could be useful to optimize rabbit, still unused, watch out. def get_bibrecref_to_pid_dictuonary(): brr2pid = {} all_brr = run_sql("select personid,bibref_table,bibref_value.bibrec from aidPERSONIDPAPERS") for x in all_brr: brr2pid.setdefault(tuple(x[1:]), set()).add(x[0]) return brr2pid def check_personid_papers(output_file=None): ''' Checks all invariants of personid. Writes in stdout if output_file if False. ''' if output_file: fp = open(output_file, "w") printer = lambda x: fp.write(x + '\n') else: printer = bibauthor_print checkers = ( check_wrong_names, check_duplicated_papers, check_duplicated_signatures, check_wrong_rejection, check_canonical_names, check_empty_personids #check_claim_inspireid_contradiction ) # Avoid writing f(a) or g(a), because one of the calls # might be optimized. return all([check(printer) for check in checkers]) def repair_personid(output_file=None): ''' This should make check_personid_papers() to return true. 
def repair_personid(output_file=None):
    '''
    This should make check_personid_papers() return True.
    '''
    if output_file:
        fp = open(output_file, "w")
        printer = lambda x: fp.write(x + '\n')
    else:
        printer = bibauthor_print

    checkers = (
                check_wrong_names,
                check_duplicated_papers,
                check_duplicated_signatures,
                check_wrong_rejection,
                check_canonical_names,
                check_empty_personids
                #check_claim_inspireid_contradiction
               )

    first_check = [check(printer) for check in checkers]
    repair_pass = [check(printer, repair=True) for check in checkers]
    last_check = [check(printer) for check in checkers]

    if not all(first_check):
        assert not(all(repair_pass))
    assert all(last_check)

    return all(last_check)

def check_duplicated_papers(printer, repair=False):
    all_ok = True
    bibrecs_to_reassign = []
    recs = run_sql("select personid,bibrec from aidPERSONIDPAPERS where flag <> %s", (-2,))
    d = {}
    for x, y in recs:
        d.setdefault(x, []).append(y)

    for pid, bibrec in d.iteritems():
        if not len(bibrec) == len(set(bibrec)):
            all_ok = False
            dups = sorted(bibrec)
            dups = [x for i, x in enumerate(dups[0:len(dups) - 1]) if x == dups[i + 1]]
            printer("Person %d has duplicated papers: %s" % (pid, dups))

            if repair:
                for dupbibrec in dups:
                    printer("Repairing duplicated bibrec %s" % str(dupbibrec))
                    involved_claimed = run_sql("select personid,bibref_table,bibref_value,bibrec,flag "
                                               "from aidPERSONIDPAPERS where personid=%s and bibrec=%s "
                                               "and flag >= 2", (pid, dupbibrec))
                    if len(involved_claimed) != 1:
                        bibrecs_to_reassign.append(dupbibrec)
                        run_sql("delete from aidPERSONIDPAPERS where personid=%s and bibrec=%s",
                                (pid, dupbibrec))
                    else:
                        involved_not_claimed = run_sql("select personid,bibref_table,bibref_value,bibrec,flag "
                                                       "from aidPERSONIDPAPERS where personid=%s and bibrec=%s "
                                                       "and flag < 2", (pid, dupbibrec))
                        for v in involved_not_claimed:
                            run_sql("delete from aidPERSONIDPAPERS where personid=%s and bibref_table=%s "
                                    "and bibref_value=%s and bibrec=%s and flag=%s", (v))

    if repair and bibrecs_to_reassign:
        printer("Reassigning deleted bibrecs %s" % str(bibrecs_to_reassign))
        from bibauthorid_rabbit import rabbit
        rabbit(bibrecs_to_reassign)

    return all_ok

def check_duplicated_signatures(printer, repair=False):
    all_ok = True
    brr = run_sql("select bibref_table, bibref_value, bibrec from aidPERSONIDPAPERS where flag > %s",
                  ("-2",))
    bibrecs_to_reassign = []
    d = {}
    for x, y, z in brr:
        d.setdefault(z, []).append((x, y))

    for bibrec, bibrefs in d.iteritems():
        if not len(bibrefs) == len(set(bibrefs)):
            all_ok = False
            dups = sorted(bibrefs)
            dups = [x for i, x in enumerate(dups[0:len(dups) - 1]) if x == dups[i + 1]]
            printer("Paper %d has duplicated signatures: %s" % (bibrec, dups))

            if repair:
                for dup in set(dups):
                    printer("Repairing duplicate %s" % str(dup))
                    claimed = run_sql("select personid,bibref_table,bibref_value,bibrec from "
                                      "aidPERSONIDPAPERS where bibref_table=%s and bibref_value=%s "
                                      "and bibrec=%s and flag=2", (dup[0], dup[1], bibrec))
                    if len(claimed) != 1:
                        bibrecs_to_reassign.append(bibrec)
                        run_sql("delete from aidPERSONIDPAPERS where bibref_table=%s and "
                                "bibref_value = %s and bibrec = %s", (dup[0], dup[1], bibrec))
                    else:
                        run_sql("delete from aidPERSONIDPAPERS where bibref_table=%s and bibref_value=%s "
                                "and bibrec=%s and flag<2", (dup[0], dup[1], bibrec))

    if repair and bibrecs_to_reassign:
        printer("Reassigning deleted duplicates %s" % str(bibrecs_to_reassign))
        from bibauthorid_rabbit import rabbit
        rabbit(bibrecs_to_reassign)

    return all_ok
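# Editor's note: both checkers above spot duplicates with the same idiom --
# sort the list, then keep every element equal to its right-hand neighbour:
#
#   dups = sorted([3, 1, 3, 2, 1])            # -> [1, 1, 2, 3, 3]
#   [x for i, x in enumerate(dups[:-1]) if x == dups[i + 1]]
#   # -> [1, 3]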
''' bib100 = dict(((x[0], create_normalized_name(split_name_parts(x[1]))) for x in get_bib10x())) bib700 = dict(((x[0], create_normalized_name(split_name_parts(x[1]))) for x in get_bib70x())) pidnames100 = set(run_sql("select bibref_value, name from aidPERSONIDPAPERS " " where bibref_table='100'")) pidnames700 = set(run_sql("select bibref_value, name from aidPERSONIDPAPERS " " where bibref_table='700'")) wrong100 = set(('100', x[0], bib100.get(x[0], None)) for x in pidnames100 if x[1] != bib100.get(x[0], None)) wrong700 = set(('700', x[0], bib700.get(x[0], None)) for x in pidnames700 if x[1] != bib700.get(x[0], None)) total = len(wrong100) + len(wrong700) return chain(wrong100, wrong700), total def check_wrong_names(printer, repair=False): ret = True wrong_names, number = get_wrong_names() if number > 0: ret = False printer("%d corrupted names in aidPERSONIDPAPERS." % number) - for i, wrong_name in enumerate(wrong_names): + for wrong_name in wrong_names: if wrong_name[2]: printer("Outdated name, '%s'(%s:%d)." % (wrong_name[2], wrong_name[0], wrong_name[1])) else: printer("Invalid id(%s:%d)." % (wrong_name[0], wrong_name[1])) if repair: printer("Fixing wrong name: %s" % str(wrong_name)) if wrong_name[2]: run_sql("update aidPERSONIDPAPERS set name=%s where bibref_table=%s and bibref_value=%s", (wrong_name[2], wrong_name[0], wrong_name[1])) else: run_sql("delete from aidPERSONIDPAPERS where bibref_table=%s and bibref_value=%s", (wrong_name[0], wrong_name[1])) return ret def check_canonical_names(printer, repair=False): ret = True pid_cn = run_sql("select personid, data from aidPERSONIDDATA where tag = %s", ('canonical_name',)) pid_2_cn = dict((k, len(list(d))) for k, d in groupby(sorted(pid_cn, key=itemgetter(0)), key=itemgetter(0))) pid_to_repair = [] for pid in get_existing_personids(): canon = pid_2_cn.get(pid, 0) if canon != 1: if canon == 0: papers = run_sql("select count(*) from aidPERSONIDPAPERS where personid = %s", (pid,))[0][0] if papers != 0: printer("Personid %d does not have a canonical name, but has %d papers." % (pid, papers)) ret = False pid_to_repair.append(pid) else: printer("Personid %d has %d canonical names." % (pid, canon)) pid_to_repair.append(pid) ret = False if repair and not ret: printer("Repairing canonical names for pids: %s" % str(pid_to_repair)) update_personID_canonical_names(pid_to_repair, overwrite=True) return ret def check_empty_personids(printer, repair=False): ret = True paper_pids = set(p[0] for p in run_sql("select personid from aidPERSONIDPAPERS")) data_pids = set(p[0] for p in run_sql("select personid from aidPERSONIDDATA")) for p in data_pids - paper_pids: fields = run_sql("select count(*) from aidPERSONIDDATA where personid = %s and tag <> %s", (p, "canonical_name",))[0][0] if fields == 0: printer("Personid %d has no papers and nothing other than a canonical name."
% p) if repair: printer("Deleting empty person %s" % str(p)) run_sql("delete from aidPERSONIDDATA where personid=%s", (p,)) ret = False return ret def check_wrong_rejection(printer, repair=False): all_ok = True to_reassign = [] to_deal_with = [] all_rejections = set(run_sql("select bibref_table, bibref_value, bibrec " "from aidPERSONIDPAPERS " "where flag = %s", ('-2',))) all_confirmed = set(run_sql("select bibref_table, bibref_value, bibrec " "from aidPERSONIDPAPERS " "where flag > %s", ('-2',))) not_assigned = all_rejections - all_confirmed if not_assigned: all_ok = False for s in not_assigned: printer('Paper (%s:%s,%s) was rejected but never reassigned' % s) to_reassign.append(s[2]) all_rejections = set(run_sql("select personid, bibref_table, bibref_value, bibrec " "from aidPERSONIDPAPERS " "where flag = %s", ('-2',))) all_confirmed = set(run_sql("select personid, bibref_table, bibref_value, bibrec " "from aidPERSONIDPAPERS " "where flag > %s", ('-2',))) both_confirmed_and_rejected = all_rejections & all_confirmed if both_confirmed_and_rejected: all_ok = False for i in both_confirmed_and_rejected: printer("Conflicting assignment/rejection: %s" % str(i)) to_deal_with.append(i) if repair and (to_reassign or to_deal_with): from bibauthorid_rabbit import rabbit if to_reassign: printer("Reassigning bibrecs with missing entries: %s" % str(to_reassign)) rabbit(to_reassign) if to_deal_with: # We got claims and rejections on the same paper for the same person. Forget about # it and reassign it automatically; the users will make up their minds sooner or later. printer("Deleting and reassigning bibrefrecs with conflicts %s" % str(to_deal_with)) for sig in to_deal_with: run_sql("delete from aidPERSONIDPAPERS where personid=%s and bibref_table=%s and " "bibref_value=%s and bibrec = %s", sig) rabbit(map(itemgetter(3), to_deal_with)) return all_ok def check_merger(): ''' This function presumes that copy_personids() was called before the merger.
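Expected call sequence, as a sketch: first copy_personids() to snapshot the aidPERSONID* tables, then the merger itself, then check_merger() to compare the claims and signatures against that snapshot.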
''' is_ok = True old_claims = set(run_sql("select personid, bibref_table, bibref_value, bibrec, flag " "from aidPERSONIDPAPERS_copy " "where flag = -2 or flag = 2")) cur_claims = set(run_sql("select personid, bibref_table, bibref_value, bibrec, flag " "from aidPERSONIDPAPERS " "where flag = -2 or flag = 2")) errors = ((old_claims - cur_claims, "Some claims were lost during the merge."), (cur_claims - old_claims, "Some new claims appeared after the merge.")) act = { -2 : 'Rejection', 2 : 'Claim' } for err_set, err_msg in errors: if err_set: is_ok = False bibauthor_print(err_msg) bibauthor_print("".join(" %s: personid %d %d:%d,%d\n" % (act[cl[4]], cl[0], int(cl[1]), cl[2], cl[3]) for cl in err_set)) old_assigned = set(run_sql("select bibref_table, bibref_value, bibrec " "from aidPERSONIDPAPERS_copy")) #"where flag <> -2 and flag <> 2")) cur_assigned = set(run_sql("select bibref_table, bibref_value, bibrec " "from aidPERSONIDPAPERS")) #"where flag <> -2 and flag <> 2")) errors = ((old_assigned - cur_assigned, "Some signatures were lost during the merge."), (cur_assigned - old_assigned, "Some new signatures appeared after the merge.")) for err_sig, err_msg in errors: if err_sig: is_ok = False bibauthor_print(err_msg) bibauthor_print("".join(" %s:%d,%d\n" % sig for sig in err_sig)) return is_ok def check_results(): is_ok = True all_result_rows = run_sql("select * from aidRESULTS") keyfunc = lambda x: x[1:] duplicated = (d for d in (list(d) for k, d in groupby(sorted(all_result_rows, key=keyfunc), key=keyfunc)) if len(d) > 1) for dd in duplicated: is_ok = False for d in dd: print "Duplicated row in aidRESULTS" print "%s %s %s %s" % d print clusters = {} for rr in all_result_rows: clusters[rr[0]] = clusters.get(rr[0], []) + [rr[3]] faulty_clusters = dict((cid, len(recs) - len(set(recs))) for cid, recs in clusters.items() if not len(recs) == len(set(recs))) if faulty_clusters: is_ok = False print "Recids NOT unique in clusters!" 
print ("A total of %s clusters hold an average of %.2f duplicates" % (len(faulty_clusters), (sum(faulty_clusters.values()) / float(len(faulty_clusters))))) for c in faulty_clusters: print "Name: %-20s Size: %4d Faulty: %2d" % (c, len(clusters[c]), faulty_clusters[c]) return is_ok def check_claim_inspireid_contradiction(): iids10x = run_sql("select id from bib10x where tag = '100__i'") iids70x = run_sql("select id from bib70x where tag = '700__i'") refs10x = set(x[0] for x in run_sql("select id from bib10x where tag = '100__a'")) refs70x = set(x[0] for x in run_sql("select id from bib70x where tag = '700__a'")) if iids10x: iids10x = list_2_SQL_str(iids10x, lambda x: str(x[0])) iids10x = run_sql("select id_bibxxx, id_bibrec, field_number " "from bibrec_bib10x " "where id_bibxxx in %s" % iids10x) iids10x = ((row[0], [(ref, rec) for ref, rec in run_sql( "select id_bibxxx, id_bibrec " "from bibrec_bib10x " "where id_bibrec = '%s' " "and field_number = '%s'" % row[1:]) if ref in refs10x]) for row in iids10x) else: iids10x = () if iids70x: iids70x = list_2_SQL_str(iids70x, lambda x: str(x[0])) iids70x = run_sql("select id_bibxxx, id_bibrec, field_number " "from bibrec_bib70x " "where id_bibxxx in %s" % iids70x) iids70x = ((row[0], [(ref, rec) for ref, rec in run_sql( "select id_bibxxx, id_bibrec " "from bibrec_bib70x " "where id_bibrec = '%s' " "and field_number = '%s'" % (row[1:])) if ref in refs70x]) for row in iids70x) else: iids70x = () # [(iids, [bibs])] inspired = list(chain(((iid, list(set(('100',) + bib for bib in bibs))) for iid, bibs in iids10x), ((iid, list(set(('700',) + bib for bib in bibs))) for iid, bibs in iids70x))) assert all(len(x[1]) == 1 for x in inspired) inspired = ((k, map(itemgetter(0), map(itemgetter(1), d))) for k, d in groupby(sorted(inspired, key=itemgetter(0)), key=itemgetter(0))) # [(inspireid, [bibs])] inspired = [([(run_sql("select personid " "from aidPERSONIDPAPERS " "where bibref_table = %s " "and bibref_value = %s " "and bibrec = %s " "and flag = '2'" , bib), bib) for bib in cluster[1]], cluster[0]) for cluster in inspired] # [([([pid], bibs)], inspireid)] for cluster, iid in inspired: pids = set(chain.from_iterable(imap(itemgetter(0), cluster))) if len(pids) > 1: print "InspireID: %s links the following papers:" % iid print map(itemgetter(1), cluster) print "More than one personid claimed them:" print list(pids) print continue if len(pids) == 0: # not even one paper with this inspireid has been # claimed, screw it continue pid = list(pids)[0][0] # The last step is to check all non-claimed papers for being # claimed by the person on some different signature. problem = (run_sql("select bibref_table, bibref_value, bibrec " "from aidPERSONIDPAPERS " "where bibrec = %s " "and personid = %s " "and flag = %s" , (bib[2], pid, 2)) for bib in (bib for lpid, bib in cluster if not lpid)) problem = list(chain.from_iterable(problem)) if problem: print "A personid has claimed a paper from an inspireid cluster and a contradictory paper." 
print "Personid %d" % pid print "Inspireid cluster %s" % str(map(itemgetter(1), cluster)) print "Contradicting claims: %s" % str(problem) print def get_all_bibrecs(): return [x[0] for x in run_sql("select distinct bibrec from aidPERSONIDPAPERS")] def get_bibrefrec_to_pid_flag_mapping(): whole_table = run_sql("select bibref_table,bibref_value,bibrec,personid,flag from aidPERSONIDPAPERS") ret = {} for x in whole_table: sig = (x[0], x[1], x[2]) pid_flag = (x[3], x[4]) ret[sig] = ret.get(sig , []) + [pid_flag] return ret def remove_all_bibrecs(bibrecs): bibrecs_s = list_2_SQL_str(bibrecs) run_sql("delete from aidPERSONIDPAPERS where bibrec in %s" % bibrecs_s) def empty_results_table(): run_sql("TRUNCATE aidRESULTS") def save_cluster(named_cluster): name, cluster = named_cluster for bib in cluster.bibs: run_sql("INSERT INTO aidRESULTS " "(personid, bibref_table, bibref_value, bibrec) " "VALUES (%s, %s, %s, %s) " , (name, str(bib[0]), bib[1], bib[2])) def remove_result_cluster(name): run_sql("DELETE FROM aidRESULTS " "WHERE personid like '%s.%%'" % name) def personid_name_from_signature(sig): ret = run_sql("select personid, name " "from aidPERSONIDPAPERS " "where bibref_table = %s and bibref_value = %s and bibrec = %s " "and flag > '-2'" , sig) assert len(ret) < 2, ret return ret def personid_from_signature(sig): ret = run_sql("select personid, flag " "from aidPERSONIDPAPERS " "where bibref_table = %s and bibref_value = %s and bibrec = %s " "and flag > '-2'" , sig) assert len(ret) < 2, ret return ret def get_signature_info(sig): ret = run_sql("select personid, flag " "from aidPERSONIDPAPERS " "where bibref_table = %s and bibref_value = %s and bibrec = %s " "order by flag" , sig) return ret def get_claimed_papers(pid): return run_sql("select bibref_table, bibref_value, bibrec " "from aidPERSONIDPAPERS " "where personid = %s " "and flag > %s", (pid, 1)) def copy_personids(): run_sql("DROP TABLE IF EXISTS `aidPERSONIDDATA_copy`") run_sql("CREATE TABLE `aidPERSONIDDATA_copy` ( " "`personid` BIGINT( 8 ) UNSIGNED NOT NULL , " "`tag` VARCHAR( 64 ) NOT NULL , " "`data` VARCHAR( 256 ) NOT NULL , " "`opt1` MEDIUMINT( 8 ) DEFAULT NULL , " "`opt2` MEDIUMINT( 8 ) DEFAULT NULL , " "`opt3` VARCHAR( 256 ) DEFAULT NULL , " "KEY `personid-b` ( `personid` ) , " "KEY `tag-b` ( `tag` ) , " "KEY `data-b` ( `data` ) , " "KEY `opt1` ( `opt1` ) " ") ENGINE = MYISAM DEFAULT CHARSET = utf8") run_sql("INSERT INTO `aidPERSONIDDATA_copy` " "SELECT * " "FROM `aidPERSONIDDATA`") run_sql("DROP TABLE IF EXISTS `aidPERSONIDPAPERS_copy`") run_sql("CREATE TABLE `aidPERSONIDPAPERS_copy` ( " "`personid` bigint( 8 ) unsigned NOT NULL , " "`bibref_table` enum( '100', '700' ) NOT NULL , " "`bibref_value` mediumint( 8 ) unsigned NOT NULL , " "`bibrec` mediumint( 8 ) unsigned NOT NULL , " "`name` varchar( 256 ) NOT NULL , " "`flag` smallint( 2 ) NOT NULL DEFAULT '0', " "`lcul` smallint( 2 ) NOT NULL DEFAULT '0', " "`last_updated` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP , " "KEY `personid-b` ( `personid` ) , " "KEY `reftable-b` ( `bibref_table` ) , " "KEY `refvalue-b` ( `bibref_value` ) , " "KEY `rec-b` ( `bibrec` ) , " "KEY `name-b` ( `name` ) , " "KEY `timestamp-b` ( `last_updated` ) , " "KEY `ptvrf-b` ( `personid` , `bibref_table` , `bibref_value` , `bibrec` , `flag` ) " ") ENGINE = MyISAM DEFAULT CHARSET = utf8") run_sql("INSERT INTO `aidPERSONIDPAPERS_copy` " "SELECT * " "FROM `aidPERSONIDPAPERS") def delete_empty_persons(): pp = run_sql("select personid from aidPERSONIDPAPERS") pp = set(p[0] for p in 
pp) pd = run_sql("select personid from aidPERSONIDDATA") pd = set(p[0] for p in pd) fpd = run_sql("select personid from aidPERSONIDDATA where tag <> 'canonical_name'") fpd = set(p[0] for p in fpd) to_delete = pd - (pp | fpd) if to_delete: run_sql("delete from aidPERSONIDDATA where personid in %s" % list_2_SQL_str(to_delete)) def restore_personids(): run_sql("TRUNCATE `aidPERSONIDDATA`") run_sql("INSERT INTO `aidPERSONIDDATA` " "SELECT * " "FROM `aidPERSONIDDATA_copy`") run_sql("TRUNCATE `aidPERSONIDPAPERS`") run_sql("INSERT INTO `aidPERSONIDPAPERS` " "SELECT * " "FROM `aidPERSONIDPAPERS_copy`") def resolve_affiliation(ambiguous_aff_string): """ This is a method available in the context of author disambiguation in ADS only. No other platform provides the db table used by this function. @warning: to be used in an ADS context only. @param ambiguous_aff_string: Ambiguous affiliation string @type ambiguous_aff_string: str @return: The normalized version of the name string as presented in the database @rtype: str """ if not ambiguous_aff_string or not bconfig.CFG_ADS_SITE: return "None" aff_id = run_sql("select aff_id from ads_affiliations where affstring=%s", (ambiguous_aff_string,)) if aff_id: return aff_id[0][0] else: return "None" def get_free_pids(): ''' Returns an iterator over all free personids; it fills the holes in the existing personid sequence first. ''' all_pids = frozenset(x[0] for x in chain( run_sql("select personid from aidPERSONIDPAPERS") , run_sql("select personid from aidPERSONIDDATA"))) return ifilter(lambda x: x not in all_pids, count()) def remove_results_outside(many_names): many_names = frozenset(many_names) res_names = frozenset(x[0].split(".")[0] for x in run_sql("select personid from aidRESULTS")) for name in res_names - many_names: run_sql("delete from aidRESULTS where personid like '%s.%%'" % name) def get_signatures_from_bibrefs(bibrefs): bibrefs = list(bibrefs) # materialize: the two ifilters below would otherwise exhaust a shared iterator bib10x = ifilter(lambda x: x[0] == 100, bibrefs) bib10x_s = list_2_SQL_str(bib10x, lambda x: x[1]) bib70x = ifilter(lambda x: x[0] == 700, bibrefs) bib70x_s = list_2_SQL_str(bib70x, lambda x: x[1]) valid_recs = set(get_all_valid_bibrecs()) if bib10x_s != '()': sig10x = run_sql("select 100, id_bibxxx, id_bibrec " "from bibrec_bib10x " "where id_bibxxx in %s" % (bib10x_s,)) else: sig10x = () if bib70x_s != '()': sig70x = run_sql("select 700, id_bibxxx, id_bibrec " "from bibrec_bib70x " "where id_bibxxx in %s" % (bib70x_s,)) else: sig70x = () return ifilter(lambda x: x[2] in valid_recs, chain(sig10x, sig70x)) # a signature is (table, ref, rec); keep those whose rec is valid def get_all_valid_bibrecs(): collection_restriction_pattern = " or ".join(["980__a:\"%s\"" % x for x in bconfig.LIMIT_TO_COLLECTIONS]) return perform_request_search(p="%s" % collection_restriction_pattern, rg=0) def get_coauthor_pids(pid, exclude_bibrecs=None): papers = get_person_bibrecs(pid) if exclude_bibrecs: papers = set(papers) - set(exclude_bibrecs) if not papers: return [] papers_s = list_2_SQL_str(papers) pids = run_sql("select personid,bibrec from aidPERSONIDPAPERS " "where bibrec in %s and flag > -2" % papers_s) pids = set((int(p[0]), int(p[1])) for p in pids) pids = sorted([p[0] for p in pids]) pids = groupby(pids) pids = [(key, len(list(val))) for key, val in pids if key != pid] pids = sorted(pids, key=lambda x: x[1], reverse=True) return pids diff --git a/modules/bibauthorid/lib/bibauthorid_rabbit.py b/modules/bibauthorid/lib/bibauthorid_rabbit.py index 8b75f847e..9930608a7 100644 --- a/modules/bibauthorid/lib/bibauthorid_rabbit.py +++ b/modules/bibauthorid/lib/bibauthorid_rabbit.py @@ -1,188 +1,187 @@ # -*- coding: utf-8 -*- ## ## This file
is part of Invenio. ## Copyright (C) 2011, 2012 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. from itertools import cycle, imap, chain, izip from operator import itemgetter from bibtask import task_sleep_now_if_required import bibauthorid_config as bconfig from bibauthorid_comparison import cached_sym from bibauthorid_name_utils import compare_names as comp_names from bibauthorid_name_utils import split_name_parts from bibauthorid_name_utils import create_normalized_name from bibauthorid_general_utils import update_status \ , update_status_final from bibauthorid_matrix_optimization import maximized_mapping from bibauthorid_backinterface import get_all_valid_bibrecs from bibauthorid_backinterface import filter_bibrecs_outside from bibauthorid_backinterface import get_deleted_papers from bibauthorid_backinterface import delete_paper_from_personid from bibauthorid_backinterface import get_authors_from_paper from bibauthorid_backinterface import get_coauthors_from_paper from bibauthorid_backinterface import get_signatures_from_rec from bibauthorid_backinterface import modify_signature from bibauthorid_backinterface import remove_sigs from bibauthorid_backinterface import find_pids_by_exact_name as _find_pids_by_exact_name from bibauthorid_backinterface import new_person_from_signature as _new_person_from_signature from bibauthorid_backinterface import add_signature as _add_signature from bibauthorid_backinterface import update_personID_canonical_names from bibauthorid_backinterface import update_personID_external_ids from bibauthorid_backinterface import get_name_by_bibrecref from bibauthorid_backinterface import populate_partial_marc_caches from bibauthorid_backinterface import destroy_partial_marc_caches from bibauthorid_backinterface import get_inspire_id from bibauthorid_backinterface import get_person_with_extid from bibauthorid_backinterface import get_name_string_to_pid_dictionary from bibauthorid_backinterface import get_new_personid -from bibauthorid_backinterface import get_bibrecref_to_pid_dictuonary USE_EXT_IDS = bconfig.RABBIT_USE_EXTERNAL_IDS USE_INSPIREID = bconfig.RABBIT_USE_EXTERNAL_ID_INSPIREID def rabbit(bibrecs, check_invalid_papers=False): ''' @param bibrecs: an iterable full of bibrecs @type bibrecs: an iterable of ints @return: none ''' if bconfig.RABBIT_USE_CACHED_PID: PID_NAMES_CACHE = get_name_string_to_pid_dictionary() def find_pids_by_exact_names_cache(name): try: return zip(PID_NAMES_CACHE[name]) except KeyError: return [] def add_signature_using_names_cache(sig, name, pid): try: PID_NAMES_CACHE[name].add(pid) except KeyError: PID_NAMES_CACHE[name] = set([pid]) _add_signature(sig, name, pid) def new_person_from_signature_using_names_cache(sig, name): pid = get_new_personid() add_signature_using_names_cache(sig, name, pid) return pid add_signature = add_signature_using_names_cache new_person_from_signature = 
new_person_from_signature_using_names_cache find_pids_by_exact_name = find_pids_by_exact_names_cache else: add_signature = _add_signature new_person_from_signature = _new_person_from_signature find_pids_by_exact_name = _find_pids_by_exact_name compare_names = cached_sym(lambda x: x)(comp_names) # fast assign threshold threshold = 0.80 if not bibrecs or check_invalid_papers: all_bibrecs = get_all_valid_bibrecs() if not bibrecs: bibrecs = all_bibrecs if check_invalid_papers: filter_bibrecs_outside(all_bibrecs) if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and len(bibrecs) > bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD): populate_partial_marc_caches() SWAPPED_GET_GROUPED_RECORDS = True else: SWAPPED_GET_GROUPED_RECORDS = False updated_pids = set() deleted = frozenset(p[0] for p in get_deleted_papers()) for idx, rec in enumerate(bibrecs): task_sleep_now_if_required(True) update_status(float(idx) / len(bibrecs), "%d/%d current: %d" % (idx, len(bibrecs), rec)) if rec in deleted: delete_paper_from_personid(rec) continue markrefs = frozenset(chain(izip(cycle([100]), imap(itemgetter(0), get_authors_from_paper(rec))), izip(cycle([700]), imap(itemgetter(0), get_coauthors_from_paper(rec))))) personid_rows = [map(int, row[:3]) + [row[4]] for row in get_signatures_from_rec(rec)] personidrefs_names = dict(((row[1], row[2]), row[3]) for row in personid_rows) personidrefs = frozenset(personidrefs_names.keys()) new_signatures = list(markrefs - personidrefs) old_signatures = list(personidrefs - markrefs) new_signatures_names = dict((new, create_normalized_name(split_name_parts(get_name_by_bibrecref(new)))) for new in new_signatures) # matrix |new_signatures| X |old_signatures| matrix = [[compare_names(new_signatures_names[new], personidrefs_names[old]) for old in old_signatures] for new in new_signatures] # [(new_signatures, old_signatures)] best_match = [(new_signatures[new], old_signatures[old]) for new, old, score in maximized_mapping(matrix) if score > threshold] for new, old in best_match: modify_signature(old, rec, new, new_signatures_names[new]) remove_sigs(tuple(list(old) + [rec]) for old in old_signatures) not_matched = frozenset(new_signatures) - frozenset(map(itemgetter(0), best_match)) if not_matched: used_pids = set(r[0] for r in personid_rows) for sig in not_matched: name = new_signatures_names[sig] matched_pids = [] if USE_EXT_IDS: if USE_INSPIREID: inspire_id = get_inspire_id(sig + (rec,)) if inspire_id: matched_pids = list(get_person_with_extid(inspire_id[0])) if matched_pids: add_signature(list(sig) + [rec], name, matched_pids[0][0]) updated_pids.add(matched_pids[0][0]) continue matched_pids = find_pids_by_exact_name(name) matched_pids = [p for p in matched_pids if int(p[0]) not in used_pids] if not matched_pids: new_pid = new_person_from_signature(list(sig) + [rec], name) used_pids.add(new_pid) updated_pids.add(new_pid) else: add_signature(list(sig) + [rec], name, matched_pids[0][0]) used_pids.add(matched_pids[0][0]) updated_pids.add(matched_pids[0][0]) update_status_final() if updated_pids: # an empty set will update all canonical_names update_personID_canonical_names(updated_pids) update_personID_external_ids(updated_pids, limit_to_claimed_papers=bconfig.LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS) if SWAPPED_GET_GROUPED_RECORDS: destroy_partial_marc_caches() diff --git a/modules/bibauthorid/lib/bibauthorid_tortoise.py b/modules/bibauthorid/lib/bibauthorid_tortoise.py index 072a23130..26ce7d9f9 100644 --- a/modules/bibauthorid/lib/bibauthorid_tortoise.py +++ 
b/modules/bibauthorid/lib/bibauthorid_tortoise.py @@ -1,208 +1,206 @@ # -*- coding: utf-8 -*- ## ## This file is part of Invenio. ## Copyright (C) 2011, 2012 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. import bibauthorid_config as bconfig from datetime import datetime import os from bibauthorid_cluster_set import delayed_cluster_sets_from_marktables from bibauthorid_cluster_set import delayed_cluster_sets_from_personid from bibauthorid_wedge import wedge from bibauthorid_name_utils import generate_last_name_cluster_str from bibauthorid_backinterface import empty_results_table from bibauthorid_backinterface import remove_result_cluster from bibauthorid_general_utils import bibauthor_print from bibauthorid_prob_matrix import prepare_matirx -from bibauthorid_scheduler import schedule \ - , matrix_coefs \ - , wedge_coefs +from bibauthorid_scheduler import schedule, matrix_coefs from bibauthorid_least_squares import to_function as create_approx_func ''' There are three main entry points to tortoise: i) tortoise Performs a disambiguation iteration. The argument pure indicates whether to use the claims and the rejections or not. Use pure=True only to test the accuracy of tortoise. ii) tortoise_from_scratch NOT RECOMMENDED! Use this function only if you have just installed invenio and this is your first disambiguation or if personid is broken. iii) tortoise_last_name Computes the clusters for only one last name group. It is primarily used for testing. It may also be used to fix a broken last name cluster. It does not involve multiprocessing so it is convenient to debug with pdb. ''' # Exit codes: # The standard ones are not well documented # so we are using random numbers.
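# Illustrative calls for the three entry points described above; the
# argument values are examples only:
#
#     tortoise_from_scratch()            # first installation or broken personid
#     tortoise(last_run=None)            # regular disambiguation iteration
#     tortoise_last_name('Ellis, J.')    # recompute a single last-name cluster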
def tortoise_from_scratch(): bibauthor_print("Preparing cluster sets.") cluster_sets, lnames, sizes = delayed_cluster_sets_from_marktables() bibauthor_print("Building all matrices.") exit_statuses = schedule_create_matrix( cluster_sets, sizes, force=True) assert len(exit_statuses) == len(cluster_sets) assert all(stat == os.EX_OK for stat in exit_statuses) empty_results_table() bibauthor_print("Preparing cluster sets.") cluster_sets, lnames, sizes = delayed_cluster_sets_from_marktables() bibauthor_print("Starting disambiguation.") exit_statuses = schedule_wedge_and_store( cluster_sets, sizes) assert len(exit_statuses) == len(cluster_sets) assert all(stat == os.EX_OK for stat in exit_statuses) def tortoise(pure=False, force_matrix_creation=False, skip_matrix_creation=False, last_run=None): assert not force_matrix_creation or not skip_matrix_creation # The computation must be forced in case we want # to compute pure results force_matrix_creation = force_matrix_creation or pure if not skip_matrix_creation: bibauthor_print("Preparing cluster sets.") clusters, lnames, sizes = delayed_cluster_sets_from_personid(pure, last_run) bibauthor_print("Building all matrices.") exit_statuses = schedule_create_matrix( clusters, sizes, force=force_matrix_creation) assert len(exit_statuses) == len(clusters) assert all(stat == os.EX_OK for stat in exit_statuses) bibauthor_print("Preparing cluster sets.") clusters, lnames, sizes = delayed_cluster_sets_from_personid(pure, last_run) bibauthor_print("Starting disambiguation.") exit_statuses = schedule_wedge_and_store( clusters, sizes) assert len(exit_statuses) == len(clusters) assert all(stat == os.EX_OK for stat in exit_statuses) def tortoise_last_name(name, from_mark=False, pure=False): assert not (from_mark and pure) lname = generate_last_name_cluster_str(name) if from_mark: clusters, lnames, sizes = delayed_cluster_sets_from_marktables() else: clusters, lnames, sizes = delayed_cluster_sets_from_personid(pure) try: idx = lnames.index(lname) cluster = clusters[idx] size = sizes[idx] bibauthor_print("Found %s (%s). Total number of bibs: %d." % (name, lname, size)) cluster_set = cluster() create_matrix(cluster_set, True) wedge_and_store(cluster_set) except ValueError: # lnames.index raises ValueError when lname is absent bibauthor_print("Sorry, %s (%s) not found in the last name clusters" % (name, lname)) def create_matrix(cluster_set, force): bibs = cluster_set.num_all_bibs expected = bibs * (bibs - 1) / 2 bibauthor_print("Start building matrix for %s. Total number of bibs: %d, " "maximum number of comparisons: %d" % (cluster_set.last_name, bibs, expected)) return prepare_matirx(cluster_set, force) def force_create_matrix(cluster_set, force): bibauthor_print("Building a cluster set.") return create_matrix(cluster_set(), force) def wedge_and_store(cluster_set): bibs = cluster_set.num_all_bibs expected = bibs * (bibs - 1) / 2 bibauthor_print("Start working on %s. 
Total number of bibs: %d, " "maximum number of comparisons: %d" % (cluster_set.last_name, bibs, expected)) wedge(cluster_set) remove_result_cluster(cluster_set.last_name) cluster_set.store() return True def force_wedge_and_store(cluster_set): bibauthor_print("Building a cluster set.") return wedge_and_store(cluster_set()) def schedule_create_matrix(cluster_sets, sizes, force): def create_job(cluster): def ret(): return force_create_matrix(cluster, force) return ret memfile_path = None if bconfig.DEBUG_PROCESS_PEAK_MEMORY: tt = datetime.now() tt = (tt.hour, tt.minute, tt.day, tt.month, tt.year) memfile_path = ('%smatrix_memory_%d:%d_%d-%d-%d.log' % ((bconfig.TORTOISE_FILES_PATH,) + tt)) return schedule(map(create_job, cluster_sets), sizes, create_approx_func(matrix_coefs), memfile_path) def schedule_wedge_and_store(cluster_sets, sizes): def create_job(cluster): def ret(): return force_wedge_and_store(cluster) return ret memfile_path = None if bconfig.DEBUG_PROCESS_PEAK_MEMORY: tt = datetime.now() tt = (tt.hour, tt.minute, tt.day, tt.month, tt.year) memfile_path = ('%swedge_memory_%d:%d_%d-%d-%d.log' % ((bconfig.TORTOISE_FILES_PATH,) + tt)) return schedule(map(create_job, cluster_sets), sizes, create_approx_func(matrix_coefs), memfile_path)
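# Usage sketch for the two schedulers above (illustrative only; it mirrors
# the flow of tortoise_from_scratch, where the delayed cluster sets are
# rebuilt between the two passes):
#
#     cluster_sets, lnames, sizes = delayed_cluster_sets_from_marktables()
#     statuses = schedule_create_matrix(cluster_sets, sizes, force=True)
#     assert all(stat == os.EX_OK for stat in statuses)
#
#     cluster_sets, lnames, sizes = delayed_cluster_sets_from_marktables()
#     statuses = schedule_wedge_and_store(cluster_sets, sizes)
#     assert all(stat == os.EX_OK for stat in statuses)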