Page MenuHomec4science

06_oacct_sherpa.py
No OneTemporary

File Metadata

Created
Wed, Nov 6, 23:32

06_oacct_sherpa.py

#!/usr/bin/env python
# coding: utf-8
# # Projet Open Access Compliance Check Tool (OACCT)
#
# Projet P5 de la bibliothèque de l'EPFL en collaboration avec les bibliothèques des Universités de Genève, Lausanne et Berne : https://www.swissuniversities.ch/themen/digitalisierung/p-5-wissenschaftliche-information/projekte/swiss-mooc-service-1-1-1-1
#
# Ce notebook permet d'extraire les données de Sherpa/Romeo obtenues par API et les traiter pour les rendre exploitables dans l'application OACCT.
#
# Auteur : **Pablo Iriarte**, Université de Genève (pablo.iriarte@unige.ch)
# Date de dernière mise à jour : 16.07.2021
# ## Données de Sherpa/Romeo
#
# ### Exemple
#
# https://v2.sherpa.ac.uk/cgi/retrieve_by_id?item-type=publication&api-key=EEE6F146-678E-11EB-9C3A-202F3DE2659A&format=Json&identifier=17601
# In[1]:
import pandas as pd
import csv
import json
import numpy as np
import os
# show all columns when displaying DataFrames in the notebook
pd.set_option('display.max_columns', None)
# ## Table publisher_sherpa
# In[2]:
# Empty skeleton for the publisher_sherpa table: one row per journal with
# the publisher data extracted from the Sherpa/Romeo JSON files.
col_names = ['journal', 'publisher_id', 'name', 'country', 'type', 'url']
publisher_sherpa = pd.DataFrame(columns=col_names)
publisher_sherpa
# ## Table sherpa match issn
# In[3]:
# Empty skeleton for the per-ISSN match-status table
# ('OK' / 'empty' / 'missing', filled by the extraction loop below).
col_names = ['issn', 'sherpa_match']
sherpa_match_issn = pd.DataFrame(columns=col_names)
sherpa_match_issn
# ## Table sherpa issns
# In[4]:
# Empty skeleton for the (issn, type) pairs found in the Sherpa records.
col_names = ['issn', 'type']
sherpa_issn = pd.DataFrame(columns=col_names)
sherpa_issn
# ## Table sherpa journals
# In[5]:
# Empty skeleton for the journal-level data (title, URL) from Sherpa.
col_names = ['journal', 'title', 'url']
sherpa_journal = pd.DataFrame(columns=col_names)
sherpa_journal
# ## Import of the Journals and ISSN tables
# In[6]:
# raw journals + publishers table
journal = pd.read_csv('sample/journals_publishers_brut.tsv', encoding='utf-8', header=0, sep='\t')
journal
# In[7]:
# raw ISSN table: one row per (journal, issn) pair
issn = pd.read_csv('sample/issn_brut.tsv', encoding='utf-8', header=0, sep='\t')
issn
# In[8]:
# ISSN identifiers table (presumably maps each issn to its issnl, per the merges below — TODO confirm)
issn_ids = pd.read_csv('sample/issn_ids.tsv', encoding='utf-8', header=0, sep='\t')
issn_ids
# ## Extraction from Sherpa/Romeo
# In[9]:
# Extract the publisher information from the per-ISSN Sherpa/Romeo JSON
# dumps.  For every ISSN we also record whether the lookup succeeded
# ('OK'), returned an empty record ('empty') or no file exists ('missing').
#
# Rows are accumulated in plain lists and turned into DataFrames once at
# the end: DataFrame.append was removed in pandas 2.0 and was quadratic.
publisher_rows = []
match_rows = []
for index, row in issn.iterrows():
    journal_id = row['journal']
    journal_issn = row['issn']
    # defaults used when the JSON does not provide a field
    publisher_id = np.nan
    publisher_name = ''
    publisher_country = ''
    publisher_type = ''
    publisher_url = ''
    json_path = 'sherpa/data/' + journal_issn + '.json'
    if os.path.exists(json_path):
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if len(data['items']) > 0:
            # only the first publisher of the first item is kept,
            # as in the original notebook
            publisher = data['items'][0]['publishers'][0]
            publisher_id = publisher['publisher']['id']
            if 'country' in publisher['publisher']:
                publisher_country = publisher['publisher']['country']
            if 'relationship_type' in publisher:
                publisher_type = publisher['relationship_type']
            if 'url' in publisher['publisher']:
                publisher_url = publisher['publisher']['url']
            if 'name' in publisher['publisher']['name'][0]:
                publisher_name = publisher['publisher']['name'][0]['name']
            sherpa_match = 'OK'
            publisher_rows.append({'journal': journal_id,
                                   'publisher_id': publisher_id,
                                   'name': publisher_name,
                                   'country': publisher_country,
                                   'type': publisher_type,
                                   'url': publisher_url})
        else:
            print(row['issn'] + ' - trouvé mais vide')
            sherpa_match = 'empty'
    else:
        print(row['issn'] + ' - pas trouvé')
        sherpa_match = 'missing'
    match_rows.append({'issn': row['issn'], 'sherpa_match': sherpa_match})
# materialise the tables (column order given by the empty skeletons above)
publisher_sherpa = pd.DataFrame(publisher_rows, columns=publisher_sherpa.columns)
sherpa_match_issn = pd.DataFrame(match_rows, columns=sherpa_match_issn.columns)
# In[10]:
publisher_sherpa
# In[11]:
sherpa_match_issn
# In[12]:
# drop exact duplicate publisher rows (several ISSNs of the same journal
# point to the same publisher record)
publisher_sherpa_dedup = publisher_sherpa.drop_duplicates()
publisher_sherpa_dedup
# In[13]:
sherpa_match_issn
# In[14]:
# add the ISSN-L and the journal title to the match table
sherpa_match_issn = pd.merge(sherpa_match_issn, issn_ids, on='issn', how='left')
sherpa_match_issn = pd.merge(sherpa_match_issn, journal[['issnl', 'title']], on='issnl', how='left')
sherpa_match_issn
# In[15]:
# count, per ISSN-L, how many ISSNs fall into each match category
sherpa_match_results = sherpa_match_issn[['id', 'issnl', 'sherpa_match']].groupby(['issnl', 'sherpa_match']).count()
sherpa_match_results
# In[16]:
sherpa_match_results = sherpa_match_results.reset_index()
sherpa_match_results
# In[17]:
# journals with at least one successful Sherpa match
sherpa_match_results_ok = sherpa_match_results.loc[sherpa_match_results['sherpa_match'] == 'OK']
issn_ids_issnl = issn_ids[['issnl', 'journal']].drop_duplicates(subset='issnl')
issn_ids_issnl = pd.merge(issn_ids_issnl, sherpa_match_results_ok, on='issnl', how='left')
issn_ids_issnl = pd.merge(issn_ids_issnl, journal[['issnl', 'title']], on='issnl', how='left')
issn_ids_issnl
# In[18]:
# journals that never matched ('sherpa_match' is NaN after the left merge)
journals_not_sherpa = issn_ids_issnl.loc[issn_ids_issnl['sherpa_match'].isna()]
journals_not_sherpa
# In[19]:
# attach the 'empty' and 'missing' counts to the non-matching journals
sherpa_match_results_empty = sherpa_match_results.loc[sherpa_match_results['sherpa_match'] == 'empty']
sherpa_match_results_missing = sherpa_match_results.loc[sherpa_match_results['sherpa_match'] == 'missing']
del journals_not_sherpa['sherpa_match']
del journals_not_sherpa['id']
journals_not_sherpa = pd.merge(journals_not_sherpa, sherpa_match_results_empty, on='issnl', how='left')
del journals_not_sherpa['id']
journals_not_sherpa = pd.merge(journals_not_sherpa, sherpa_match_results_missing, on='issnl', how='left')
del journals_not_sherpa['id']
journals_not_sherpa
# In[20]:
# Extract journal-level information (title, URL) from the Sherpa/Romeo
# JSON files: one row per ISSN whose JSON file exists (title/url stay NaN
# when the record is empty).
# DataFrame.append was removed in pandas 2.0, so rows are collected in a
# list and the table is built once at the end.
journal_rows = []
for index, row in issn.iterrows():
    journal_id = row['journal']
    journal_issn = row['issn']
    # lightweight progress indicator, one line every 10 ISSNs
    if index % 10 == 0:
        print(index)
    json_path = 'sherpa/data/' + journal_issn + '.json'
    if os.path.exists(json_path):
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        title = np.nan
        url = np.nan
        if len(data['items']) > 0:
            if 'url' in data['items'][0]:
                url = data['items'][0]['url']
            if 'title' in data['items'][0]['title'][0]:
                title = data['items'][0]['title'][0]['title']
        journal_rows.append({'journal': journal_id, 'title': title, 'url': url})
sherpa_journal = pd.DataFrame(journal_rows, columns=sherpa_journal.columns)
# In[21]:
sherpa_journal
# In[22]:
# Extract every (issn, type) pair listed in the Sherpa/Romeo records.
# Fixes two defects of the original cell:
#  - DataFrame.append (removed in pandas 2.0) replaced by a row list;
#  - myissn/mytype were initialised once per file, so an entry missing a
#    key silently inherited the value of the PREVIOUS entry; .get() now
#    falls back to NaN per entry instead.
issn_rows = []
for index, row in issn.iterrows():
    journal_id = row['journal']
    journal_issn = row['issn']
    # progress indicator, one line every 10 ISSNs
    if index % 10 == 0:
        print(index)
    json_path = 'sherpa/data/' + journal_issn + '.json'
    if os.path.exists(json_path):
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if len(data['items']) > 0:
            if 'issns' in data['items'][0]:
                # one row per ISSN present in the record
                for issn_entry in data['items'][0]['issns']:
                    issn_rows.append({'issn': issn_entry.get('issn', np.nan),
                                      'type': issn_entry.get('type', np.nan)})
sherpa_issn = pd.DataFrame(issn_rows, columns=sherpa_issn.columns)
# In[23]:
sherpa_issn
# In[24]:
# drop duplicated (issn, type) pairs
sherpa_issn = sherpa_issn.drop_duplicates()
sherpa_issn
# In[25]:
# enrich the ISSN table with the ISSN types coming from Sherpa
issn2 = pd.merge(issn, sherpa_issn, on='issn', how='left')
issn2
# In[26]:
# CSV exports (tab-separated)
publisher_sherpa_dedup.to_csv('sample/publisher_sherpa.tsv', sep='\t', encoding='utf-8', index=False)
sherpa_match_issn.to_csv('sample/sherpa_match_issn.tsv', sep='\t', encoding='utf-8', index=False)
sherpa_journal.to_csv('sample/sherpa_journal.tsv', sep='\t', encoding='utf-8', index=False)
issn2.to_csv('sample/issn_sherpa.tsv', sep='\t', encoding='utf-8', index=False)
journals_not_sherpa.to_csv('sample/journals_not_sherpa.tsv', sep='\t', encoding='utf-8', index=False)
# In[27]:
# Excel exports (same tables, for manual inspection)
publisher_sherpa_dedup.to_excel('sample/publisher_sherpa.xlsx', index=False)
sherpa_match_issn.to_excel('sample/sherpa_match_issn.xlsx', index=False)
sherpa_journal.to_excel('sample/sherpa_journal.xlsx', index=False)
issn2.to_excel('sample/issn_sherpa.xlsx', index=False)
journals_not_sherpa.to_excel('sample/journals_not_sherpa.xlsx', index=False)
# In[28]:
# add the Sherpa titles to the journals table
# rename the key column so it matches journal['id'] for the merge
sherpa_journal = sherpa_journal.rename(columns={'journal' : 'id'})
journal = pd.merge(journal, sherpa_journal, on='id', how='left')
journal
# In[29]:
# prefer the Sherpa value (suffix _y from the merge) and fall back to the
# original value (suffix _x) when Sherpa has none
journal['url'] = journal['url_y']
journal.loc[journal['url_y'].isna(), 'url'] = journal['url_x']
journal['title'] = journal['title_y']
journal.loc[journal['title_y'].isna(), 'title'] = journal['title_x']
journal
# In[30]:
# Final journal export: keep only the columns used by the OACCT application.
# .copy() makes an explicit copy so the fillna/astype assignments below do
# not hit pandas' chained-assignment warning (the selection is otherwise a
# derived frame whose in-place edits are unreliable).
journals_export = journal[['id', 'title', 'name_short_iso_4', 'starting_year', 'end_year', 'url', 'country', 'language', 'oa_status', 'publisher', 'doaj_seal', 'doaj_status', 'lockss', 'portico', 'nlch', 'qoam_av_score']].copy()
journals_export
# In[31]:
# rename to the final application field names
journals_export = journals_export.rename(columns={'title' : 'name', 'url' : 'website'})
# replace missing values and cast the flags to int
journals_export['starting_year'] = journals_export['starting_year'].fillna(0)
journals_export['end_year'] = journals_export['end_year'].fillna(9999)  # 9999 presumably = still published — TODO confirm
journals_export['name_short_iso_4'] = journals_export['name_short_iso_4'].fillna('')
journals_export['website'] = journals_export['website'].fillna('')
journals_export['doaj_seal'] = journals_export['doaj_seal'].fillna('0')
journals_export['country'] = journals_export['country'].fillna('999999')   # 999999 presumably = unknown code — TODO confirm
journals_export['language'] = journals_export['language'].fillna('999999')
journals_export['doaj_status'] = journals_export['doaj_status'].astype(int)
journals_export['doaj_seal'] = journals_export['doaj_seal'].astype(int)
journals_export['lockss'] = journals_export['lockss'].astype(int)
journals_export['portico'] = journals_export['portico'].astype(int)
journals_export['nlch'] = journals_export['nlch'].astype(int)
journals_export
# In[32]:
# one row per journal id
journals_export = journals_export.drop_duplicates(subset='id')
journals_export
# In[33]:
# check: journals without a title
journals_export.loc[journals_export['name'].isna()]
# In[34]:
# export, then drop, the journals without a title
# CSV export
journals_export.loc[journals_export['name'].isna()].to_csv('sample/sherpa_journals_without_title.tsv', sep='\t', encoding='utf-8', index=False)
# Excel export
journals_export.loc[journals_export['name'].isna()].to_excel('sample/sherpa_journals_without_title.xlsx', index=False)
journals_export = journals_export.loc[journals_export['name'].notna()]
journals_export
# In[35]:
# Journals whose title still contains the literal string '(Print)'.
# regex=False is required: with the default regex engine the parentheses
# form a capturing group, so '(Print)' would match the bare word 'Print'
# and str.replace would strip only 'Print', leaving empty '()' in titles.
journals_export.loc[journals_export['name'].str.contains('(Print)', regex=False)]
# In[36]:
journals_export.loc[journals_export['name'].str.contains('(Online)', regex=False)]
# In[37]:
# remove the literal '(Print)' / '(Online)' mentions from the titles
journals_export['name'] = journals_export['name'].str.replace('(Print)', '', regex=False)
journals_export['name'] = journals_export['name'].str.replace('(Online)', '', regex=False)
journals_export
# In[38]:
# verify nothing remains
journals_export.loc[journals_export['name'].str.contains('(Print)', regex=False)]
# In[39]:
journals_export.loc[journals_export['name'].str.contains('(Online)', regex=False)]
# ## Table sherpa_policies
# In[40]:
# Empty skeleton for the sherpa_policies table: one row per
# (journal, policy, article version, license) combination, filled by the
# extraction loop below.
col_names = [
    'journal', 'issn', 'sherpa_id', 'sherpa_uri',
    'open_access_prohibited', 'additional_oa_fee', 'article_version',
    'license', 'embargo', 'prerequisites', 'prerequisite_funders',
    'prerequisite_funders_name', 'prerequisite_funders_fundref',
    'prerequisite_funders_ror', 'prerequisite_funders_country',
    'prerequisite_funders_url', 'prerequisite_funders_sherpa_id',
    'prerequisite_subjects', 'location', 'locations_ir', 'locations_not_ir',
    'named_repository', 'named_academic_social_network', 'copyright_owner',
    'publisher_deposit', 'archiving', 'conditions', 'public_notes',
]
sherpa_policies = pd.DataFrame(columns=col_names)
sherpa_policies
# In[41]:
# keep a single ISSN per journal id (the policies are journal-level)
issn_dedup = issn.drop_duplicates(subset='journal')
issn_dedup
# In[42]:
# Location types that count as "archiving allowed" (used to derive the
# boolean archiving flag).  The full list of possible types is:
# 'academic_social_network', 'any_repository', 'any_website', 'authors_homepage',
# 'funder_designated_location', 'institutional_repository', 'institutional_website',
# 'named_academic_social_network', 'named_repository',
# 'non_commercial_institutional_repository', 'non_commercial_repository',
# 'non_commercial_social_network', 'non_commercial_subject_repository',
# 'non_commercial_website', 'preprint_repository', 'subject_repository', 'this_journal'
repositories_archiving = ['any_repository',
                          'institutional_repository',
                          'institutional_website',
                          'non_commercial_institutional_repository',
                          'non_commercial_repository',
                          'any_website',
                          'non_commercial_website']


def _policy_row():
    """Snapshot the current extraction variables as one sherpa_policies row.

    Reads the module-level variables set by the extraction loop below; this
    replaces the six byte-identical dict literals of the original notebook.
    """
    return {'journal': journal_id,
            'issn': journal_issn,
            'sherpa_id': sherpa_id,
            'sherpa_uri': sherpa_uri,
            'open_access_prohibited': open_access_prohibited,
            'additional_oa_fee': additional_oa_fee,
            'article_version': article_version,
            'license': mylicense,
            'embargo': embargo,
            'prerequisites': prerequisites,
            'prerequisite_funders': prerequisite_funders,
            'prerequisite_funders_name': prerequisite_funders_name,
            'prerequisite_funders_fundref': prerequisite_funders_fundref,
            'prerequisite_funders_ror': prerequisite_funders_ror,
            'prerequisite_funders_country': prerequisite_funders_country,
            'prerequisite_funders_url': prerequisite_funders_url,
            'prerequisite_funders_sherpa_id': prerequisite_funders_sherpa_id,
            'prerequisite_subjects': prerequisite_subjects,
            'location': location,
            'locations_ir': locations_ir,
            'locations_not_ir': locations_not_ir,
            'named_repository': named_repository,
            'named_academic_social_network': named_academic_social_network,
            'copyright_owner': copyright_owner,
            'publisher_deposit': publisher_deposit,
            'archiving': archiving,
            'conditions': conditions,
            'public_notes': public_notes}


# Walk the Sherpa/Romeo JSON of every journal and emit one policy row per
# (publisher policy, permitted_oa entry, article version, license)
# combination; when a permission is restricted to specific funders, one
# row is emitted per funder instead.
#
# Rows are accumulated in a list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
policy_rows = []
# Sherpa record ids already processed.  The original dedup test
# (`sherpa_id in sherpa_policies['sherpa_id']`) compared against the Series
# *index*, not its values, so the intended skip never triggered reliably.
seen_sherpa_ids = set()
for index, row in issn_dedup.iterrows():
    journal_id = row['journal']
    journal_issn = row['issn']
    # progress indicator, one line every 10 journals
    if index % 10 == 0:
        print(index)
    json_path = 'sherpa/data/' + journal_issn + '.json'
    if not os.path.exists(json_path):
        continue
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    if len(data['items']) == 0:
        continue
    item = data['items'][0]
    if 'id' not in item:
        print('id MISSING')
        continue
    sherpa_id = item['id']
    if sherpa_id in seen_sherpa_ids:
        print('SKIP ' + str(sherpa_id))
        continue
    seen_sherpa_ids.add(sherpa_id)
    # These three fields deliberately carry over between permitted_oa
    # entries of the same record (they are only reset when the entry
    # provides them), as in the original notebook.
    location = np.nan
    archiving = np.nan
    publisher_deposit = np.nan
    for policy in item['publisher_policy']:
        sherpa_uri = policy.get('uri', np.nan)
        open_access_prohibited = policy.get('open_access_prohibited', np.nan)
        if 'permitted_oa' not in policy:
            print('permitted_oa MISSING')
            continue
        for poa in policy['permitted_oa']:
            # per-permission defaults
            additional_oa_fee = np.nan
            article_version = np.nan
            licenses = []
            embargo = 0
            prerequisites = np.nan
            prerequisite_funders = np.nan
            prerequisite_funders_name = np.nan
            prerequisite_funders_fundref = np.nan
            prerequisite_funders_ror = np.nan
            prerequisite_funders_country = np.nan
            prerequisite_funders_url = np.nan
            prerequisite_funders_sherpa_id = np.nan
            prerequisite_subjects = np.nan
            named_repository = np.nan
            named_academic_social_network = np.nan
            locations_ir = ''
            locations_not_ir = ''
            copyright_owner = np.nan
            conditions = np.nan
            public_notes = np.nan
            if 'additional_oa_fee' in poa:
                additional_oa_fee = poa['additional_oa_fee']
            if 'location' in poa:
                archiving = 0
                mylocations = poa['location']['location']
                mylocations_text = poa['location']['location_phrases']
                if type(mylocations) is not list:
                    mylocations = [mylocations]
                location = ' ; '.join(mylocations)
                # split the human-readable phrases into repository-like
                # locations (locations_ir) and the rest (locations_not_ir),
                # without duplicating a phrase
                for loc in mylocations:
                    if loc in repositories_archiving:
                        archiving = archiving + 1
                        for loc_text in mylocations_text:
                            if loc_text['value'] == loc:
                                if locations_ir == '':
                                    locations_ir = loc_text['phrase']
                                elif loc_text['phrase'] not in locations_ir:
                                    locations_ir = locations_ir + ' ; ' + loc_text['phrase']
                    else:
                        for loc_text in mylocations_text:
                            if loc_text['value'] == loc:
                                if locations_not_ir == '':
                                    locations_not_ir = loc_text['phrase']
                                elif loc_text['phrase'] not in locations_not_ir:
                                    locations_not_ir = locations_not_ir + ' ; ' + loc_text['phrase']
                # collapse the counter into a boolean flag
                archiving = archiving > 0
                if 'named_repository' in poa['location']:
                    named = poa['location']['named_repository']
                    named_repository = ' ; '.join(named) if type(named) is list else named
                    locations_not_ir = locations_not_ir.replace('Named Repository', named_repository)
                    locations_ir = locations_ir.replace('Named Repository', named_repository)
                if 'named_academic_social_network' in poa['location']:
                    named = poa['location']['named_academic_social_network']
                    named_academic_social_network = ' ; '.join(named) if type(named) is list else named
                    locations_not_ir = locations_not_ir.replace('Named Academic Social Network', named_academic_social_network)
                    locations_ir = locations_ir.replace('Named Academic Social Network', named_academic_social_network)
            if 'embargo' in poa:
                # normalise the embargo to months; partial weeks/days round
                # down but never below one month
                embargo_amount = poa['embargo'].get('amount', 0)
                if 'units' in poa['embargo']:
                    units = poa['embargo']['units']
                    if units == 'months':
                        embargo = embargo_amount
                    elif units == 'years':
                        embargo = embargo_amount * 12
                    elif units == 'weeks':
                        embargo = int(embargo_amount / 4) or 1
                    elif units == 'days':
                        embargo = int(embargo_amount / 30) or 1
                    else:
                        # unknown unit: keep the raw amount
                        embargo = embargo_amount
            if 'prerequisites' in poa:
                prereq = poa['prerequisites']
                if 'prerequisites' in prereq:
                    if type(prereq['prerequisites']) is list:
                        prerequisites = ' ; '.join(prereq['prerequisites'])
                    else:
                        prerequisites = prereq['prerequisites']
                if 'prerequisite_funders' in prereq:
                    prerequisite_funders = True
                if 'prerequisite_subjects' in prereq:
                    prerequisite_subjects = True
            if 'copyright_owner' in poa:
                copyright_owner = poa['copyright_owner']
            if 'publisher_deposit' in poa:
                # build a "type (name) ; type (name) ; ..." summary string
                publisher_deposit = ''
                deposits = poa['publisher_deposit']
                if type(deposits) is not list:
                    deposits = [deposits]
                for deposit in deposits:
                    meta = deposit['repository_metadata']
                    if 'type' in meta:
                        publisher_deposit = publisher_deposit + meta['type']
                        if 'name' in meta:
                            publisher_deposit = publisher_deposit + ' (' + meta['name'][0]['name'] + ')'
                    elif 'name' in meta:
                        publisher_deposit = publisher_deposit + meta['name'][0]['name']
                    publisher_deposit = publisher_deposit + ' ; '
            if 'conditions' in poa:
                if type(poa['conditions']) is list:
                    conditions = ' ; '.join(poa['conditions'])
                else:
                    conditions = poa['conditions']
            if 'public_notes' in poa:
                if type(poa['public_notes']) is list:
                    public_notes = ' ; '.join(poa['public_notes'])
                else:
                    public_notes = poa['public_notes']
            if 'license' in poa:
                licenses = poa['license']
                if type(licenses) is not list:
                    licenses = [licenses]
            else:
                # sentinel so the license loop below runs exactly once
                licenses = ['']
            # one row per (article version, license); article_version stays
            # NaN when the record gives none
            article_versions = poa['article_version'] if 'article_version' in poa else [article_version]
            for article_version in article_versions:
                for license_entry in licenses:
                    if 'license' in license_entry:
                        mylicense = license_entry['license']
                    else:
                        mylicense = ''
                    if 'prerequisites' in poa and 'prerequisite_funders' in poa['prerequisites']:
                        # funder-specific permission: one row per funder
                        for funder in poa['prerequisites']['prerequisite_funders']:
                            fmeta = funder['funder_metadata']
                            prerequisite_funders_name = fmeta['name'][0]['name']
                            if 'acronym' in fmeta['name'][0]:
                                prerequisite_funders_name = prerequisite_funders_name + ' (' + fmeta['name'][0]['acronym'] + ')'
                            if 'identifiers' in fmeta:
                                for fund_identifier in fmeta['identifiers']:
                                    if fund_identifier['type'] == 'fundref':
                                        prerequisite_funders_fundref = fund_identifier['identifier']
                                    if fund_identifier['type'] == 'ror':
                                        prerequisite_funders_ror = fund_identifier['identifier']
                            if 'country' in fmeta:
                                prerequisite_funders_country = fmeta['country']
                            if 'url' in fmeta:
                                prerequisite_funders_url = fmeta['url'][0]['url']
                            prerequisite_funders_sherpa_id = fmeta['id']
                            policy_rows.append(_policy_row())
                    else:
                        policy_rows.append(_policy_row())
# materialise the policies table (column order given by the empty skeleton)
sherpa_policies = pd.DataFrame(policy_rows, columns=sherpa_policies.columns)
# In[43]:
sherpa_policies
# In[44]:
# turn the positional index into a 1-based 'id' column
sherpa_policies = sherpa_policies.reset_index()
# pop() removes the helper 'index' column and returns it in one step
sherpa_policies['id'] = sherpa_policies.pop('index') + 1
sherpa_policies
# In[45]:
# CSV export of the raw policies table
sherpa_policies.to_csv('sample/sherpa_policies_brut.tsv', sep='\t', encoding='utf-8', index=False)
# In[46]:
# Excel export of the same table
sherpa_policies.to_excel('sample/sherpa_policies_brut.xlsx', index=False)
# ## Computation of the "green" category and final export of the journals
# In[47]:
sherpa_policies
# In[48]:
# policies that allow archiving of the published version in a repository,
# without funder-specific prerequisites
sherpa_policies_ir = sherpa_policies.loc[(sherpa_policies['archiving'] == True) & (sherpa_policies['article_version'] == 'published') & (sherpa_policies['prerequisite_funders'].isna())][['journal', 'embargo', 'license', 'conditions']]
sherpa_policies_ir
# In[49]:
# keep, per journal, the policy with the shortest embargo
sherpa_policies_ir_id = sherpa_policies_ir[['journal', 'embargo']].sort_values(by=['journal', 'embargo'])
sherpa_policies_ir_dedup = sherpa_policies_ir_id.drop_duplicates(subset='journal')
sherpa_policies_ir_dedup
# In[50]:
# tag these journals with the "green" OA category (code 2)
sherpa_policies_ir_dedup['oa_status'] = 2
sherpa_policies_ir_dedup
# In[51]:
# merge the green flag into the journals table
sherpa_policies_ir_dedup = sherpa_policies_ir_dedup.rename(columns={'journal' : 'id'})
journals_export = pd.merge(journals_export, sherpa_policies_ir_dedup, on='id', how='left')
journals_export
# In[52]:
# choose the OA category: journals still UNKNOWN (1) are upgraded to the
# Sherpa-derived green status when one exists (suffix _y from the merge)
journals_export['oa_status'] = journals_export['oa_status_x']
journals_export.loc[(journals_export['oa_status_x'] == 1) & (journals_export['oa_status_y'].notna()), 'oa_status'] = journals_export['oa_status_y']
journals_export
# In[53]:
# OA status codes used by the application:
# 6 : Diamond
# 5 : Gold
# 4 : Full
# 3 : Hybrid
# 2 : Green
# 1 : UNKNOWN
journals_export['oa_status'].value_counts()
# In[54]:
# drop the helper columns left over from the merge
del journals_export['embargo']
del journals_export['oa_status_x']
del journals_export['oa_status_y']
journals_export
# In[55]:
# the status code is exported as an integer
journals_export['oa_status'] = journals_export['oa_status'].astype(int)
journals_export
# In[56]:
# final CSV export of the journals table
journals_export.to_csv('sample/journal_fin_sherpa.tsv', sep='\t', encoding='utf-8', index=False)
# In[57]:
# Excel export
journals_export.to_excel('sample/journal_fin_sherpa.xlsx', index=False)
# In[58]:
# CSV export of the per-journal "green" policy (shortest embargo)
sherpa_policies_ir_dedup.to_csv('sample/journal_ir.tsv', sep='\t', encoding='utf-8', index=False)
# In[59]:
# Excel export
sherpa_policies_ir_dedup.to_excel('sample/journal_ir.xlsx', index=False)
# In[ ]:

Event Timeline