Page MenuHomec4science

09_oacct_read_and_publish.py
No OneTemporary

File Metadata

Created
Thu, May 30, 01:05

09_oacct_read_and_publish.py

#!/usr/bin/env python
# coding: utf-8
# # Projet Open Access Compliance Check Tool (OACCT)
#
# Projet P5 de la bibliothèque de l'EPFL en collaboration avec les bibliothèques des Universités de Genève, Lausanne et Berne : https://www.swissuniversities.ch/themen/digitalisierung/p-5-wissenschaftliche-information/projekte/swiss-mooc-service-1-1-1-1
#
# Ce notebook permet de modifier les données extraites des différentes sources et les exporter dans les tables de l'application OACCT.
#
# Auteur : **Pablo Iriarte**, Université de Genève (pablo.iriarte@unige.ch)
# Date de dernière mise à jour : 08.09.2021
# In[1]:
import pandas as pd
import csv
import json
import numpy as np
import os
# show all DataFrame columns when displaying previews
pd.set_option('display.max_columns', None)
# starting value for generated ids
id_start = 1
# ## Ajout des rabais pour les revues des licences Read & Publish
#
# Journals list by publisher :
# * https://consortium.ch/elsevier_titlelist_publication
# * https://consortium.ch/springer_titlelist_publication
# * https://consortium.ch/wiley_titlelist_publish
# * https://consortium.ch/tandf_titlelist_publish
# * https://consortium.ch/sage_titlelist_publish
# * https://consortium.ch/cup_titlelist_publish
#
# Licence term :
# * Elsevier : 2020-2023
# * Springer Nature : 2020-2022
# * Wiley : 2021-2024
# * Taylor & Francis : 2021-2023
# * Cambridge University Press (CUP) : 2021-2023
#
# CC licences :
# * Elsevier : CC-BY, CC-BY-NC-ND
# * Springer Nature : CC-BY, CC-BY-NC
# * Wiley : CC-BY, CC-BY-NC, CC-BY-NC-ND
# * Taylor & Francis : CC-BY
# * Cambridge University Press (CUP) : CC-BY, CC-BY-NC, CC-BY-NC-ND, CC-BY-NC-SA
#
# Special conditions :
# * Cambridge University Press (CUP) : Only the following article types are covered: Research Articles, Review Articles, Rapid Communication, Brief Reports and Case Reports
#
#
# ## Import du fichier des issns
# In[2]:
# load the ISSN reference table (tab-separated, first row is the header)
issn = pd.read_csv('sample/issn.tsv', encoding='utf-8', header=0, sep='\t')
issn
# In[3]:
# load the publisher reference table
publisher = pd.read_csv('sample/publisher.tsv', encoding='utf-8', header=0, sep='\t')
publisher
# In[4]:
# preview the publisher rows for the five read & publish licence publishers
publisher.loc[publisher['name'] == 'Elsevier']
# In[5]:
publisher.loc[(publisher['name'] == 'Springer Verlag') | (publisher['name'] == 'Nature Research')]
# In[6]:
publisher.loc[publisher['name'] == 'Wiley']
# In[7]:
publisher.loc[publisher['name'] == 'Taylor and Francis']
# In[8]:
publisher.loc[publisher['name'] == 'Cambridge University Press']
# In[9]:
# load the list of consortium institutions participating in the agreements
participants = pd.read_csv('agreements/consortium_institutions_participation_read_and_publish.csv', encoding='utf-8', header=0, sep='\t')
participants
# In[10]:
# remove Lib4RI, which is a library (not an institution)
participants = participants.loc[participants['Institution'] != 'Lib4RI']
participants
# In[11]:
# mark TF and CUP as applying to every institution
# (TODO: obtain the participating library lists for these two licences)
participants['TF'] = 'x'
participants['CUP'] = 'x'
participants
# In[12]:
# load the Elsevier journal list (first 7 rows are header/metadata)
elsevier = pd.read_excel('agreements/Elsevier_titlelist_publication.xlsx', skiprows=7)
elsevier
# In[13]:
# add the article version field
elsevier['article_version'] = 'published'
elsevier
# In[14]:
# licence validity dates (Elsevier agreement: 2020-2023)
elsevier['valid_from'] = '2020-01-01'
elsevier['valid_until'] = '2023-12-31'
elsevier
# In[15]:
# no embargo for read & publish; archiving allowed
elsevier['embargo_months'] = 0
elsevier['archiving'] = True
elsevier
# In[16]:
elsevier.iloc[elsevier.shape[0]-1]
# In[17]:
# add the licence field: one copy of the journal list per allowed CC licence
# (cc_by, cc_by_nc_nd). DataFrame.append was deprecated in pandas 1.4 and
# removed in 2.0 -> use pd.concat instead.
rp = pd.DataFrame()
elsevier['license'] = 'cc_by'
elsevier['Elsevier'] = 'x'
rp = pd.concat([rp, elsevier], ignore_index=True)
elsevier['license'] = 'cc_by_nc_nd'
rp = pd.concat([rp, elsevier], ignore_index=True)
rp
# In[18]:
# load the Springer Nature journal list (first 7 rows are header/metadata)
springer = pd.read_excel('agreements/Springer_titlelist_publication.xlsx', skiprows=7)
springer
# In[19]:
# allowed CC licences: cc_by, cc_by_nc
springer['article_version'] = 'published'
springer['license'] = 'cc_by'
springer['Springer Nature'] = 'x'
# licence validity dates (Springer Nature agreement: 2020-2022)
springer['valid_from'] = '2020-01-01'
springer['valid_until'] = '2022-12-31'
# no embargo; archiving allowed
springer['embargo_months'] = 0
springer['archiving'] = True
# In[20]:
# one copy of the journal list per licence
# (DataFrame.append removed in pandas 2.0 -> pd.concat)
rp = pd.concat([rp, springer], ignore_index=True)
springer['license'] = 'cc_by_nc'
rp = pd.concat([rp, springer], ignore_index=True)
rp
# In[21]:
# load the Wiley journal list (first 7 rows are header/metadata)
wiley = pd.read_excel('agreements/Wiley_titlelist_publish.xlsx', skiprows=7)
wiley
# In[22]:
# allowed CC licences: cc_by, cc_by_nc, cc_by_nc_nd
wiley['article_version'] = 'published'
wiley['license'] = 'cc_by'
wiley['Wiley'] = 'x'
# licence validity dates (Wiley agreement: 2021-2024)
wiley['valid_from'] = '2021-01-01'
wiley['valid_until'] = '2024-12-31'
# no embargo; archiving allowed
wiley['embargo_months'] = 0
wiley['archiving'] = True
# one copy of the journal list per licence
# (DataFrame.append removed in pandas 2.0 -> pd.concat)
rp = pd.concat([rp, wiley], ignore_index=True)
wiley['license'] = 'cc_by_nc'
rp = pd.concat([rp, wiley], ignore_index=True)
wiley['license'] = 'cc_by_nc_nd'
rp = pd.concat([rp, wiley], ignore_index=True)
rp
# In[23]:
# load the Taylor & Francis journal list (first 7 rows are header/metadata)
tf = pd.read_excel('agreements/TandF_titlelist_publish.xlsx', skiprows=7)
tf
# In[24]:
# only cc_by is allowed for Taylor & Francis
tf['article_version'] = 'published'
tf['license'] = 'cc_by'
tf['TF'] = 'x'
# licence validity dates (Taylor & Francis agreement: 2021-2023)
tf['valid_from'] = '2021-01-01'
tf['valid_until'] = '2023-12-31'
# no embargo; archiving allowed
tf['embargo_months'] = 0
tf['archiving'] = True
# In[25]:
# append to the combined table
# (DataFrame.append removed in pandas 2.0 -> pd.concat)
rp = pd.concat([rp, tf], ignore_index=True)
rp
# In[26]:
# load the Cambridge University Press journal list (first 7 rows are header/metadata)
cup = pd.read_excel('agreements/CUP_Journals_titlelist_publish.xlsx', skiprows=7)
cup
# In[27]:
# normalise the ISSN column name
cup = cup.rename(columns = {'e-ISSN' : 'ISSN'})
cup
# In[28]:
# allowed CC licences: cc_by, cc_by_nc, cc_by_nc_nd, cc_by_nc_sa
cup['article_version'] = 'published'
cup['license'] = 'cc_by'
cup['CUP'] = 'x'
# licence validity dates (CUP agreement: 2021-2023)
cup['valid_from'] = '2021-01-01'
cup['valid_until'] = '2023-12-31'
# NOTE(review): 60-month embargo here, unlike the 0 used for the other
# read & publish licences - confirm this value against the CUP agreement
cup['embargo_months'] = 60
cup['archiving'] = True
# In[29]:
# one copy of the journal list per licence
# (DataFrame.append removed in pandas 2.0 -> pd.concat)
rp = pd.concat([rp, cup], ignore_index=True)
cup['license'] = 'cc_by_nc'
rp = pd.concat([rp, cup], ignore_index=True)
cup['license'] = 'cc_by_nc_nd'
rp = pd.concat([rp, cup], ignore_index=True)
cup['license'] = 'cc_by_nc_sa'
rp = pd.concat([rp, cup], ignore_index=True)
rp
# In[30]:
# check rows with a missing embargo value
rp.loc[rp['embargo_months'].isna()]
# In[31]:
# load the ISSN to ISSN-L mapping table
issnl = pd.read_csv('issn/20171102.ISSN-to-ISSN-L.txt', encoding='utf-8', header=0, sep='\t')
issnl
# In[32]:
# rename columns to the merge keys used below
issnl = issnl.rename(columns={'ISSN' : 'issn', 'ISSN-L' : 'issnl'})
rp = rp.rename(columns={'ISSN' : 'issn'})
# In[33]:
# add the ISSN-L to every journal row
rp = pd.merge(rp, issnl, on='issn', how='left')
rp
# In[34]:
# (disabled) earlier approach: accumulate both issn and issnl variants before the merge
# rp_1 = rp.loc[rp['issnl'].notna()][['issnl', 'article_version', 'license', 'Elsevier', 'Springer Nature', 'Wiley', 'TF', 'CUP']]
# rp_1 = rp_1.rename(columns = {'issnl' : 'issn'})
# rp_2 = rp.loc[rp['issn'].notna()][['issn', 'article_version', 'license', 'Elsevier', 'Springer Nature', 'Wiley', 'TF', 'CUP']]
# rp_all = rp_1.append(rp_2, ignore_index=True)
rp_all = rp
# In[35]:
# (disabled) extra fields: discount value (cost_factor_type id 2) at 100%
# for the read & publish licences
# elsevier['amount'] = 100
# elsevier['symbol'] = '%'
# elsevier['cost_factor_type'] = 2
# elsevier['comment'] = 'Source: swissuniversities'
# elsevier
# In[36]:
# pair every Elsevier journal row with every participating institution's ROR:
# the 'Elsevier' marker column holds the constant 'x' on both sides, so the
# outer merge on it acts as a cross join
participants_elsevier = participants.dropna(subset=['Elsevier'])[['Elsevier', 'ROR']]
rp_elsevier = rp_all.dropna(subset=['Elsevier'])
rp_1 = rp_elsevier.merge(participants_elsevier, on='Elsevier', how='outer')
rp_1
# In[37]:
rp_elsevier
# In[38]:
participants_elsevier
# In[39]:
# pair every Springer Nature journal row with every participating
# institution's ROR (cross join via the constant marker column)
participants_springer = participants.dropna(subset=['Springer Nature'])[['Springer Nature', 'ROR']]
rp_springer = rp_all.dropna(subset=['Springer Nature'])
rp_2 = rp_springer.merge(participants_springer, on='Springer Nature', how='outer')
rp_2
# In[40]:
# pair every Wiley journal row with every participating institution's ROR
# (cross join via the constant marker column)
participants_wiley = participants.dropna(subset=['Wiley'])[['Wiley', 'ROR']]
rp_wiley = rp_all.dropna(subset=['Wiley'])
rp_3 = rp_wiley.merge(participants_wiley, on='Wiley', how='outer')
rp_3
# In[41]:
rp_wiley
# In[42]:
# pair every Taylor & Francis journal row with every participating
# institution's ROR (cross join via the constant marker column)
participants_tf = participants.dropna(subset=['TF'])[['TF', 'ROR']]
rp_tf = rp_all.dropna(subset=['TF'])
rp_4 = rp_tf.merge(participants_tf, on='TF', how='outer')
rp_4
# In[43]:
# pair every CUP journal row with every participating institution's ROR
# (cross join via the constant marker column)
participants_cup = participants.dropna(subset=['CUP'])[['CUP', 'ROR']]
rp_cup = rp_all.dropna(subset=['CUP'])
rp_5 = rp_cup.merge(participants_cup, on='CUP', how='outer')
rp_5
# In[44]:
# concatenate the five publisher tables into one
# (DataFrame.append removed in pandas 2.0 -> single pd.concat call)
rp_fin = pd.concat([rp_1, rp_2, rp_3, rp_4, rp_5], ignore_index=True)
rp_fin
# In[45]:
# drop rows without an ISSN and duplicate (issn, license, ROR) combinations
rp_fin = rp_fin.dropna(subset=['issn'])
rp_fin = rp_fin.drop_duplicates(subset=['issn', 'license', 'ROR'])
rp_fin
# In[46]:
# re-number the rows: the first reset_index discards the old labels; the
# second deliberately leaves an 'index' column (0..n-1) that a later cell
# deletes after the issn merge - do not simplify to reset_index(drop=True)
rp_fin = rp_fin.reset_index()
del rp_fin['index']
rp_fin = rp_fin.reset_index()
rp_fin['rp_id'] = rp_fin.index + 1
rp_fin
# In[47]:
rp_fin['embargo_months'].value_counts()
# In[48]:
# check rows with a missing embargo value
rp_fin.loc[rp_fin['embargo_months'].isna()]
# In[49]:
issn
# In[50]:
# merge pour avoir l'issnl
issn = pd.merge(issn, issnl, on='issn', how='left')
issn
# In[51]:
issn.loc[issn['issnl'].isna()]
# In[52]:
# merge dans l'autre sens pour garder que les lignes du fichier
rp_fin = pd.merge(rp_fin, issn[['id', 'journal', 'issnl']], on='issnl', how='left')
rp_fin
# In[53]:
# test des lignes sans embargo
rp_fin.loc[rp_fin['embargo_months'].isna() & rp_fin['id'].notna()]
# In[54]:
# keep only the rows that matched a journal in the issn table
rp_fin_merge = rp_fin.loc[rp_fin['id'].notna()]
rp_fin_merge
# In[55]:
# drop duplicate agreement rows
rp_fin_merge = rp_fin_merge.drop_duplicates(subset=['rp_id'])
rp_fin_merge
# In[56]:
# check rows without a journal title
rp_fin_merge.loc[rp_fin_merge['journal'].isna()]
# In[57]:
# drop the helper columns and rebuild rp_id as the sequential row
# position + 1. The original notebook did this in two passes: the first
# computed rp_id from the pre-filter row labels (not contiguous after the
# .loc filter), the second redid it from a fresh RangeIndex. A single
# reset_index(drop=True) yields the same final ids and column order.
del rp_fin_merge['id']
del rp_fin_merge['index']
del rp_fin_merge['rp_id']
rp_fin_merge = rp_fin_merge.reset_index(drop=True)
rp_fin_merge['rp_id'] = rp_fin_merge.index + 1
rp_fin_merge
# In[59]:
rp_fin_merge['embargo_months'].value_counts()
# In[60]:
# check rows with a missing embargo value
rp_fin_merge.loc[rp_fin_merge['embargo_months'].isna()]
# In[61]:
# export to Excel
rp_fin_merge.to_excel('sample/read_publish_brut_merge.xlsx', index=False)
# In[62]:
# export to CSV
rp_fin_merge.to_csv('sample/read_publish_brut_merge.tsv', sep='\t', index=False)

Event Timeline