Page MenuHomec4science

09_oacct_read_and_publish.py
No OneTemporary

File Metadata

Created
Thu, May 30, 01:05

09_oacct_read_and_publish.py

#!/usr/bin/env python
# coding: utf-8
# # Projet Open Access Compliance Check Tool (OACCT)
#
# Projet P5 de la bibliothèque de l'EPFL en collaboration avec les bibliothèques des Universités de Genève, Lausanne et Berne : https://www.swissuniversities.ch/themen/digitalisierung/p-5-wissenschaftliche-information/projekte/swiss-mooc-service-1-1-1-1
#
# Ce notebook permet de modifier les données extraites des différentes sources et les exporter dans les tables de l'application OACCT.
#
# Auteur : **Pablo Iriarte**, Université de Genève (pablo.iriarte@unige.ch)
# Date de dernière mise à jour : 08.09.2021
# In[1]:
import pandas as pd
import csv
import json
import numpy as np
import os
# show all DataFrame columns when displaying previews
pd.set_option('display.max_columns', None)
# starting value for generated ids
id_start = 1
# ## Ajout des rabais pour les revues des licences Read & Publish
#
# Journals list by publisher :
# * https://consortium.ch/elsevier_titlelist_publication
# * https://consortium.ch/springer_titlelist_publication
# * https://consortium.ch/wiley_titlelist_publish
# * https://consortium.ch/tandf_titlelist_publish
# * https://consortium.ch/sage_titlelist_publish
# * https://consortium.ch/cup_titlelist_publish
#
# Licence term :
# * Elsevier : 2020-2023
# * Springer Nature : 2020-2022
# * Wiley : 2021-2024
# * Taylor & Francis : 2021-2023
# * Cambridge University Press (CUP) : 2021-2023
#
# CC licences :
# * Elsevier : CC-BY, CC-BY-NC-ND
# * Springer Nature : CC-BY, CC-BY-NC
# * Wiley : CC-BY, CC-BY-NC, CC-BY-NC-ND
# * Taylor & Francis : CC-BY
# * Cambridge University Press (CUP) : CC-BY, CC-BY-NC, CC-BY-NC-ND, CC-BY-NC-SA
#
# Special conditions :
# * Cambridge University Press (CUP) : Only the following article types are covered: Research Articles, Review Articles, Rapid Communication, Brief Reports and Case Reports
#
#
# ## Import du fichier des issns
# In[2]:
# load the ISSN reference table (tab-separated, first row is the header)
issn = pd.read_csv('sample/issn.tsv', encoding='utf-8', header=0, sep='\t')
issn
# In[3]:
# load the publisher reference table
publisher = pd.read_csv('sample/publisher.tsv', encoding='utf-8', header=0, sep='\t')
publisher
# In[4]:
# preview the publisher rows for the five read & publish licence publishers
publisher.loc[publisher['name'] == 'Elsevier']
# In[5]:
publisher.loc[(publisher['name'] == 'Springer Verlag') | (publisher['name'] == 'Nature Research')]
# In[6]:
publisher.loc[publisher['name'] == 'Wiley']
# In[7]:
publisher.loc[publisher['name'] == 'Taylor and Francis']
# In[8]:
publisher.loc[publisher['name'] == 'Cambridge University Press']
# In[9]:
# load the list of consortium institutions participating in the agreements
participants = pd.read_csv('agreements/consortium_institutions_participation_read_and_publish.csv', encoding='utf-8', header=0, sep='\t')
participants
# In[10]:
# remove Lib4RI, which is a library (not an institution)
participants = participants.loc[participants['Institution'] != 'Lib4RI']
participants
# In[11]:
# mark TF and CUP as applying to every institution
# (TODO: obtain the participating library lists for these two licences)
participants['TF'] = 'x'
participants['CUP'] = 'x'
participants
# In[12]:
# load the Elsevier journal list (first 7 rows are header/metadata)
elsevier = pd.read_excel('agreements/Elsevier_titlelist_publication.xlsx', skiprows=7)
elsevier
# In[13]:
# add the article version field
elsevier['article_version'] = 'published'
elsevier
# In[14]:
# licence validity dates (Elsevier agreement: 2020-2023)
elsevier['valid_from'] = '2020-01-01'
elsevier['valid_until'] = '2023-12-31'
elsevier
# In[15]:
# no embargo for read & publish; archiving allowed
elsevier['embargo_months'] = 0
elsevier['archiving'] = True
elsevier
# In[16]:
elsevier.iloc[elsevier.shape[0]-1]
# In[17]:
# add the licence field: one copy of the journal list per allowed CC licence
# (cc_by, cc_by_nc_nd). DataFrame.append was deprecated in pandas 1.4 and
# removed in 2.0 -> use pd.concat instead.
rp = pd.DataFrame()
elsevier['license'] = 'cc_by'
elsevier['Elsevier'] = 'x'
rp = pd.concat([rp, elsevier], ignore_index=True)
elsevier['license'] = 'cc_by_nc_nd'
rp = pd.concat([rp, elsevier], ignore_index=True)
rp
# In[18]:
# load the Springer Nature journal list (first 7 rows are header/metadata)
springer = pd.read_excel('agreements/Springer_titlelist_publication.xlsx', skiprows=7)
springer
# In[19]:
# allowed CC licences: cc_by, cc_by_nc
springer['article_version'] = 'published'
springer['license'] = 'cc_by'
springer['Springer Nature'] = 'x'
# licence validity dates (Springer Nature agreement: 2020-2022)
springer['valid_from'] = '2020-01-01'
springer['valid_until'] = '2022-12-31'
# no embargo; archiving allowed
springer['embargo_months'] = 0
springer['archiving'] = True
# In[20]:
# one copy of the journal list per licence
# (DataFrame.append removed in pandas 2.0 -> pd.concat)
rp = pd.concat([rp, springer], ignore_index=True)
springer['license'] = 'cc_by_nc'
rp = pd.concat([rp, springer], ignore_index=True)
rp
# In[21]:
# load the Wiley journal list (first 7 rows are header/metadata)
wiley = pd.read_excel('agreements/Wiley_titlelist_publish.xlsx', skiprows=7)
wiley
# In[22]:
# allowed CC licences: cc_by, cc_by_nc, cc_by_nc_nd
wiley['article_version'] = 'published'
wiley['license'] = 'cc_by'
wiley['Wiley'] = 'x'
# licence validity dates (Wiley agreement: 2021-2024)
wiley['valid_from'] = '2021-01-01'
wiley['valid_until'] = '2024-12-31'
# no embargo; archiving allowed
wiley['embargo_months'] = 0
wiley['archiving'] = True
# one copy of the journal list per licence
# (DataFrame.append removed in pandas 2.0 -> pd.concat)
rp = pd.concat([rp, wiley], ignore_index=True)
wiley['license'] = 'cc_by_nc'
rp = pd.concat([rp, wiley], ignore_index=True)
wiley['license'] = 'cc_by_nc_nd'
rp = pd.concat([rp, wiley], ignore_index=True)
rp
# In[23]:
# load the Taylor & Francis journal list (first 7 rows are header/metadata)
tf = pd.read_excel('agreements/TandF_titlelist_publish.xlsx', skiprows=7)
tf
# In[24]:
# only cc_by is allowed for Taylor & Francis
tf['article_version'] = 'published'
tf['license'] = 'cc_by'
tf['TF'] = 'x'
# licence validity dates (Taylor & Francis agreement: 2021-2023)
tf['valid_from'] = '2021-01-01'
tf['valid_until'] = '2023-12-31'
# no embargo; archiving allowed
tf['embargo_months'] = 0
tf['archiving'] = True
# In[25]:
# append to the combined table
# (DataFrame.append removed in pandas 2.0 -> pd.concat)
rp = pd.concat([rp, tf], ignore_index=True)
rp
# In[26]:
# load the Cambridge University Press journal list (first 7 rows are header/metadata)
cup = pd.read_excel('agreements/CUP_Journals_titlelist_publish.xlsx', skiprows=7)
cup
# In[27]:
# normalise the ISSN column name
cup = cup.rename(columns = {'e-ISSN' : 'ISSN'})
cup
# In[28]:
# allowed CC licences: cc_by, cc_by_nc, cc_by_nc_nd, cc_by_nc_sa
cup['article_version'] = 'published'
cup['license'] = 'cc_by'
cup['CUP'] = 'x'
# licence validity dates (CUP agreement: 2021-2023)
cup['valid_from'] = '2021-01-01'
cup['valid_until'] = '2023-12-31'
# NOTE(review): 60-month embargo here, unlike the 0 used for the other
# read & publish licences - confirm this value against the CUP agreement
cup['embargo_months'] = 60
cup['archiving'] = True
# In[29]:
# one copy of the journal list per licence
# (DataFrame.append removed in pandas 2.0 -> pd.concat)
rp = pd.concat([rp, cup], ignore_index=True)
cup['license'] = 'cc_by_nc'
rp = pd.concat([rp, cup], ignore_index=True)
cup['license'] = 'cc_by_nc_nd'
rp = pd.concat([rp, cup], ignore_index=True)
cup['license'] = 'cc_by_nc_sa'
rp = pd.concat([rp, cup], ignore_index=True)
rp
# In[30]:
# check rows with a missing embargo value
rp.loc[rp['embargo_months'].isna()]
# In[31]:
# load the ISSN to ISSN-L mapping table
issnl = pd.read_csv('issn/20171102.ISSN-to-ISSN-L.txt', encoding='utf-8', header=0, sep='\t')
issnl
# In[32]:
# rename columns to the merge keys used below
issnl = issnl.rename(columns={'ISSN' : 'issn', 'ISSN-L' : 'issnl'})
rp = rp.rename(columns={'ISSN' : 'issn'})
# In[33]:
# add the ISSN-L to every journal row
rp = pd.merge(rp, issnl, on='issn', how='left')
rp
# In[34]:
# (disabled) earlier approach: accumulate both issn and issnl variants before the merge
# rp_1 = rp.loc[rp['issnl'].notna()][['issnl', 'article_version', 'license', 'Elsevier', 'Springer Nature', 'Wiley', 'TF', 'CUP']]
# rp_1 = rp_1.rename(columns = {'issnl' : 'issn'})
# rp_2 = rp.loc[rp['issn'].notna()][['issn', 'article_version', 'license', 'Elsevier', 'Springer Nature', 'Wiley', 'TF', 'CUP']]
# rp_all = rp_1.append(rp_2, ignore_index=True)
rp_all = rp
# In[35]:
# (disabled) extra fields: discount value (cost_factor_type id 2) at 100%
# for the read & publish licences
# elsevier['amount'] = 100
# elsevier['symbol'] = '%'
# elsevier['cost_factor_type'] = 2
# elsevier['comment'] = 'Source: swissuniversities'
# elsevier
# In[36]:
# pair every Elsevier journal row with every participating institution's ROR:
# the 'Elsevier' marker column holds the constant 'x' on both sides, so the
# outer merge on it acts as a cross join
participants_elsevier = participants.dropna(subset=['Elsevier'])[['Elsevier', 'ROR']]
rp_elsevier = rp_all.dropna(subset=['Elsevier'])
rp_1 = rp_elsevier.merge(participants_elsevier, on='Elsevier', how='outer')
rp_1
# In[37]:
rp_elsevier
# In[38]:
participants_elsevier
# In[39]:
# pair every Springer Nature journal row with every participating
# institution's ROR (cross join via the constant marker column)
participants_springer = participants.dropna(subset=['Springer Nature'])[['Springer Nature', 'ROR']]
rp_springer = rp_all.dropna(subset=['Springer Nature'])
rp_2 = rp_springer.merge(participants_springer, on='Springer Nature', how='outer')
rp_2
# In[40]:
# pair every Wiley journal row with every participating institution's ROR
# (cross join via the constant marker column)
participants_wiley = participants.dropna(subset=['Wiley'])[['Wiley', 'ROR']]
rp_wiley = rp_all.dropna(subset=['Wiley'])
rp_3 = rp_wiley.merge(participants_wiley, on='Wiley', how='outer')
rp_3
# In[41]:
rp_wiley
# In[42]:
# pair every Taylor & Francis journal row with every participating
# institution's ROR (cross join via the constant marker column)
participants_tf = participants.dropna(subset=['TF'])[['TF', 'ROR']]
rp_tf = rp_all.dropna(subset=['TF'])
rp_4 = rp_tf.merge(participants_tf, on='TF', how='outer')
rp_4
# In[43]:
# pair every CUP journal row with every participating institution's ROR
# (cross join via the constant marker column)
participants_cup = participants.dropna(subset=['CUP'])[['CUP', 'ROR']]
rp_cup = rp_all.dropna(subset=['CUP'])
rp_5 = rp_cup.merge(participants_cup, on='CUP', how='outer')
rp_5
# In[44]:
# concatenate the five publisher tables into one
# (DataFrame.append removed in pandas 2.0 -> single pd.concat call)
rp_fin = pd.concat([rp_1, rp_2, rp_3, rp_4, rp_5], ignore_index=True)
rp_fin
# In[45]:
# drop rows without an ISSN and duplicate (issn, license, ROR) combinations
rp_fin = rp_fin.dropna(subset=['issn'])
rp_fin = rp_fin.drop_duplicates(subset=['issn', 'license', 'ROR'])
rp_fin
# In[46]:
# re-number the rows: the first reset_index discards the old labels; the
# second deliberately leaves an 'index' column (0..n-1) that a later cell
# deletes after the issn merge - do not simplify to reset_index(drop=True)
rp_fin = rp_fin.reset_index()
del rp_fin['index']
rp_fin = rp_fin.reset_index()
rp_fin['rp_id'] = rp_fin.index + 1
rp_fin
# In[47]:
rp_fin['embargo_months'].value_counts()
# In[48]:
# check rows with a missing embargo value
rp_fin.loc[rp_fin['embargo_months'].isna()]
# In[49]:
issn
# In[50]:
# merge pour avoir l'issnl
issn = pd.merge(issn, issnl, on='issn', how='left')
issn
# In[51]:
issn.loc[issn['issnl'].isna()]
# In[52]:
# merge dans l'autre sens pour garder que les lignes du fichier
rp_fin = pd.merge(rp_fin, issn[['id', 'journal', 'issnl']], on='issnl', how='left')
rp_fin
# In[53]:
# test des lignes sans embargo
rp_fin.loc[rp_fin['embargo_months'].isna() & rp_fin['id'].notna()]
# In[54]:
# keep only the rows that matched a journal in the issn table
rp_fin_merge = rp_fin.loc[rp_fin['id'].notna()]
rp_fin_merge
# In[55]:
# drop duplicate agreement rows
rp_fin_merge = rp_fin_merge.drop_duplicates(subset=['rp_id'])
rp_fin_merge
# In[56]:
# check rows without a journal title
rp_fin_merge.loc[rp_fin_merge['journal'].isna()]
# In[57]:
# drop the helper columns and rebuild rp_id as the sequential row
# position + 1. The original notebook did this in two passes: the first
# computed rp_id from the pre-filter row labels (not contiguous after the
# .loc filter), the second redid it from a fresh RangeIndex. A single
# reset_index(drop=True) yields the same final ids and column order.
del rp_fin_merge['id']
del rp_fin_merge['index']
del rp_fin_merge['rp_id']
rp_fin_merge = rp_fin_merge.reset_index(drop=True)
rp_fin_merge['rp_id'] = rp_fin_merge.index + 1
rp_fin_merge
# In[59]:
rp_fin_merge['embargo_months'].value_counts()
# In[60]:
# check rows with a missing embargo value
rp_fin_merge.loc[rp_fin_merge['embargo_months'].isna()]
# In[61]:
# export to Excel
rp_fin_merge.to_excel('sample/read_publish_brut_merge.xlsx', index=False)
# In[62]:
# export to CSV
rp_fin_merge.to_csv('sample/read_publish_brut_merge.tsv', sep='\t', index=False)

Event Timeline