08_oacct_sherpa_issns.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Thu, May 30, 14:33

08_oacct_sherpa_issns.py
View Options

	#!/usr/bin/env python
	# coding: utf-8

	# # Projet Open Access Compliance Check Tool (OACCT)
	#
	# Projet P5 de la bibliothèque de l'EPFL en collaboration avec les bibliothèques des Universités de Genève, Lausanne et Berne : https://www.swissuniversities.ch/themen/digitalisierung/p-5-wissenschaftliche-information/projekte/swiss-mooc-service-1-1-1-1
	#
	# Ce notebook permet d'extraire les données choisies parmi les sources obtenues par API et de les traiter pour les rendre exploitables dans l'application OACCT.
	#
	# Auteur : Pablo Iriarte, Université de Genève (pablo.iriarte@unige.ch)
	# Date de dernière mise à jour : 16.07.2021

	# ## Table ISSNs

	# In[1]:


	import pandas as pd
	import csv
	import json
	import numpy as np


	# In[2]:


	# Raw ISSN table: tab-separated, UTF-8 encoded.
	issns = pd.read_csv('sample/issn_brut.tsv', sep='\t', encoding='utf-8')
	issns


	# ## Adding the format from Sherpa

	# In[3]:


	# ISSN table from Sherpa; its 'type' column is a second source for the format.
	issn_sherpa = pd.read_csv('sample/issn_sherpa.tsv', sep='\t', encoding='utf-8')
	issn_sherpa


	# In[4]:


	# Upper-case the Sherpa labels; the later steps match on the upper-case
	# strings 'PRINT', 'ELECTRONIC' and 'OTHER'.
	sherpa_type_upper = issn_sherpa['type'].str.upper()
	issn_sherpa['type'] = sherpa_type_upper
	issn_sherpa


	# In[5]:


	# Outer join on 'issn', bringing in only the Sherpa 'type' column.
	# NOTE(review): with how='outer', an ISSN present only in the Sherpa table
	# yields a row whose 'journal' is NaN, which the later int cast of 'journal'
	# would reject - confirm that every Sherpa ISSN also exists in issn_brut.tsv.
	sherpa_types = issn_sherpa[['issn', 'type']]
	issns = issns.merge(sherpa_types, how='outer', on='issn')
	issns


	# In[6]:


	# Distribution of the 'format' labels.
	issns.loc[:, 'format'].value_counts()


	# In[7]:


	# Distribution of the Sherpa 'type' labels.
	issns.loc[:, 'type'].value_counts()


	# In[8]:


	# Rows that have neither a 'format' nor a 'type'.
	issns[issns['format'].isna() & issns['type'].isna()]


	# In[9]:


	# Rows where both sources agree.
	issns[issns['format'].eq(issns['type'])]


	# In[10]:


	# Rows where the two sources differ. NaN never compares equal, so rows
	# with a missing value on either side are listed here as well.
	issns[issns['format'].ne(issns['type'])]


	# In[11]:


	# Assign the numeric ISSN type id, preferring the ISSN.org label ('format')
	# and falling back to the Sherpa label ('type') when 'format' is missing.
	# PRINT = 1
	# ELECTRONIC = 2
	# OTHER = 3
	issn_type_ids = {'PRINT': 1, 'ELECTRONIC': 2, 'OTHER': 3}
	issn_type_labels = issns['format'].fillna(issns['type'])

	# Fail here, naming the offending labels, instead of letting an unexpected
	# value survive as a string and break the later integer cast.
	unknown_labels = set(issn_type_labels.dropna()) - set(issn_type_ids)
	if unknown_labels:
	    raise ValueError(
	        'Unknown ISSN type label(s): {}'.format(sorted(map(str, unknown_labels)))
	    )

	# Exact-match lookup (no substring replacement); rows without a label from
	# either source are classified as OTHER (3). The column is integer-typed
	# from this point on.
	issns['issn_type'] = issn_type_labels.map(issn_type_ids).fillna(3).astype(int)
	issns


	# In[12]:


	# Conflict check: 'format' says PRINT while Sherpa says ELECTRONIC.
	issns[(issns['format'] == 'PRINT') & (issns['type'] == 'ELECTRONIC')]


	# In[13]:


	# Conflict check: 'format' says ELECTRONIC while Sherpa says PRINT.
	issns[(issns['format'] == 'ELECTRONIC') & (issns['type'] == 'PRINT')]


	# In[14]:


	# Rows without a 'format' that Sherpa labels as PRINT.
	issns[issns['format'].isna() & (issns['type'] == 'PRINT')]


	# In[15]:


	# Cast the 'journal' column to int.
	issns = issns.astype({'journal': int})


	# In[16]:


	# Derive a 1-based 'id' column from the current index (appended as the
	# last column), then renumber the index from 0.
	issns['id'] = issns.index + 1
	issns = issns.reset_index(drop=True)
	issns


	# In[17]:


	# Cast the type id to int.
	issns = issns.astype({'issn_type': int})


	# In[18]:


	# Keep only the columns that are exported.
	export_columns = ['id', 'issn', 'journal', 'issn_type']
	issns_export = issns.loc[:, export_columns]
	issns_export


	# In[19]:


	# Remove duplicate ISSNs, keeping the first occurrence of each.
	# The ids were assigned before this step, so they may contain gaps.
	issns_export = issns_export.drop_duplicates(subset=['issn'], keep='first')
	issns_export


	# In[20]:


	# JSON export: serialise the records with pandas, then re-dump them through
	# the json module to get an indented, non-ASCII-escaped file.
	result = issns_export.to_json(orient='records', force_ascii=False)
	parsed = json.loads(result)
	with open('sample/issn.json', 'w', encoding='utf-8') as file:
	    # The dump must be inside the with-block so it runs while the file is open.
	    json.dump(parsed, file, indent=2, ensure_ascii=False)


	# In[21]:


	# Tab-separated export, UTF-8, without the index column.
	issns_export.to_csv(
	    'sample/issn.tsv',
	    index=False,
	    encoding='utf-8',
	    sep='\t',
	)


	# In[22]:


	# Excel export, without the index column.
	issns_export.to_excel(
	    'sample/issn.xlsx',
	    index=False,
	)

08_oacct_sherpa_issns.pyNo OneTemporaryActions

File Metadata

08_oacct_sherpa_issns.pyView Options

Event Timeline

08_oacct_sherpa_issns.py
No OneTemporary
Actions

08_oacct_sherpa_issns.py
View Options