Page MenuHomec4science

05_oacct_issns.py
No OneTemporary

File Metadata

Created
Tue, Nov 19, 18:29

05_oacct_issns.py

#!/usr/bin/env python
# coding: utf-8
# # Projet Open Access Compliance Check Tool (OACCT)
#
# Projet P5 de la bibliothèque de l'EPFL en collaboration avec les bibliothèques des Universités de Genève, Lausanne et Berne : https://www.swissuniversities.ch/themen/digitalisierung/p-5-wissenschaftliche-information/projekte/swiss-mooc-service-1-1-1-1
#
# Ce notebook permet d'extraire les données choisies parmi les sources obtenues par API et de les traiter pour les rendre exploitables dans l'application OACCT.
#
# Auteur : **Pablo Iriarte**, Université de Genève (pablo.iriarte@unige.ch)
# Date de dernière mise à jour : 16.07.2021
# ## Table ISSNs
# In[1]:
import pandas as pd
import csv
import json
import numpy as np
import os
# In[2]:
# Load the ISSN -> ISSN-L correspondence table (tab-separated, header on
# the first row).
issns = pd.read_csv(
    'issn/20171102.ISSN-to-ISSN-L.txt',
    sep='\t',
    header=0,
    encoding='utf-8',
)
issns
# In[3]:
# Rename the columns to the lowercase names used by the rest of the script.
issns = issns.rename(columns={'ISSN': 'issn', 'ISSN-L': 'issnl'})
issns
# In[4]:
# Load only the identifier columns of the raw journals table.
journals = pd.read_csv(
    'sample/journals_brut.tsv',
    sep='\t',
    encoding='utf-8',
    usecols=['id', 'issn', 'issnl'],
)
journals
# In[5]:
# The journal's own id is exposed as the 'journal' column from here on.
journals = journals.rename(columns={'id': 'journal'})
journals
# In[6]:
# Check: journals without an ISSN.
journals[journals['issn'].isna()]
# In[7]:
# Check: the journal whose id is 5.
journals[journals['journal'] == 5]
# ## Format extraction
# In[8]:
# Columns of the result table: one row per journal ISSN for which a local
# ISSN.org record file exists.
col_names = ['issn', 'format']
# In[9]:
# Extract the format of every journal ISSN from the ISSN.org JSON records
# stored as issn/data/<issn>.json.
# Rows are collected in plain lists and the DataFrame is built once at the
# end, instead of enlarging it cell by cell inside the loop.
found_index = []
found_rows = []
for index, row in journals.iterrows():
    myissn = row['issn']
    # Progress indicator: print every 10th row label.
    if index % 10 == 0:
        print(index)
    # A missing ISSN is read as NaN (a float) and cannot be used to build a
    # file name; report it and move on instead of failing with a TypeError.
    if pd.isna(myissn):
        print(str(index) + ' - ISSN manquant')
        continue
    json_path = 'issn/data/' + myissn + '.json'
    if not os.path.exists(json_path):
        print(myissn + ' - pas trouvé')
        continue
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # The format stays NaN when the record holds no usable 'format' entry.
    myformat = np.nan
    resource_id = 'resource/ISSN/' + myissn
    for x in data.get('@graph', []):
        # Only the graph node describing this very ISSN is of interest.
        if not ('@id' in x and x['@id'] == resource_id and 'format' in x):
            continue
        myformats = x['format']
        if isinstance(myformats, list):
            # Several formats listed: keep the first one; an empty list
            # leaves the format undefined.
            if not myformats:
                continue
            myformats = myformats[0]
        myformat = myformats.replace('vocabularies/medium#', '')
    found_index.append(index)
    found_rows.append((myissn, myformat))
# dtype=object keeps 'format' usable with the .str accessor even when every
# extracted value is NaN.
journals_format = pd.DataFrame(
    found_rows, index=found_index, columns=col_names, dtype=object
)
# In[10]:
journals_format
# In[11]:
# Check: ISSNs whose record gave no format.
journals_format.loc[journals_format['format'].isnull()]
# In[12]:
journals_format['format'].value_counts()
# In[13]:
# Drop the per-journal ISSN: journals are attached to the ISSN table
# through their ISSN-L in the merge below.
journals = journals.drop(columns=['issn'])
# In[14]:
issns = pd.merge(issns, journals, on='issnl', how='outer')
issns
# In[15]:
# Check: rows without an ISSN.
issns.loc[issns['issn'].isna()]
# In[16]:
# Keep only the rows that have an ISSN.
issns = issns.loc[issns['issn'].notna()]
# In[17]:
# Keep only the ISSNs that were matched to a journal.
issns2 = issns.loc[issns['journal'].notna()]
issns2
# In[18]:
# Add the format of each ISSN.
issns2 = pd.merge(issns2, journals_format, on='issn', how='outer')
issns2
# In[19]:
# The outer merge may bring in rows without a journal; drop them.
# The explicit copy makes issns2 an independent frame, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
issns2 = issns2.loc[issns2['journal'].notna()].copy()
issns2
# In[20]:
# Normalize the format labels: upper-case them, then fold ONLINE and
# DIGITALCARRIER into ELECTRONIC. regex=False states that these are literal
# substitutions, independent of the pandas version's default.
issns2['format'] = (
    issns2['format']
    .str.upper()
    .str.replace('ONLINE', 'ELECTRONIC', regex=False)
    .str.replace('DIGITALCARRIER', 'ELECTRONIC', regex=False)
)
issns2
# In[21]:
issns2['format'].value_counts()
# In[22]:
# Check: rows without a format.
issns2.loc[issns2['format'].isnull()]
# In[23]:
# Assign the numeric id of the ISSN type:
# PRINT = 1
# ELECTRONIC = 2
# OTHER = 3
issn_type_ids = {'PRINT': 1, 'ELECTRONIC': 2, 'OTHER': 3}
# Report format labels outside the three known ones. They are classified as
# OTHER below instead of making the integer conversion fail.
known_format = issns2['format'].isin(list(issn_type_ids))
unknown_formats = issns2.loc[
    issns2['format'].notna() & ~known_format, 'format'
].unique()
if len(unknown_formats) > 0:
    print('formats non reconnus, classés OTHER : '
          + ', '.join(str(value) for value in unknown_formats))
# Exact-label lookup; a missing or unrecognized format gets 3 (OTHER).
issns2['issn_type'] = issns2['format'].map(issn_type_ids).fillna(3).astype(int)
issns2
# In[24]:
# Convert the journal id to int.
issns2['journal'] = issns2['journal'].astype(int)
# In[25]:
# Turn the index into the id: the id is the former index label + 1.
issns2 = issns2.reset_index()
issns2['id'] = issns2['index'] + 1
del issns2['index']
issns2
# In[26]:
# issn_type is already an integer column (converted in cell 23).
# In[27]:
# Remove duplicated ISSNs, keeping the first occurrence. This runs after
# the ids were assigned, so the id sequence may contain gaps.
issns2 = issns2.drop_duplicates(subset='issn')
issns2
# In[28]:
# Full table, tab-separated.
issns2.to_csv('sample/issn_brut.tsv', index=False, sep='\t', encoding='utf-8')
# In[29]:
# Full table, Excel.
issns2.to_excel('sample/issn_brut.xlsx', index=False)
# In[30]:
# Identifier columns only, tab-separated.
issn_ids = issns2[['id', 'issn', 'issnl', 'journal']]
issn_ids.to_csv('sample/issn_ids.tsv', index=False, sep='\t', encoding='utf-8')
# In[31]:
# Identifier columns only, Excel.
issn_ids.to_excel('sample/issn_ids.xlsx', index=False)

Event Timeline