Page MenuHomec4science

07_oacct_sherpa_publishers.py
No OneTemporary

File Metadata

Created
Sat, Nov 9, 11:43

07_oacct_sherpa_publishers.py

#!/usr/bin/env python
# coding: utf-8
# # Projet Open Access Compliance Check Tool (OACCT)
#
# Projet P5 de la bibliothèque de l'EPFL en collaboration avec les bibliothèques des Universités de Genève, Lausanne et Berne : https://www.swissuniversities.ch/themen/digitalisierung/p-5-wissenschaftliche-information/projekte/swiss-mooc-service-1-1-1-1
#
# Ce notebook permet d'extraire les données choisis parmis les sources obtenues par API et les traiter pour les rendre exploitables dans l'application OACCT.
#
# Auteur : **Pablo Iriarte**, Université de Genève (pablo.iriarte@unige.ch)
# Date de dernière mise à jour : 16.07.2021
# ## Table Journals Publishers : ajout des informations de Sherpa
# In[1]:
import pandas as pd
import csv
import json
import numpy as np
# In[2]:
publishers_issn = pd.read_csv('sample/publishers_brut.tsv', encoding='utf-8', header=0, sep='\t')
publishers_issn
# In[3]:
# import ids
publisher_ids = pd.read_csv('sample/journals_publishers_ids.tsv', encoding='utf-8', header=0, sep='\t')
publisher_ids
# In[4]:
# renommage id
publisher_ids = publisher_ids.rename(columns = {'id': 'journal'})
publisher_ids = publisher_ids.rename(columns = {'publisher': 'id'})
# In[5]:
# dédoublonage par publisher id
publisher_ids_dedup = publisher_ids.drop_duplicates(subset='id')
publisher_ids_dedup
# In[6]:
# merge avec journals
publisher = pd.merge(publishers_issn, publisher_ids_dedup, on='id', how='left')
publisher
# In[7]:
# ajout des valeurs de sherpa
publisher_sherpa = pd.read_csv('sample/publisher_sherpa.tsv', encoding='utf-8', header=0, sep='\t')
publisher_sherpa
# In[8]:
# renommage ids
publisher_sherpa = publisher_sherpa.rename(columns = {'publisher_id': 'publisher_id_sherpa', 'url': 'website_sherpa', 'country': 'iso_code'})
# In[9]:
# merge avec ids journals
publisher = pd.merge(publisher, publisher_sherpa, on='journal', how='left')
publisher
# In[10]:
# renommage names
publisher = publisher.rename(columns = {'name_x': 'name_issn', 'name_y': 'name_sherpa'})
# In[11]:
# ajout des informations à partir des revues
publisher_journals = pd.read_csv('sample/journals_publishers_brut.tsv', encoding='utf-8', header=0, sep='\t', usecols=['id', 'url'])
publisher_journals
# In[12]:
# renommage id
publisher_journals = publisher_journals.rename(columns = {'id': 'journal'})
# In[13]:
# merge avec ids journals
publisher = pd.merge(publisher, publisher_journals, on='journal', how='left')
publisher
# In[14]:
# renommage names
del publisher['publisher_id']
del publisher['publisher_id_sherpa']
del publisher['type']
publisher = publisher.rename(columns = {'url' : 'website_issn_journal'})
publisher
# In[15]:
# ajout des champs vides des vides et int
publisher['city'] = ''
publisher['state'] = ''
publisher['oa_policies'] = ''
publisher['starting_year'] = 0
publisher
# In[16]:
# iso_code en majuscules
publisher['iso_code'] = publisher['iso_code'].str.upper()
# ajout de la valeur pour unknown
publisher['iso_code'] = publisher['iso_code'].fillna('__')
publisher
# In[17]:
# merge avec countries
country = pd.read_csv('sample/country.tsv', usecols=('iso_code', 'id'), encoding='utf-8', header=0, sep='\t')
country
# In[18]:
country = country.rename(columns={'id': 'country'})
country
# In[19]:
publisher = pd.merge(publisher, country, on='iso_code', how='left')
publisher
# In[20]:
# garder sherpa puis issn.org
publisher.loc[publisher['name_sherpa'].notna(), 'name'] = publisher['name_sherpa']
publisher.loc[publisher['name_sherpa'].isna(), 'name'] = publisher['name_issn']
publisher.loc[publisher['website_sherpa'].notna(), 'website'] = publisher['website_sherpa']
publisher.loc[publisher['website_sherpa'].isna(), 'website'] = publisher['website_issn_journal']
publisher
# In[21]:
# garder les champs utiles pour l'éditeur
publisher_export = publisher[['id', 'name', 'country', 'city', 'state', 'starting_year', 'website', 'oa_policies']]
# In[22]:
# supprimer les doublons
publisher_export = publisher_export.drop_duplicates(subset='id')
publisher_export
# In[23]:
# remplacement des vides et id à int
publisher_export['website'] = publisher_export['website'].fillna('')
publisher_export
# In[24]:
# merge pour avoir les titres
publisher_ids_dedup = pd.merge(publisher_ids_dedup, publisher_export[['id', 'name']], on='id', how='left')
publisher_ids_dedup
# In[25]:
# garder les ids avant le dédoublonage pour la correction du publisher_ids_dedup
publisher_ids_dedup = publisher_ids_dedup.rename(columns = {'id': 'publisher_av_dedup'})
publisher_ids_dedup
# In[26]:
publisher_export_dedup = publisher_export.drop_duplicates(subset='name')
publisher_export_dedup
# In[27]:
del publisher_export_dedup['id']
# convertir l'index en id
publisher_export_dedup = publisher_export_dedup.reset_index()
# ajout de l'id avec l'index + 1
publisher_export_dedup['id'] = publisher_export_dedup['index'] + 1
del publisher_export_dedup['index']
publisher_export_dedup
# In[28]:
del publisher_export_dedup['id']
# convertir l'index en id
publisher_export_dedup = publisher_export_dedup.reset_index()
# ajout de l'id avec l'index + 1
publisher_export_dedup['id'] = publisher_export_dedup['index'] + 1
del publisher_export_dedup['index']
publisher_export_dedup
# In[29]:
# merge avec les ids d'avant Sherpa
publisher_ids_dedup = pd.merge(publisher_ids_dedup, publisher_export_dedup[['id', 'name']], on='name', how='left')
publisher_ids_dedup = publisher_ids_dedup.rename(columns = {'id': 'publisher'})
publisher_ids_dedup = publisher_ids_dedup.rename(columns = {'journal': 'id'})
publisher_ids_dedup
# In[30]:
# concat valeurs avec même id
del publisher_ids_dedup['publisher_av_dedup']
del publisher_ids_dedup['name']
publisher_ids_dedup['publisher'] = publisher_ids_dedup['publisher'].astype(str)
publisher_ids_dedup_grouped = publisher_ids_dedup.groupby('id').agg({'publisher': lambda x: ', '.join(x)})
publisher_ids_dedup_grouped
# In[31]:
# modifs dans les journaux
journal = pd.read_csv('sample/journal_fin_sherpa.tsv', encoding='utf-8', header=0, sep='\t')
journal
# In[32]:
# merge avec les journaux journal_fin_sherpa
journal = pd.merge(journal, publisher_ids_dedup_grouped, on='id', how='left')
journal
# In[33]:
del journal['publisher_x']
journal = journal.rename(columns = {'publisher_y': 'publisher'})
journal
# In[34]:
# esport JSON publisher
result = journal.to_json(orient='records', force_ascii=False)
parsed = json.loads(result)
with open('sample/journal.json', 'w', encoding='utf-8') as file:
json.dump(parsed, file, indent=2, ensure_ascii=False)
# In[35]:
# export csv
journal.to_csv('sample/journal.tsv', sep='\t', encoding='utf-8', index=False)
# In[36]:
# export excel
journal.to_excel('sample/journal.xlsx', index=False)
# In[37]:
# esport JSON publisher
result = publisher_export_dedup.to_json(orient='records', force_ascii=False)
parsed = json.loads(result)
with open('sample/publisher.json', 'w', encoding='utf-8') as file:
json.dump(parsed, file, indent=2, ensure_ascii=False)
# In[38]:
# export csv
publisher_export_dedup.to_csv('sample/publisher.tsv', sep='\t', encoding='utf-8', index=False)
# In[39]:
# export excel
publisher_export_dedup.to_excel('sample/publisher.xlsx', index=False)
# In[ ]:

Event Timeline