Page MenuHomec4science

04_oacct_publishers.py
No OneTemporary

File Metadata

Created
Sun, May 19, 16:28

04_oacct_publishers.py

#!/usr/bin/env python
# coding: utf-8
# # Projet Open Access Compliance Check Tool (OACCT)
#
# Projet P5 de la bibliothèque de l'EPFL en collaboration avec les bibliothèques des Universités de Genève, Lausanne et Berne : https://www.swissuniversities.ch/themen/digitalisierung/p-5-wissenschaftliche-information/projekte/swiss-mooc-service-1-1-1-1
#
# Ce notebook permet d'extraire les données choisies parmi les sources obtenues par API et de les traiter pour les rendre exploitables dans l'application OACCT.
#
# Auteur : **Pablo Iriarte**, Université de Genève (pablo.iriarte@unige.ch)
# Date de dernière mise à jour : 16.07.2021
# ## Extraction des données des éditeurs
#
# Sources :
# 1. Données de ISSN.org (JSON)
#
# ### Format des données source
#
# * Noeud : "@graph"
# * spatial & publisher :
# * "@id": "resource/ISSN/0140-6736",
# * "spatial": [
# "http://id.loc.gov/vocabulary/countries/ne",
# "https://www.iso.org/obp/ui/#iso:code:3166:NL"
# ],
#
# Exemple avec plusieurs éditeurs dans le temps :
#
# "publisher": [
# "resource/ISSN/0140-6736#Publisher-Elsevier",
# "resource/ISSN/0140-6736#Publisher-J._Onwhyn"
# ],
#
# {
# "@id": "resource/ISSN/0140-6736#LatestPublicationEvent",
# "@type": "http://schema.org/PublicationEvent",
# "publishedBy": "resource/ISSN/0140-6736#Publisher-Elsevier",
# "location": "resource/ISSN/0140-6736#PublicationPlace-Amsterdam"
# },
#
# {
# "@id": "resource/ISSN/0140-6736#Publisher-Elsevier",
# "@type": "http://schema.org/Organization",
# "name": "Elsevier"
# },
#
# Exemple avec un seul éditeur dans le temps :
#
# "publisher": "resource/ISSN/0899-8418#Publisher-Wiley",
#
# {
# "@id": "resource/ISSN/0899-8418#EarliestPublicationEvent",
# "@type": "http://schema.org/PublicationEvent",
# "publishedBy": "resource/ISSN/0899-8418#Publisher-Wiley",
# "temporal": "c1989-",
# "location": [
# "resource/ISSN/0899-8418#PublicationPlace-New_York",
# "resource/ISSN/0899-8418#PublicationPlace-Chichester"
# ]
# },
#
# {
# "@id": "resource/ISSN/0899-8418#Publisher-Wiley",
# "@type": "http://schema.org/Organization",
# "name": "Wiley"
# },
#
# Exemple avec une liste d'éditeurs finaux :
#
# {
# "@id": "resource/ISSN/2174-8454#LatestPublicationEvent",
# "@type": "http://schema.org/PublicationEvent",
# "publishedBy": [
# "resource/ISSN/2174-8454#Publisher-The_Global_Studies_Institute_de_l’Université_de_Genève",
# "resource/ISSN/2174-8454#Publisher-Universitat_de_València,_Departamento_de_Teoría_de_los_Lenguajes_y_Ciencias_de_la_Comunicación"
# ],
# "location": "resource/ISSN/2174-8454#PublicationPlace-Valencia"
# },
# In[1]:
import pandas as pd
import csv
import json
import numpy as np
import os
# ## Table Publishers
# In[2]:
# Empty publishers table; rows are added from the ISSN.org records below.
# 'country' is not a column here because it is attached to the journals,
# and 'oa_status' is left out for now.
col_names = ['id', 'name', 'publisher_id_issn']
publisher_issn = pd.DataFrame(columns=col_names)
publisher_issn
# ## Table Journals
# In[3]:
# Journals table: tab-separated, UTF-8, column names taken from the first line.
journal = pd.read_csv(
    'sample/journals_brut.tsv',
    sep='\t',
    header=0,
    encoding='utf-8',
)
journal
# ## Table Journals Publishers
# In[4]:
# Empty journal/publisher link table: one row per (journal, publisher id) pair.
col_names = ['journal', 'publisher_id_issn']
journal_publisher = pd.DataFrame(columns=col_names)
journal_publisher
# In[5]:
# Extract, for every journal, the publisher(s) named by its ISSN.org record.
#
# Each record is read from 'issn/data/<issn>.json'; its '@graph' list holds
# the publication events and the publisher nodes. The publisher of the
# '#LatestPublicationEvent' node is preferred; the publisher of the
# '#EarliestPublicationEvent' node is used when the latest event gives none.


def _final_publisher_ids(graph, issn):
    """Return the publisher ids of the journal's final publication event.

    ``publishedBy`` holds either a single id or a list of ids; the result is
    always a list, with empty ids removed. The latest publication event takes
    precedence over the earliest one.
    """
    prefix = 'resource/ISSN/' + issn
    latest = ''
    earliest = ''
    for node in graph:
        if '@id' not in node or 'publishedBy' not in node:
            continue
        if node['@id'] == prefix + '#LatestPublicationEvent':
            latest = node['publishedBy']
        elif node['@id'] == prefix + '#EarliestPublicationEvent':
            earliest = node['publishedBy']
    chosen = latest if latest != '' else earliest
    candidates = chosen if isinstance(chosen, list) else [chosen]
    return [pid for pid in candidates if pid != '']


def _publisher_name(graph, publisher_id):
    """Return the 'name' of the graph node whose '@id' is *publisher_id*.

    Returns '' when no such node carries a name, so that a publisher without
    a name node never inherits the name of a previously processed publisher.
    If several nodes match, the last one wins.
    """
    name = ''
    for node in graph:
        if '@id' in node and node['@id'] == publisher_id and 'name' in node:
            name = node['name']
    return name


# Rows are collected in plain lists and turned into DataFrames once, after the
# loop: DataFrame.append no longer exists in pandas 2.x and copied the whole
# table on every call.
publisher_rows = []
journal_publisher_rows = []
for index, row in journal.iterrows():
    journal_id = row['id']
    journal_issn = row['issn']
    # progress indicator: one line every ten journals
    if index % 10 == 0:
        print(index)
    record_path = 'issn/data/' + journal_issn + '.json'
    if not os.path.exists(record_path):
        print(row['issn'] + ' - pas trouvé')
        continue
    with open(record_path, 'r', encoding='utf-8') as f:
        graph = json.load(f)['@graph']
    for pid in _final_publisher_ids(graph, journal_issn):
        publisher_rows.append({'publisher_id_issn': pid, 'name': _publisher_name(graph, pid)})
        journal_publisher_rows.append({'journal': journal_id, 'publisher_id_issn': pid})

# Add the collected rows to the existing tables; their columns are kept.
publisher_issn = pd.concat(
    [publisher_issn, pd.DataFrame(publisher_rows, columns=['publisher_id_issn', 'name'])],
    ignore_index=True,
)
journal_publisher = pd.concat(
    [journal_publisher, pd.DataFrame(journal_publisher_rows, columns=['journal', 'publisher_id_issn'])],
    ignore_index=True,
)
# In[6]:
publisher_issn
# In[7]:
# Shorten the ids: split each one once at '#Publisher-' into the part before
# the marker and the part after it.
id_parts = publisher_issn['publisher_id_issn'].str.split('#Publisher-', n=1, expand=True)
publisher_issn[['publisher_id_racine', 'publisher_id_fin']] = id_parts
publisher_issn
# In[8]:
# Keep only the part after the marker, under the name 'publisher_id_issn';
# the full id, the leading part and the 'id' column are discarded.
publisher_issn = publisher_issn.drop(
    columns=['publisher_id_issn', 'publisher_id_racine', 'id']
).rename(columns={'publisher_id_fin': 'publisher_id_issn'})
publisher_issn
# In[9]:
# Keep one row per publisher id (the first occurrence).
# Brackets ('[' / ']') in ids and names are left as they are; only the
# duplicates are removed.
publisher_issn = publisher_issn.drop_duplicates(subset='publisher_id_issn', keep='first')
publisher_issn
# In[10]:
# Check: publishers whose name is empty
publisher_issn[publisher_issn['name'].eq('')]
# In[11]:
journal_publisher
# In[12]:
# Shorten the ids: split each one once at '#Publisher-' and keep only the part
# after the marker, under the name 'publisher_id_issn'.
id_parts = journal_publisher['publisher_id_issn'].str.split('#Publisher-', n=1, expand=True)
journal_publisher[['publisher_id_racine', 'publisher_id_fin']] = id_parts
journal_publisher = journal_publisher.drop(
    columns=['publisher_id_issn', 'publisher_id_racine']
).rename(columns={'publisher_id_fin': 'publisher_id_issn'})
journal_publisher
# In[13]:
# Attach the publisher name to every journal/publisher pair (left join on the
# shortened publisher id).
journal_publisher = journal_publisher.merge(
    publisher_issn,
    how='left',
    on='publisher_id_issn',
)
journal_publisher
# In[14]:
journal_publisher = journal_publisher.rename(columns={'publisher_id_issn': 'publisher_id'})
journal_publisher
# In[15]:
# Publishers table: id and name taken from the pairs
publisher = journal_publisher.loc[:, ['publisher_id', 'name']]
publisher
# In[16]:
# one row per publisher id
publisher = publisher.drop_duplicates(subset=['publisher_id'])
publisher
# In[17]:
# Number the publishers 1..n in their current row order.
# The original notebook ran the same "reset index, id = index + 1" cell twice,
# so the ids from the first run were overwritten by the second; a single reset
# that discards the old index gives the same final table.
publisher = publisher.reset_index(drop=True)
publisher['id'] = publisher.index + 1
publisher
# In[19]:
# Add the placeholder publisher UNKNOWN (id 999999).
# ('country': 999999 is left out, as the publishers table has no country column.)
# pd.concat is used because DataFrame.append no longer exists in pandas 2.x.
unknown_publisher = pd.DataFrame([{'id': 999999, 'name': 'UNKNOWN', 'publisher_id': '999999'}])
publisher = pd.concat([publisher, unknown_publisher], ignore_index=True)
publisher
# In[20]:
# Look up the numeric id of each pair's publisher (left join on 'publisher_id').
publisher_keys = publisher[['publisher_id', 'id']]
journal_publisher = journal_publisher.merge(publisher_keys, how='left', on='publisher_id')
journal_publisher
# In[21]:
# the numeric id becomes the 'publisher' column
journal_publisher = journal_publisher.rename(columns={'id': 'publisher'})
journal_publisher
# In[22]:
# Journal id / publisher id pairs, with the publisher id converted to text so
# that several ids can be joined into one string below.
journal_publisher_ids = (
    journal_publisher[['journal', 'publisher']]
    .rename(columns={'journal': 'id'})
    .astype({'publisher': str})
)
journal_publisher_ids
# In[23]:
# One row per journal id: its publisher ids joined with ', '
journal_publisher_grouped = journal_publisher_ids.groupby('id').agg({'publisher': ', '.join})
journal_publisher_grouped
# In[24]:
# Add the joined publisher ids to the journals table (left join on 'id')
journals = journal.merge(journal_publisher_grouped, how='left', on='id')
journals
# In[25]:
# Write the three result tables, each first as a tab-separated UTF-8 file and
# then as an Excel workbook, without the index.
exports = (
    (publisher, 'sample/publishers_brut.tsv', 'sample/publishers_brut.xlsx'),
    (journals, 'sample/journals_publishers_brut.tsv', 'sample/journals_publishers_brut.xlsx'),
    (journal_publisher_ids, 'sample/journals_publishers_ids.tsv', 'sample/journals_publishers_ids.xlsx'),
)
for table, tsv_path, xlsx_path in exports:
    table.to_csv(tsv_path, sep='\t', encoding='utf-8', index=False)
    table.to_excel(xlsx_path, index=False)

Event Timeline