#!/usr/bin/env python
# coding: utf-8
# # Open Access Compliance Check Tool (OACCT) project
#
# Project P5 of the EPFL Library, in collaboration with the libraries of the Universities of Geneva, Lausanne and Bern: https://www.swissuniversities.ch/themen/digitalisierung/p-5-wissenschaftliche-information/projekte/swiss-mooc-service-1-1-1-1
#
# This notebook extracts the selected data from the sources harvested via API and processes it so it can be used in the OACCT application.
#
# Author: **Pablo Iriarte**, Université de Genève (pablo.iriarte@unige.ch)
# Last updated: 16.07.2021
# ## Journal data extraction
#
#
# ## Initial corpus
#
# ISSNs of the journals of the publications archived in UNIGE's Archive ouverte (AoU) and in EPFL's Infoscience
#
# * AoU ISSN file exported on 16.10.2020
# * Infoscience ISSN file exported on 28.01.2021
# * Data extracted from the ISSN.org JSON records
#
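# As context for the extraction loop below, here is a minimal sketch of the
# ISSN.org JSON-LD layout this notebook relies on. The record is hypothetical;
# only the keys ('@graph', '@id', 'mainTitle', 'startDate', 'url', 'spatial',
# 'inLanguage', 'value') mirror those read by the extraction code.
# In[ ]:
# hypothetical ISSN.org record, reduced to the fields read below
record = {'@graph': [
    {'@id': 'resource/ISSN/1234-5678',
     'mainTitle': 'Journal of Examples.',
     'startDate': '1990',
     'url': 'https://example.org/journal-of-examples',
     'spatial': 'https://www.iso.org/obp/ui/#iso:code:3166:CH',
     'inLanguage': 'http://id.loc.gov/vocabulary/iso639-2/eng'},
    {'@id': 'resource/ISSN/1234-5678#ISSN-L', 'value': '1234-5678'},
]}
# scan the graph the same way the extraction loop does
for node in record['@graph']:
    if node.get('@id') == 'resource/ISSN/1234-5678':
        print(node.get('mainTitle'))  # -> Journal of Examples.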
# In[1]:
import pandas as pd
import csv
import json
import numpy as np
import os
# number of journals to keep in the sample (0 to keep everything)
journals_sample_n = 1000
# ## Table OA categories
#
# * 1 : UNKNOWN
# * 2 : Green
# * 3 : Hybrid
# * 4 : Full
# * 5 : Gold
# * 6 : Diamond
# In[2]:
# create the DataFrame
col_names = ['id',
             'status',
             'description',
             'subscription',
             'accepted_manuscript',
             'apc',
             'final_version'
             ]
oas = pd.DataFrame(columns = col_names)
oas
# In[3]:
# add the rows
oas = oas.append({'id' : 1, 'status' : 'UNKNOWN', 'description' : '', 'subscription' : 0, 'accepted_manuscript' : 0, 'apc' : 0, 'final_version' : 0}, ignore_index=True)
oas = oas.append({'id' : 2, 'status' : 'Green', 'description' : 'Paywalled access journal, usually allows archiving of the submitted or accepted version in institutional repositories (embargo periods may apply)', 'subscription' : 1, 'accepted_manuscript' : 1, 'apc' : 0, 'final_version' : 0}, ignore_index=True)
oas = oas.append({'id' : 3, 'status' : 'Hybrid', 'description' : 'Paywalled access journal, offers Open Access for individual articles upon payment of APCs. It often allows archiving of the published version in institutional repositories (embargo periods may apply)', 'subscription' : 1, 'accepted_manuscript' : 1, 'apc' : 1, 'final_version' : 1}, ignore_index=True)
# oas = oas.append({'id' : 4, 'status' : 'Full', 'description' : 'No subscription, Green or Gold', 'subscription' : 0, 'accepted_manuscript' : 1, 'apc' : 0, 'final_version' : 1}, ignore_index=True)
oas = oas.append({'id' : 5, 'status' : 'Gold', 'description' : 'Open Access journal (payment of APCs may apply). It often allows archiving of the published version in institutional repositories (embargo periods may apply)', 'subscription' : 0, 'accepted_manuscript' : 1, 'apc' : 1, 'final_version' : 1}, ignore_index=True)
oas = oas.append({'id' : 6, 'status' : 'Diamond', 'description' : 'Open Access journal (without payment of APCs). It often allows archiving of the published version in institutional repositories (embargo periods may apply)', 'subscription' : 0, 'accepted_manuscript' : 1, 'apc' : 0, 'final_version' : 1}, ignore_index=True)
# In[4]:
oas
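# The integer columns above act as boolean flags. As a quick illustration
# (not part of the original pipeline), the categories that allow archiving of
# the published version can be listed like this:
# In[ ]:
# illustrative query: OA categories whose 'final_version' flag is set
oas.loc[oas['final_version'] == 1, ['id', 'status']]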
# In[5]:
# export JSON
result = oas.to_json(orient='records', force_ascii=False)
parsed = json.loads(result)
with open('sample/oa.json', 'w', encoding='utf-8') as file:
    json.dump(parsed, file, indent=2, ensure_ascii=False)
# In[6]:
# export csv
oas.to_csv('sample/oa.tsv', sep='\t', encoding='utf-8', index=False)
# In[7]:
# export excel
oas.to_excel('sample/oa.xlsx', index=False)
# ## Table Journals
# In[8]:
issns = pd.read_csv('issn/issns_count.tsv', encoding='utf-8', header=0, sep='\t')
issns
# In[9]:
# add the 'id' column
issns.insert(0, 'id', '', False)
issns
# In[10]:
# convert the index into the id
issns = issns.reset_index()
issns
# In[11]:
# set the id from the index + 1
issns['id'] = issns['index'] + 1
del issns['index']
issns
# In[12]:
# reduce to journals_sample_n journals for the test sample
# (note: .loc slicing is label-inclusive)
if journals_sample_n > 0:
    issns = issns.loc[:journals_sample_n]
issns
# In[13]:
# add the ISSN-L (linking ISSN)
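# the issn.org mapping file is tab-separated with two columns, 'ISSN' and
# 'ISSN-L', linking each ISSN to its linking ISSN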
df_issnl = pd.read_csv('issn/20171102.ISSN-to-ISSN-L.txt', encoding='utf-8', header=0, sep='\t')
df_issnl
# In[14]:
# rename the columns
df_issnl = df_issnl.rename(columns={'ISSN' : 'issn', 'ISSN-L' : 'issnl'})
# In[15]:
issns = pd.merge(issns, df_issnl, on='issn', how='left')
issns
# In[16]:
# create the DataFrame
# 'oa_status' removed for now
col_names = ['id',
             'issn',
             'issnl',
             'title',
             'starting_year',
             'end_year',
             'url',
             'name_short_iso_4'
             ]
journals = pd.DataFrame(columns = col_names)
journals
# In[17]:
# create the DataFrame
col_names = ['id', 'iso_code']
journals_languages = pd.DataFrame(columns = col_names)
journals_languages
# In[18]:
# create the DataFrame
# 'oa_status' removed
col_names = ['id', 'iso_code']
journals_countries = pd.DataFrame(columns = col_names)
journals_countries
# In[19]:
# extract the information from the ISSN.org data
for index, row in issns.iterrows():
    myid = row['id']
    myissn = row['issn']
    # print progress every 10 journals
    if index % 10 == 0:
        print(index)
    # initialise the fields to extract
    issnl = np.nan
    title = ''
    keytitle = ''
    starting_year = np.nan
    end_year = np.nan
    myurl = np.nan
    journal_country = np.nan
    journal_language = np.nan
    keytitle_abbr = np.nan
    # parse the JSON exported from ISSN.org
    if os.path.exists('issn/data/' + myissn + '.json'):
        with open('issn/data/' + myissn + '.json', 'r', encoding='utf-8') as f:
            data = json.load(f)
        for x in data['@graph']:
            if '@id' in x:
                if x['@id'] == 'resource/ISSN/' + myissn:
                    if 'mainTitle' in x:
                        title = x['mainTitle']
                    elif 'name' in x:
                        title = x['name']
                        # print(myissn)
                    if 'startDate' in x:
                        starting_year = x['startDate']
                    if 'endDate' in x:
                        end_year = x['endDate']
                    if 'url' in x:
                        urls = x['url']
                        if type(urls) is list:
                            for url in urls:
                                # filter out the URLs of archives:
                                # www.ncbi.nlm.nih.gov/pmc/*
                                # www.pubmedcentral.gov/*
                                # pubmedcentral.nih.gov/*
                                # bibpurl.oclc.org/*
                                # www.jstor.org/*
                                # ieeexplore.ieee.org
                                # ovidsp.ovid.com
                                # and keep the first of the remaining ones
                                myurl = url
                                if ('ncbi.nlm.nih.gov' not in url
                                        and 'pubmedcentral' not in url
                                        and 'bibpurl.oclc.org' not in url
                                        and 'jstor.org' not in url
                                        and 'ieeexplore.ieee.org' not in url
                                        and 'ovidsp.ovid.com' not in url):
                                    break
                        else:
                            myurl = x['url']
                    if 'spatial' in x:
                        countries = x['spatial']
                        if type(countries) is list:
                            for country in countries:
                                if 'https://www.iso.org/obp/ui/#iso:code:3166:' in country:
                                    journal_country = country[-2:]
                                    journals_countries = journals_countries.append({'id' : myid, 'iso_code' : journal_country}, ignore_index=True)
                        else:
                            if 'https://www.iso.org/obp/ui/#iso:code:3166:' in countries:
                                journal_country = countries[-2:]
                                journals_countries = journals_countries.append({'id' : myid, 'iso_code' : journal_country}, ignore_index=True)
                    # language, e.g. "inLanguage": "http://id.loc.gov/vocabulary/iso639-2/eng"
                    if 'inLanguage' in x:
                        languages = x['inLanguage']
                        if type(languages) is list:
                            for language in languages:
                                journal_language = language[-3:]
                                journals_languages = journals_languages.append({'id' : myid, 'iso_code' : journal_language}, ignore_index=True)
                        else:
                            journal_language = languages[-3:]
                            journals_languages = journals_languages.append({'id' : myid, 'iso_code' : journal_language}, ignore_index=True)
                if x['@id'] == 'resource/ISSN/' + myissn + '#KeyTitle':
                    if 'value' in x:
                        keytitle = x['value']
                if x['@id'] == 'resource/ISSN/' + myissn + '#ISSN-L':
                    if 'value' in x:
                        issnl = x['value']
                # e.g. "@id": "resource/ISSN/1098-0121#AbbreviatedKeyTitle"
                if x['@id'] == 'resource/ISSN/' + myissn + '#AbbreviatedKeyTitle':
                    if 'value' in x:
                        mykeytitle_abbrs = x['value']
                        if type(mykeytitle_abbrs) is list:
                            # keep the first abbreviation and log the ISSN
                            for mykeytitle_abbr in mykeytitle_abbrs:
                                print(myissn + ' - AbbreviatedKeyTitle is a list ' + mykeytitle_abbr)
                                keytitle_abbr = mykeytitle_abbr
                                with open('sample/03_journals_issn_multiple_titles.txt', 'a', encoding='utf-8') as g:
                                    g.write(myissn + ' AbbreviatedKeyTitle is a list ' + mykeytitle_abbr + '\n')
                                break
                        else:
                            keytitle_abbr = mykeytitle_abbrs
        # prefer the key title when present
        if keytitle != '':
            title = keytitle
        if title != '':
            # remove the trailing dot
            if title[-1] == '.':
                title = title[0:-1]
            # replace the special characters '˜The œ'
            if type(title) is list:
                # keep the first title and log the ISSN
                for mytitlei in title:
                    print(myissn + ' - title is a list ' + mytitlei)
                    title = str.replace(mytitlei, '˜The œ', 'The ')
                    with open('sample/03_journals_issn_multiple_titles.txt', 'a', encoding='utf-8') as g:
                        g.write(myissn + ' title is a list ' + mytitlei + '\n')
                    break
            else:
                title = str.replace(title, '˜The œ', 'The ')
    else:
        print(row['issn'] + ' - not found')
        with open('sample/03_journals_issn_errors.txt', 'a', encoding='utf-8') as g:
            g.write(row['issn'] + ' not found \n')
    journals.at[index, 'id'] = myid
    journals.at[index, 'title'] = title
    journals.at[index, 'issn'] = myissn
    journals.at[index, 'issnl'] = issnl
    journals.at[index, 'starting_year'] = starting_year
    journals.at[index, 'end_year'] = end_year
    journals.at[index, 'url'] = myurl
    journals.at[index, 'name_short_iso_4'] = keytitle_abbr
# In[20]:
journals
# In[21]:
# journals without a title
journals.loc[journals['title'] == '']
# In[22]:
# CSV export of the journals without a title
journals.loc[journals['title'] == ''].to_csv('sample/journals_sans_titre.tsv', sep='\t', encoding='utf-8', index=False)
# In[23]:
# Excel export of the journals without a title
journals.loc[journals['title'] == ''].to_excel('sample/journals_sans_titre.xlsx', index=False)
# In[24]:
# keep only the rows with a title
journals = journals.loc[journals['title'] != '']
journals
# In[25]:
journals.shape[0]
# ## Languages
# In[26]:
journals_languages
# In[27]:
# open the languages table
languages = pd.read_csv('sample/language.tsv', encoding='utf-8', header=0, sep='\t')
languages
# In[28]:
# drop 'name' and rename 'id' to 'language'
del languages['name']
languages = languages.rename(columns={'id' : 'language'})
# In[29]:
# merge with the languages table
journals_languages = pd.merge(journals_languages, languages, on='iso_code', how='left')
journals_languages
# In[30]:
# concatenate the values sharing the same id
# (note: unmatched iso codes become the string 'nan' after astype(str))
journals_languages['language'] = journals_languages['language'].astype(str)
journals_languages = journals_languages.groupby('id').agg({'language': lambda x: ', '.join(x)})
journals_languages
# In[31]:
# attach the language ids to the journals
journals = pd.merge(journals, journals_languages, on='id', how='left')
journals
# ## Countries
# In[32]:
journals_countries
# In[33]:
# open the countries table
country = pd.read_csv('sample/country.tsv', encoding='utf-8', header=0, sep='\t')
country
# In[34]:
# drop 'name' and rename 'id' to 'country'
del country['name']
country = country.rename(columns={'id' : 'country'})
# In[35]:
# merge with the countries table
journals_countries = pd.merge(journals_countries, country, on='iso_code', how='left')
journals_countries
# In[36]:
# concatenate the values sharing the same id
journals_countries['country'] = journals_countries['country'].astype(str)
journals_countries = journals_countries.groupby('id').agg({'country': lambda x: ', '.join(x)})
journals_countries
# In[37]:
# attach the country ids to the journals
journals = pd.merge(journals, journals_countries, on='id', how='left')
journals
# ### DOAJ
# In[38]:
# add the DOAJ info
doaj = pd.read_csv('doaj/journalcsv__doaj_20210312_0636_utf8.csv', encoding='utf-8', header=0)
doaj
# In[39]:
# build a single 'issn' column: prefer the print ISSN, fall back to the e-ISSN
doaj['issn'] = doaj['Journal ISSN (print version)']
doaj.loc[doaj['issn'].isna(), 'issn'] = doaj['Journal EISSN (online version)']
doaj
# In[40]:
doaj = pd.merge(doaj, df_issnl, on='issn', how='left')
doaj
# In[41]:
doaj.columns
# In[42]:
doaj['Preservation Services']
# In[43]:
doaj['DOAJ Seal']
# In[44]:
doaj['issnl']
# In[45]:
doaj['APC'].value_counts()
# In[46]:
# DOAJ fields to add:
# Journal title
# DOAJ Seal
# APC
doaj_for_merge = doaj[['issnl', 'Journal title', 'DOAJ Seal', 'APC']]
doaj_for_merge
# In[47]:
# rename the columns
doaj_for_merge = doaj_for_merge.rename(columns={'Journal title' : 'doaj_title', 'DOAJ Seal' : 'doaj_seal'})
doaj_for_merge
# In[48]:
# merge with journals
journals = pd.merge(journals, doaj_for_merge, on='issnl', how='left')
journals
# In[49]:
# flag the presence in DOAJ and convert the DOAJ Seal to 0/1
journals.loc[journals['doaj_title'].isna(), 'doaj_status'] = 0
journals.loc[~journals['doaj_title'].isna(), 'doaj_status'] = 1
journals.loc[journals['doaj_seal'] == 'Yes', 'doaj_seal'] = 1
journals.loc[journals['doaj_seal'] == 'No', 'doaj_seal'] = 0
journals
# ### LOCKSS
# In[50]:
# add the LOCKSS preservation info (Portico and the National Licences follow below)
lockss = pd.read_csv('lockss/keepers-LOCKSS-report.csv', encoding='utf-8', header=0, skiprows=1)
lockss
# In[51]:
# build a single 'issn' column: prefer the e-ISSN, fall back to the print ISSN
lockss['issn'] = lockss['eISSN']
lockss.loc[lockss['eISSN'].isna(), 'issn'] = lockss['ISSN']
lockss
# In[52]:
lockss = pd.merge(lockss, df_issnl, on='issn', how='left')
lockss
# In[53]:
lockss.columns
# In[54]:
# check the rows left unmatched by the merge
lockss.loc[lockss['issnl'].isna()]
# In[55]:
# fall back to the ISSN for these rows
lockss.loc[lockss['issnl'].isna(), 'issnl'] = lockss['issn']
# In[56]:
# check again for unmatched rows
lockss.loc[lockss['issnl'].isna()]
# In[57]:
# LOCKSS fields to add:
# Title
lockss_for_merge = lockss[['issnl', 'Title']]
lockss_for_merge
# In[58]:
# rename the columns
lockss_for_merge = lockss_for_merge.rename(columns={'Title' : 'lockss_title'})
lockss_for_merge
# In[59]:
# merge with journals
journals = pd.merge(journals, lockss_for_merge, on='issnl', how='left')
journals
# In[60]:
# remove duplicates
journals = journals.drop_duplicates(subset=['id'])
journals
# In[61]:
# flag the presence in LOCKSS
journals.loc[journals['lockss_title'].isna(), 'lockss'] = 0
journals.loc[~journals['lockss_title'].isna(), 'lockss'] = 1
journals
# ### Portico
# In[62]:
# add the Portico preservation info
portico = pd.read_excel('portico/e-journals.xlsx', sheet_name='Details', skiprows=2)
portico
# In[63]:
# build a single 'issn' column: prefer the e-ISSN, fall back to the print ISSN
portico['issn'] = portico['e-ISSN']
portico.loc[portico['e-ISSN'].isna(), 'issn'] = portico['Print ISSN']
portico
# In[64]:
portico = pd.merge(portico, df_issnl, on='issn', how='left')
portico
# In[65]:
portico.columns
# In[66]:
# check the rows left unmatched by the merge
portico.loc[portico['issnl'].isna()]
# In[67]:
# fall back to the ISSN for these rows
portico.loc[portico['issnl'].isna(), 'issnl'] = portico['issn']
# In[68]:
# check again for unmatched rows
portico.loc[portico['issnl'].isna()]
# In[69]:
# Portico fields to add:
# Status
portico_for_merge = portico[['issnl', 'Status']]
portico_for_merge
# In[70]:
# keep only the "preserved" rows
portico_for_merge = portico_for_merge.loc[portico_for_merge['Status'] == 'preserved']
portico_for_merge
# In[71]:
# rename the columns
portico_for_merge = portico_for_merge.rename(columns={'Status' : 'portico_status'})
portico_for_merge
# In[72]:
# merge with journals
journals = pd.merge(journals, portico_for_merge, on='issnl', how='left')
journals
# In[73]:
# remove duplicates
journals = journals.drop_duplicates(subset=['id'])
journals
# In[74]:
# flag the presence in Portico
journals.loc[journals['portico_status'].isna(), 'portico'] = 0
journals.loc[~journals['portico_status'].isna(), 'portico'] = 1
journals
# ### National Licences
# In[75]:
# add the National Licences preservation info (Cambridge)
nlch1 = pd.read_excel('licences_nationales/cambridge_Switzerland_NationalLicences_2020-08-17.xlsx')
nlch1
# In[76]:
# add the National Licences preservation info (Gruyter)
nlch2 = pd.read_excel('licences_nationales/gruyter_Switzerland_NationalLicences_2020-11-30.xlsx')
nlch2
# In[77]:
# add the National Licences preservation info (Oxford)
nlch3 = pd.read_excel('licences_nationales/oxford_Switzerland_NationalLicences_2020-09-24.xlsx')
nlch3
# In[78]:
# add the National Licences preservation info (Springer)
nlch4 = pd.read_excel('licences_nationales/springer_Switzerland_NationalLicences_2020-08-12.xlsx')
nlch4
# In[79]:
# concatenate the four publisher files
nlch = pd.concat([nlch1, nlch2, nlch3, nlch4], ignore_index=True)
nlch
# In[80]:
nlch.columns
# In[81]:
# build a single 'issn' column: prefer the online identifier, fall back to the print identifier
nlch['issn'] = nlch['online_identifier']
nlch.loc[nlch['online_identifier'].isna(), 'issn'] = nlch['print_identifier']
nlch
# In[82]:
nlch = pd.merge(nlch, df_issnl, on='issn', how='left')
nlch
# In[83]:
# check the rows left unmatched by the merge
nlch.loc[nlch['issnl'].isna()]
# In[84]:
# fall back to the ISSN for these rows
nlch.loc[nlch['issnl'].isna(), 'issnl'] = nlch['issn']
# In[85]:
# check again for unmatched rows
nlch.loc[nlch['issnl'].isna()]
# In[86]:
# National Licences fields to add:
# publication_title
nlch_for_merge = nlch[['issnl', 'publication_title']]
nlch_for_merge
# In[87]:
# rename the columns
nlch_for_merge = nlch_for_merge.rename(columns={'publication_title' : 'nlch_title'})
nlch_for_merge
# In[88]:
# merge with journals
journals = pd.merge(journals, nlch_for_merge, on='issnl', how='left')
journals
# In[89]:
# flag the presence in the National Licences
journals.loc[journals['nlch_title'].isna(), 'nlch'] = 0
journals.loc[~journals['nlch_title'].isna(), 'nlch'] = 1
journals
# ### QOAM
# In[90]:
# open the QOAM file
qoam = pd.read_csv('qoam/qoam_not_zero.tsv', encoding='utf-8', header=0, sep='\t')
qoam
# In[91]:
qoam = pd.merge(qoam, df_issnl, on='issn', how='left')
qoam
# In[92]:
# check the rows left unmatched by the merge
qoam.loc[qoam['issnl'].isna()]
# In[93]:
# fall back to the ISSN for these rows
qoam.loc[qoam['issnl'].isna(), 'issnl'] = qoam['issn']
# In[94]:
# check again for unmatched rows
qoam.loc[qoam['issnl'].isna()]
# In[95]:
# QOAM fields to add:
# qoam_av_score
qoam_for_merge = qoam[['issnl', 'qoam_av_score']]
qoam_for_merge
# In[96]:
# merge with journals
journals = pd.merge(journals, qoam_for_merge, on='issnl', how='left')
journals
# In[97]:
# remove duplicates
journals = journals.drop_duplicates(subset=['id'])
journals
# ## Finalising the journals table
# In[98]:
# check for duplicates on issnl
journals_doublons = journals[['issn', 'issnl', 'title']].loc[journals.duplicated(subset='issnl')].sort_values(by='issnl')
journals_doublons
# In[99]:
journals_doublons = journals_doublons.loc[journals_doublons['issnl'].notna()]
# In[100]:
# merge to flag the duplicated rows
journals_doublons['doublon_issnl'] = 1
journals = pd.merge(journals, journals_doublons[['issnl', 'doublon_issnl']], on='issnl', how='left')
journals.loc[journals['doublon_issnl'] == 1]
# In[101]:
journals.loc[journals['doublon_issnl'] == 1].sort_values(by='issnl')
# In[102]:
# CSV export of the duplicates
journals.loc[journals['doublon_issnl'] == 1].sort_values(by='issnl').to_csv('sample/journals_duplicates.tsv', sep='\t', encoding='utf-8', index=False)
# In[103]:
# Excel export of the duplicates
journals.loc[journals['doublon_issnl'] == 1].sort_values(by='issnl').to_excel('sample/journals_duplicates.xlsx', index=False)
# In[104]:
# remove the duplicates
journals = journals.drop_duplicates(subset=['issnl'])
journals
# In[105]:
# add the oa_status (default: 1, UNKNOWN)
# 6 : Diamond
# 5 : Gold
# 4 : Full
# 3 : Hybrid
# 2 : Green
# 1 : UNKNOWN
journals['oa_status'] = 1
journals
# In[106]:
# status 5 (Gold) for DOAJ journals
journals.loc[journals['doaj_status'] == 1, 'oa_status'] = 5
# status 6 (Diamond) for DOAJ journals without APCs
journals.loc[(journals['doaj_status'] == 1) & (journals['APC'] == 'No'), 'oa_status'] = 6
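# note: only statuses 1 (UNKNOWN), 5 (Gold) and 6 (Diamond) are assigned in
# this notebook; Green (2), Hybrid (3) and Full (4) are not derived here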
journals
# In[107]:
journals['oa_status'].value_counts()
# In[108]:
# raw CSV export
journals.to_csv('sample/journals_brut.tsv', sep='\t', encoding='utf-8', index=False)
# In[109]:
# raw Excel export
journals.to_excel('sample/journals_brut.xlsx', index=False)
# In[110]:
# CSV export of the ids
journals[['id', 'title', 'issn', 'issnl']].to_csv('sample/journals_ids.tsv', sep='\t', encoding='utf-8', index=False)
# In[111]:
# Excel export of the ids
journals[['id', 'title', 'issn', 'issnl']].to_excel('sample/journals_ids.xlsx', index=False)
# In[ ]:
