#!/usr/bin/env python
# coding: utf-8
# # Open Access Compliance Check Tool (OACCT) project
#
# Project P5 of the EPFL Library in collaboration with the libraries of the Universities of Geneva, Lausanne and Bern: https://www.swissuniversities.ch/themen/digitalisierung/p-5-wissenschaftliche-information/projekte/swiss-mooc-service-1-1-1-1
#
# This notebook transforms the data extracted from the different sources and exports it into the tables of the OACCT application.
#
# Author: **Pablo Iriarte**, Université de Genève (pablo.iriarte@unige.ch)
# Last updated: 08.09.2021
# In[1]:
import pandas as pd
import csv
import json
import numpy as np
import os
# display all columns
pd.set_option('display.max_columns', None)
# define the starting id
id_start = 1
# ## Import the file extracted from Sherpa
# In[2]:
sherpa = pd.read_csv('sample/sherpa_policies_brut.tsv', encoding='utf-8', header=0, sep='\t')
sherpa
# In[3]:
# check the values for the versions
sherpa['article_version'].value_counts()
# In[4]:
# check the values for the ISSNs
sherpa.loc[sherpa['issn'].isna()]
# In[5]:
# add the ISSN-L
issns = pd.read_csv('issn/20171102.ISSN-to-ISSN-L.txt', encoding='utf-8', header=0, sep='\t')
issns
# In[6]:
# rename the columns
issns = issns.rename(columns={'ISSN' : 'issn', 'ISSN-L' : 'issnl'})
issns
# In[7]:
# merge with the sherpa table
sherpa = pd.merge(sherpa, issns, on='issn', how='left')
sherpa
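# Optional sanity check (a minimal sketch, using only the 'issnl' column created by the
# merge above): share of Sherpa rows that actually received an ISSN-L from the mapping file.
print('rows with an ISSN-L: {:.1%}'.format(sherpa['issnl'].notna().mean()))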
# In[8]:
# check the values for the ISSN-L
sherpa.loc[sherpa['issnl'].isna()]
# In[9]:
# extract the IR archiving + embargo data by ISSN
sherpa_ir = sherpa[['issnl', ]]
# ## Import the Read & Publish licences file
# In[10]:
rp = pd.read_csv('sample/read_publish_brut_merge.tsv', encoding='utf-8', header=0, sep='\t')
rp
# In[11]:
rp['embargo_months'].value_counts()
# In[12]:
# add the publisher in a single field
# rp.loc[rp['Elsevier'] == 'x', 'public_notes'] = 'Elsevier Read & Publish agreement'
rp.loc[rp['Elsevier'] == 'x', 'rp_publisher'] = 'Elsevier'
rp.loc[rp['Springer Nature'] == 'x', 'rp_publisher'] = 'Springer Nature'
rp.loc[rp['Wiley'] == 'x', 'rp_publisher'] = 'Wiley'
rp.loc[rp['TF'] == 'x', 'rp_publisher'] = 'TF'
rp.loc[rp['CUP'] == 'x', 'rp_publisher'] = 'CUP'
rp
# In[13]:
# check the values for the publishers
rp['rp_publisher'].value_counts()
# In[14]:
# check the values for the licenses
rp['license'].value_counts()
# In[15]:
# drop the unneeded fields and rename the columns
del rp['Elsevier']
del rp['Springer Nature']
del rp['Wiley']
del rp['TF']
del rp['CUP']
del rp['URL']
rp
# In[16]:
# rename the columns
rp = rp.rename(columns = {'Title' : 'title', 'ROR' : 'ror', 'read_publish_id' : 'rp_id'})
rp
# ## Table applicable_version
# In[17]:
# create the DataFrame
col_names = ['id',
'type',
'description'
]
applicable_version = pd.DataFrame(columns = col_names)
# 3 values: published, accepted, submitted
new_row1 = {'id':1, 'type':'submitted', 'description' : 'Submitted version'}
new_row2 = {'id':2, 'type':'accepted', 'description' : 'Accepted version'}
new_row3 = {'id':3, 'type':'published', 'description' : 'Published version'}
# append the rows to the dataframe
applicable_version = applicable_version.append(new_row1, ignore_index=True)
applicable_version = applicable_version.append(new_row2, ignore_index=True)
applicable_version = applicable_version.append(new_row3, ignore_index=True)
applicable_version
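# Note: DataFrame.append() was removed in pandas 2.0; on recent pandas the same rows
# can be added with pd.concat. A minimal sketch equivalent to the cell above, left
# commented out so the append version above remains the one in use:
# applicable_version = pd.concat(
#     [applicable_version, pd.DataFrame([new_row1, new_row2, new_row3])],
#     ignore_index=True)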
# In[18]:
# add the UNKNOWN value
applicable_version = applicable_version.append({'id' : 999999, 'type' : 'UNKNOWN', 'description' : 'UNKNOWN'}, ignore_index=True)
applicable_version
# In[19]:
# keep the final fields
applicable_version_export = applicable_version[['id', 'description']]
# In[20]:
# export the applicable_version table
result = applicable_version_export.to_json(orient='records', force_ascii=False)
parsed = json.loads(result)
with open('sample/version.json', 'w', encoding='utf-8') as file:
    json.dump(parsed, file, indent=2, ensure_ascii=False)
# In[21]:
# export csv
applicable_version_export.to_csv('sample/version.tsv', sep='\t', encoding='utf-8', index=False)
# In[22]:
# export excel
applicable_version_export.to_excel('sample/version.xlsx', index=False)
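# The JSON/TSV/XLSX export pattern above is repeated for every table in this notebook;
# a small helper (a sketch, reusing only modules already imported here) keeps it in one place:
def export_table(df, basename, folder='sample'):
    # write the table as JSON, TSV and XLSX under the given folder
    records = json.loads(df.to_json(orient='records', force_ascii=False))
    with open(os.path.join(folder, basename + '.json'), 'w', encoding='utf-8') as f:
        json.dump(records, f, indent=2, ensure_ascii=False)
    df.to_csv(os.path.join(folder, basename + '.tsv'), sep='\t', encoding='utf-8', index=False)
    df.to_excel(os.path.join(folder, basename + '.xlsx'), index=False)

# example with the same output as the three cells above: export_table(applicable_version_export, 'version')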
# In[23]:
# merge with the sherpa table
sherpa = pd.merge(sherpa, applicable_version[['id', 'type']], left_on='article_version', right_on='type', how='left')
sherpa
# In[24]:
sherpa = sherpa.rename(columns = {'id_x' : 'id', 'id_y' : 'version'})
del sherpa['type']
sherpa
# In[25]:
# merge with the read & publish table
rp = pd.merge(rp, applicable_version[['id', 'type']], left_on='article_version', right_on='type', how='left')
rp
# In[26]:
rp = rp.rename(columns = {'id' : 'version'})
del rp['type']
rp
# ## Table oa_licence
# In[27]:
# create the DataFrame
# 'version' is not used; we deduplicate by name without the version
col_names = ['id',
'name',
'url'
]
oa_licence = pd.DataFrame(columns = col_names)
oa_licence
# In[28]:
# check the licence values
sherpa['license'].value_counts()
# In[29]:
sherpa_licences = sherpa['license'].drop_duplicates()
sherpa_licences = sherpa_licences.dropna()
sherpa_licences
# In[30]:
oa_licence['sherpa_code'] = np.nan
oa_licence
# In[31]:
for code in sherpa_licences:
    print(code)
    oa_licence = oa_licence.append({'sherpa_code' : code}, ignore_index=True)
# In[32]:
oa_licence
# In[33]:
# convert the index into an id
oa_licence = oa_licence.reset_index()
# add the id as index + 1
oa_licence['id'] = oa_licence['index'] + 1
del oa_licence['index']
oa_licence
# In[34]:
# add the names and URLs
oa_licence.loc[oa_licence['sherpa_code'] == 'cc_by', 'name'] = 'CC BY'
oa_licence.loc[oa_licence['sherpa_code'] == 'cc_by', 'url'] = 'https://creativecommons.org/licenses/by/4.0/'
oa_licence.loc[oa_licence['sherpa_code'] == 'cc_by_sa', 'name'] = 'CC BY-SA'
oa_licence.loc[oa_licence['sherpa_code'] == 'cc_by_sa', 'url'] = 'https://creativecommons.org/licenses/by-sa/4.0/'
oa_licence.loc[oa_licence['sherpa_code'] == 'cc_by_nc', 'name'] = 'CC BY-NC'
oa_licence.loc[oa_licence['sherpa_code'] == 'cc_by_nc', 'url'] = 'https://creativecommons.org/licenses/by-nc/4.0/'
oa_licence.loc[oa_licence['sherpa_code'] == 'cc_by_nc_sa', 'name'] = 'CC BY-NC-SA'
oa_licence.loc[oa_licence['sherpa_code'] == 'cc_by_nc_sa', 'url'] = 'https://creativecommons.org/licenses/by-nc-sa/4.0/'
oa_licence.loc[oa_licence['sherpa_code'] == 'cc_by_nd', 'name'] = 'CC BY-ND'
oa_licence.loc[oa_licence['sherpa_code'] == 'cc_by_nd', 'url'] = 'https://creativecommons.org/licenses/by-nd/4.0/'
oa_licence.loc[oa_licence['sherpa_code'] == 'cc_by_nc_nd', 'name'] = 'CC BY-NC-ND'
oa_licence.loc[oa_licence['sherpa_code'] == 'cc_by_nc_nd', 'url'] = 'https://creativecommons.org/licenses/by-nc-nd/4.0/'
oa_licence.loc[oa_licence['sherpa_code'] == 'cc0', 'name'] = 'CC0'
oa_licence.loc[oa_licence['sherpa_code'] == 'cc0', 'url'] = 'https://creativecommons.org/publicdomain/zero/1.0/'
oa_licence.loc[oa_licence['sherpa_code'] == 'bespoke_license', 'name'] = 'Specific license'
oa_licence.loc[oa_licence['sherpa_code'] == 'bespoke_license', 'url'] = ''
oa_licence.loc[oa_licence['sherpa_code'] == 'all_rights_reserved', 'name'] = 'All rights reserved'
oa_licence.loc[oa_licence['sherpa_code'] == 'all_rights_reserved', 'url'] = ''
oa_licence.loc[oa_licence['sherpa_code'] == 'cc_gnu_gpl', 'name'] = 'GNU GPL'
oa_licence.loc[oa_licence['sherpa_code'] == 'cc_gnu_gpl', 'url'] = 'http://gnugpl.org/'
oa_licence.loc[oa_licence['sherpa_code'] == 'cc_public_domain', 'name'] = 'Public domain'
oa_licence.loc[oa_licence['sherpa_code'] == 'cc_public_domain', 'url'] = 'https://creativecommons.org/share-your-work/public-domain/'
# oa_licence.loc[oa_licence['sherpa_code'] == 'bespoke_license', 'url'] = 'https://port.sas.ac.uk/mod/book/view.php?id=1340&chapterid=1003'
oa_licence
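# The same mapping can also be expressed as a dict and applied with Series.map
# (a sketch with the same codes and names as the assignments above; left commented
# out so the .loc version above remains the one in use):
# licence_names = {'cc_by': 'CC BY', 'cc_by_sa': 'CC BY-SA', 'cc_by_nc': 'CC BY-NC',
#                  'cc_by_nc_sa': 'CC BY-NC-SA', 'cc_by_nd': 'CC BY-ND',
#                  'cc_by_nc_nd': 'CC BY-NC-ND', 'cc0': 'CC0',
#                  'bespoke_license': 'Specific license',
#                  'all_rights_reserved': 'All rights reserved',
#                  'cc_gnu_gpl': 'GNU GPL', 'cc_public_domain': 'Public domain'}
# oa_licence['name'] = oa_licence['sherpa_code'].map(licence_names)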
# In[35]:
# add the UNKNOWN value
oa_licence = oa_licence.append({'id' : 999999, 'sherpa_code' : '___', 'name' : 'UNKNOWN', 'url' : ''}, ignore_index=True)
oa_licence
# In[36]:
# add to the sherpa and rp tables
sherpa = sherpa.rename(columns = {'license' : 'sherpa_code'})
sherpa
# In[37]:
# add to the sherpa and rp tables
rp = rp.rename(columns = {'license' : 'sherpa_code'})
rp
# In[38]:
# merge
sherpa = pd.merge(sherpa, oa_licence[['sherpa_code', 'id']], on='sherpa_code', how='left')
sherpa
# In[39]:
sherpa = sherpa.rename(columns = {'id_x' : 'id', 'id_y' : 'licence'})
sherpa
# In[40]:
# merge
rp = pd.merge(rp, oa_licence[['sherpa_code', 'id']], on='sherpa_code', how='left')
rp
# In[41]:
rp = rp.rename(columns = {'id' : 'licence'})
rp
# In[42]:
# rename the final fields
oa_licence_export = oa_licence[['id', 'name', 'url']]
oa_licence_export = oa_licence_export.rename(columns={'name' : 'name_or_abbrev', 'url' : 'website'})
# In[43]:
# export the oa_licence table
result = oa_licence_export.to_json(orient='records', force_ascii=False)
parsed = json.loads(result)
with open('sample/licence.json', 'w', encoding='utf-8') as file:
    json.dump(parsed, file, indent=2, ensure_ascii=False)
# In[44]:
# export csv
oa_licence_export.to_csv('sample/licence.tsv', sep='\t', encoding='utf-8', index=False)
# In[45]:
# export excel
oa_licence_export.to_excel('sample/licence.xlsx', index=False)
# ## Table cost_factor_type
# In[46]:
# create the DataFrame
col_names = ['id',
'name'
]
cost_factor_type = pd.DataFrame(columns = col_names)
cost_factor_type = cost_factor_type.append({'id' : 1, 'name' : 'APC'}, ignore_index=True)
cost_factor_type = cost_factor_type.append({'id' : 2, 'name' : 'Discount'}, ignore_index=True)
cost_factor_type = cost_factor_type.append({'id' : 3, 'name' : 'Refund'}, ignore_index=True)
cost_factor_type
# In[47]:
# add the UNKNOWN value
cost_factor_type = cost_factor_type.append({'id' : 999999, 'name' : 'UNKNOWN'}, ignore_index=True)
cost_factor_type
# In[48]:
# export the table
result = cost_factor_type.to_json(orient='records', force_ascii=False)
parsed = json.loads(result)
with open('sample/cost_factor_type.json', 'w', encoding='utf-8') as file:
    json.dump(parsed, file, indent=2, ensure_ascii=False)
# In[49]:
# export csv
cost_factor_type.to_csv('sample/cost_factor_type.tsv', sep='\t', encoding='utf-8', index=False)
# In[50]:
# export excel
cost_factor_type.to_excel('sample/cost_factor_type.xlsx', index=False)
# ## Table cost_factor
# ### Add the APC data from DOAJ
# In[51]:
# add the DOAJ info
doaj = pd.read_csv('doaj/journalcsv__doaj_20210312_0636_utf8.csv', encoding='utf-8', header=0)
doaj
# In[52]:
# keep the rows with an APC
doaj_apc = doaj.loc[doaj['APC'] == 'Yes'][['Journal ISSN (print version)', 'Journal EISSN (online version)', 'APC amount']]
doaj_apc
# In[53]:
# keep the rows with APC = No
doaj_apc_no = doaj.loc[doaj['APC'] == 'No'][['Journal ISSN (print version)', 'Journal EISSN (online version)']]
doaj_apc_no
# In[54]:
# assign the value 0
doaj_apc_no['APC amount'] = 0
doaj_apc_no
# In[55]:
# append to the APC table
doaj_apc = doaj_apc.append(doaj_apc_no, ignore_index=True)
doaj_apc
# In[56]:
# split the price into 'amount' and 'symbol'
doaj_apc[['amount', 'symbol']] = doaj_apc['APC amount'].str.split(' ', n=1, expand=True)
doaj_apc
# In[57]:
doaj_apc.loc[doaj_apc['APC amount'] == 0, 'amount'] = 0
doaj_apc.loc[doaj_apc['APC amount'] == 0, 'symbol'] = ''
doaj_apc
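# 'amount' now mixes strings (from the split) and the integer 0; if numeric comparisons
# were needed later, a sketch to coerce it (assumption: the amounts carry no thousands
# separator; left commented out so the exported values keep their original form):
# doaj_apc['amount'] = pd.to_numeric(doaj_apc['amount'], errors='coerce')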
# In[58]:
# add the missing fields
doaj_apc['cost_factor_type'] = 1
doaj_apc['comment'] = 'Source: DOAJ'
doaj_apc
# In[59]:
# rename the fields
doaj_apc = doaj_apc.rename(columns = {'Journal ISSN (print version)' : 'issn_print', 'Journal EISSN (online version)' : 'issn_electronic'})
doaj_apc
# In[60]:
# add the issn
doaj_apc['issn'] = doaj_apc['issn_electronic']
doaj_apc
# In[61]:
doaj_apc.loc[doaj_apc['issn'].isna()]
# In[62]:
# use the print ISSN when the issn is empty
doaj_apc.loc[doaj_apc['issn'].isna(), 'issn'] = doaj_apc['issn_print']
doaj_apc.loc[doaj_apc['issn'].isna()]
# In[63]:
doaj_apc = pd.merge(doaj_apc, issns, on='issn', how='left')
doaj_apc
# In[64]:
# rename the columns
doaj_apc = doaj_apc.rename(columns={'issnl' : 'issn_link'})
doaj_apc
# ### Add the APCs from the Journal Database (Zurich Open Repository and Archive)
#
# https://www.jdb.uzh.ch/
# In[65]:
# JDB database from Zurich
jdb = pd.read_csv('zora/jdb_apcs.tsv', encoding='utf-8', header=0, sep='\t')
jdb
# In[66]:
# rename the id
jdb = jdb.rename(columns = {'id' : 'jdb_id'})
jdb
# In[67]:
# add the missing fields
jdb['cost_factor_type'] = 1
jdb['comment'] = 'Source: JDB (' + jdb['apc_date'].astype(str) + ')'
jdb
# In[68]:
# rename the fields
jdb = jdb.rename(columns = {'apc_fee' : 'amount', 'apc_currency' : 'symbol'})
jdb
# In[69]:
jdb = jdb.drop_duplicates(subset='jdb_id', keep='last')
# In[70]:
# import openapc with the maximum values
openapc = pd.read_csv('openapc/open_apc_max.tsv', encoding='utf-8', header=0, sep='\t')
openapc
# In[71]:
# rename the fields
openapc = openapc.rename(columns = {'period' : 'apc_date', 'issn_l' : 'issn_link', 'euro' : 'amount'})
openapc
# In[72]:
# add the cost factor type and the symbol
openapc['cost_factor_type'] = 1
openapc['jdb_id'] = np.nan
openapc['symbol'] = 'EUR'
openapc['comment'] = 'Source: OpenAPC (' + openapc['apc_date'].astype(str) + ')'
openapc
# In[73]:
# append the openapc rows
jdb = jdb.append(openapc, ignore_index=True)
jdb
# In[74]:
# drop the duplicates by issnl and date
jdb = jdb.drop_duplicates(subset=['issn_link', 'apc_date'], keep='first')
jdb
# In[75]:
# append DOAJ
cost_factor = doaj_apc.append(jdb, ignore_index=True)
cost_factor
# In[76]:
# check the issnl
cost_factor.loc[cost_factor['issn_link'].isna()]
# In[77]:
# merge with the issnl
cost_factor = pd.merge(cost_factor, issns, on='issn', how='left')
cost_factor
# In[78]:
# check the issnl
cost_factor.loc[cost_factor['issnl'].isna()]
# In[79]:
# fill in the issn when it is missing
cost_factor.loc[cost_factor['issn'].isna(), 'issn'] = cost_factor['issn_print']
cost_factor.loc[cost_factor['issn'].isna(), 'issn'] = cost_factor['issn_electronic']
cost_factor.loc[cost_factor['issn'].isna(), 'issn'] = cost_factor['issn_link']
cost_factor.loc[cost_factor['issn'].isna()]
# In[80]:
# fill in the issnl when it is missing
cost_factor.loc[cost_factor['issnl'].isna(), 'issnl'] = cost_factor['issn_link']
cost_factor.loc[cost_factor['issnl'].isna(), 'issnl'] = cost_factor['issn_print']
cost_factor.loc[cost_factor['issnl'].isna(), 'issnl'] = cost_factor['issn_electronic']
cost_factor.loc[cost_factor['issnl'].isna(), 'issnl'] = cost_factor['issn']
cost_factor.loc[cost_factor['issnl'].isna()]
# In[81]:
# take the ids for the merge
cost_factor_ids = cost_factor[['issn', 'issnl', 'cost_factor_type', 'amount', 'symbol', 'comment']]
# cost_factor_ids_1 = cost_factor_ids_1.rename(columns = {'issn_link' : 'issn'})
# cost_factor_ids_2 = cost_factor.loc[cost_factor['issn_electronic'].notna()][['issn_electronic', 'cost_factor_type', 'amount', 'symbol', 'comment']]
# cost_factor_ids_2 = cost_factor_ids_2.rename(columns = {'issn_electronic' : 'issn'})
# cost_factor_ids_3 = cost_factor.loc[cost_factor['issn_print'].notna()][['issn_print', 'cost_factor_type', 'amount', 'symbol', 'comment']]
# cost_factor_ids_3 = cost_factor_ids_3.rename(columns = {'issn_print' : 'issn'})
# cost_factor_ids_4 = cost_factor.loc[cost_factor['issn'].notna()][['issn', 'cost_factor_type', 'amount', 'symbol', 'comment']]
# cost_factor_ids = cost_factor_ids_1.append(cost_factor_ids_2)
# cost_factor_ids = cost_factor_ids.append(cost_factor_ids_3)
# cost_factor_ids = cost_factor_ids.append(cost_factor_ids_4)
cost_factor_ids
# In[82]:
# drop the duplicates and the empty values
cost_factor_ids = cost_factor_ids.drop_duplicates(subset=['issnl'])
cost_factor_ids
# In[83]:
# merge the other way round to keep only the rows from the file
cost_factor_ids = pd.merge(cost_factor_ids, sherpa[['id', 'issnl']], on='issnl', how='left')
cost_factor_ids
# In[84]:
# keep the rows that matched in the merge
cost_factor_ids_all = cost_factor_ids.loc[cost_factor_ids['id'].notnull()]
cost_factor_ids_all
# In[85]:
# drop the duplicates
cost_factor_ids_all = cost_factor_ids_all.drop_duplicates(subset=['id'])
cost_factor_ids_all
# In[86]:
# drop the duplicates by issnl
cost_factor_ids_all = cost_factor_ids_all.drop_duplicates(subset=['issnl'])
del cost_factor_ids_all['id']
cost_factor_ids_all
# In[87]:
# convert the index into an id
cost_factor_ids_all = cost_factor_ids_all.reset_index()
# add the id as index + 1
cost_factor_ids_all['cost_factor'] = cost_factor_ids_all['index'] + id_start
del cost_factor_ids_all['index']
cost_factor_ids_all
# In[88]:
# merge with the sherpa table
sherpa = pd.merge(sherpa, cost_factor_ids_all[['issnl', 'cost_factor']], on='issnl', how='left')
sherpa
# In[89]:
sherpa.loc[sherpa['cost_factor'].isna()]
# In[90]:
# keep the APCs only for the published version
sherpa.loc[sherpa['article_version'] != 'published', 'cost_factor'] = np.nan
sherpa.loc[sherpa['cost_factor'].notna()]
# In[91]:
# rename the id of the raw sherpa file
# cost_factor_ids_all = cost_factor_ids_all.rename(columns = {'id' : 'id_sherpa'})
cost_factor_ids_all = cost_factor_ids_all.rename(columns = {'cost_factor' : 'id'})
cost_factor_ids_all
# In[92]:
cost_factor_ids_all['id'] = cost_factor_ids_all['id'].astype(int)
# In[93]:
cost_factor_ids_all
# In[94]:
cost_factor_export = cost_factor_ids_all[['id', 'cost_factor_type', 'amount', 'symbol', 'comment']]
cost_factor_export
# In[95]:
cost_factor_export.shape[0]
# In[96]:
# add a 100% discount entry for the Read & Publish licences
rpid = cost_factor_export.shape[0] + 1
cost_factor_export = cost_factor_export.append({'id' : rpid, 'cost_factor_type' : 2, 'amount' : 100, 'symbol' : '%', 'comment' : 'Read & Publish agreement'}, ignore_index=True)
cost_factor_export
# In[97]:
# add the id in the read & publish table
rp['cost_factor'] = rpid
rp
# In[98]:
# add the UNKNOWN value
cost_factor_export = cost_factor_export.append({'id' : 999999, 'cost_factor_type' : 999999, 'amount' : 0, 'symbol' : '', 'comment' : 'UNKNOWN'}, ignore_index=True)
cost_factor_export
# In[99]:
# export the table
result = cost_factor_export.to_json(orient='records', force_ascii=False)
parsed = json.loads(result)
with open('sample/cost_factor.json', 'w', encoding='utf-8') as file:
    json.dump(parsed, file, indent=2, ensure_ascii=False)
# In[100]:
# export csv
cost_factor_export.to_csv('sample/cost_factor.tsv', index=False)
# In[101]:
# export excel
cost_factor_export.to_excel('sample/cost_factor.xlsx', index=False)
# ## Table term
# In[102]:
sherpa
# In[103]:
# col_names = ['id', 'applicable_version', 'cost_factor', 'embargo', 'archiving']
term_sherpa = sherpa[['id', 'version', 'cost_factor', 'embargo', 'archiving', 'locations_ir', 'locations_not_ir', 'licence', 'journal', 'conditions', 'public_notes', 'prerequisite_funders', 'prerequisite_funders_ror']]
term_sherpa
# In[104]:
# rename the fields
term_sherpa = term_sherpa.rename(columns = {'id' : 'id_sherpa', 'embargo' : 'embargo_months', 'prerequisite_funders_ror' : 'ror'})
term_sherpa
# In[105]:
# merge the fields into the comment: conditions, public_notes, locations_not_ir
term_sherpa['conditions'] = term_sherpa['conditions'].fillna('')
term_sherpa['public_notes'] = term_sherpa['public_notes'].fillna('')
term_sherpa['locations_not_ir'] = term_sherpa['locations_not_ir'].fillna('')
term_sherpa['locations_ir'] = term_sherpa['locations_ir'].fillna('')
term_sherpa.loc[term_sherpa['locations_not_ir'] != '', 'locations_not_ir'] = 'Non institutional archiving locations: ' + term_sherpa['locations_not_ir']
term_sherpa.loc[term_sherpa['locations_ir'] != '', 'locations_ir'] = 'Institutional archiving locations: ' + term_sherpa['locations_ir']
term_sherpa.loc[term_sherpa['archiving'] == False, 'comment'] = term_sherpa['locations_not_ir']
term_sherpa.loc[term_sherpa['archiving'] == True, 'comment'] = term_sherpa['locations_ir']
term_sherpa.loc[term_sherpa['comment'] == '', 'comment'] = 'Conditions: ' + term_sherpa['conditions']
term_sherpa.loc[(term_sherpa['comment'] != '') & (term_sherpa['conditions'] != ''), 'comment'] = term_sherpa['comment'] + ' ; Conditions: ' + term_sherpa['conditions']
term_sherpa.loc[(term_sherpa['public_notes'] != '') & (term_sherpa['public_notes'] != term_sherpa['comment']), 'comment'] = term_sherpa['comment'] + ' ; Public notes: ' + term_sherpa['public_notes']
term_sherpa.loc[(term_sherpa['public_notes'] != '') & (term_sherpa['comment'] == ''), 'comment'] = 'Public notes: ' + term_sherpa['public_notes']
term_sherpa
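# The comment assembly above can also be written as a row-wise helper, which approximates
# the same rules and may be easier to audit (a sketch; left unused so the .loc logic above
# stays the one actually applied):
def build_comment(row):
    # institutional vs non-institutional locations, then conditions, then public notes
    parts = []
    base = row['locations_ir'] if row['archiving'] == True else row['locations_not_ir']
    if base:
        parts.append(base)
    if row['conditions']:
        parts.append('Conditions: ' + row['conditions'])
    if row['public_notes']:
        parts.append('Public notes: ' + row['public_notes'])
    return ' ; '.join(parts)

# term_sherpa['comment'] = term_sherpa.apply(build_comment, axis=1)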
# In[106]:
term_sherpa['prerequisite_funders'].value_counts()
# In[107]:
rp
# In[108]:
term_rp = rp[['rp_id', 'version', 'archiving', 'embargo_months', 'cost_factor', 'licence', 'journal', 'rp_publisher', 'ror', 'valid_from', 'valid_until']]
term_rp
# In[109]:
term_rp['rp_publisher'].value_counts()
# In[110]:
term_rp.loc[term_rp['rp_publisher'] == 'Elsevier', 'comment'] = 'Elsevier Read & Publish agreement'
term_rp.loc[term_rp['rp_publisher'] == 'Wiley', 'comment'] = 'Wiley Read & Publish agreement'
term_rp.loc[term_rp['rp_publisher'] == 'TF', 'comment'] = 'Taylor and Francis Read & Publish agreement'
term_rp.loc[term_rp['rp_publisher'] == 'Springer Nature', 'comment'] = 'Springer Nature Read & Publish agreement'
term_rp.loc[term_rp['rp_publisher'] == 'CUP', 'comment'] = 'Cambridge University Press (CUP) Read & Publish agreement. Article types covered: Research Articles, Review Articles, Rapid Communication, Brief Reports and Case Reports'
del term_rp['rp_publisher']
term_rp
# In[111]:
# concat of the two tables
term_orig = term_sherpa[['id_sherpa', 'version', 'cost_factor', 'embargo_months', 'archiving', 'licence', 'journal', 'prerequisite_funders', 'ror', 'comment']]
term_orig
# In[112]:
term_orig = term_orig.append(term_rp, ignore_index=True, sort=False)
term_orig
# In[113]:
# add a unique hash for each variant
term_orig['id_content_hash'] = term_orig.apply(lambda x: hash(tuple(x[['version', 'cost_factor', 'embargo_months', 'archiving', 'comment']])), axis = 1)
term_orig['id_content_hash_licence'] = term_orig.apply(lambda x: hash(tuple(x[['version', 'cost_factor', 'embargo_months', 'archiving', 'licence', 'comment']])), axis = 1)
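# Caveat: Python's built-in hash() is salted per interpreter session (PYTHONHASHSEED),
# so these hashes are not reproducible between runs. A stable alternative on the same
# fields could use hashlib (a sketch, defined for reference but not applied below):
import hashlib

def stable_hash(row, cols):
    # deterministic digest of the selected fields, identical across sessions
    return hashlib.md5('|'.join(str(row[c]) for c in cols).encode('utf-8')).hexdigest()

# term_orig['id_content_hash'] = term_orig.apply(
#     lambda x: stable_hash(x, ['version', 'cost_factor', 'embargo_months', 'archiving', 'comment']), axis=1)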
# In[114]:
term_orig.sort_values(by='id_content_hash')
# In[115]:
# duplicates
term_orig.loc[term_orig.duplicated(subset='id_content_hash')].sort_values(by='id_content_hash')
# In[116]:
term_orig['licence'] = term_orig['licence'].fillna(999999)
term_orig['licence'] = term_orig['licence'].astype(int)
term_orig['cost_factor'] = term_orig['cost_factor'].fillna(999999)
term_orig['cost_factor'] = term_orig['cost_factor'].astype(int)
# term_orig['embargo_months'] = term_orig['embargo_months'].fillna(0)
# term_orig['embargo_months'] = term_orig['embargo_months'].astype(int)
term_orig.loc[term_orig['archiving'] == True, 'ir_archiving'] = 1
term_orig.loc[term_orig['archiving'] == False, 'ir_archiving'] = 0
term_orig['ir_archiving'] = term_orig['ir_archiving'].fillna(0)
term_orig
# In[117]:
term_orig.loc[term_orig['ir_archiving'].isna()]
# In[118]:
term_orig['ir_archiving'].value_counts()
# In[119]:
term_orig['licence'] = term_orig['licence'].astype(int)
term_orig['ir_archiving'] = term_orig['ir_archiving'].astype(int)
term_orig['cost_factor'] = term_orig['cost_factor'].astype(int)
term_orig
# In[120]:
terms_export_dates = term_orig.loc[(term_orig['valid_from'].notna()) | (term_orig['valid_until'].notna())][['id_content_hash', 'ror', 'valid_from', 'valid_until']]
terms_export_dates
# In[121]:
terms_export = term_orig[['id_sherpa', 'rp_id', 'id_content_hash', 'id_content_hash_licence', 'version', 'cost_factor', 'embargo_months', 'ir_archiving', 'licence', 'comment']]
terms_export
# In[122]:
# check for duplicates
terms_export.loc[terms_export.duplicated(subset='id_content_hash')].sort_values(by='id_content_hash')
# In[123]:
terms_export_dedup = terms_export.drop_duplicates(subset=['id_content_hash'])
terms_export_dedup
# In[124]:
terms_export_dedup_licence = terms_export.drop_duplicates(subset=['id_content_hash_licence'])
terms_export_dedup_licence
# In[125]:
# check for duplicates
terms_export_dedup_licence.loc[terms_export_dedup_licence.duplicated(subset='id_content_hash')].sort_values(by='id_content_hash')
# In[126]:
# totals for the two sources
terms_export_dedup.loc[terms_export_dedup['id_sherpa'].notna()].shape[0]
# In[127]:
terms_export_dedup.loc[terms_export_dedup['rp_id'].notna()].shape[0]
# In[128]:
terms_export_dedup.loc[terms_export_dedup['rp_id'].notna()]
# In[129]:
# convert the index into an id
terms_export_dedup.reset_index(inplace=True)
del terms_export_dedup['index']
terms_export_dedup
# In[130]:
# add the id as index + 1
terms_export_dedup['id'] = terms_export_dedup.index + 1
# del terms_export_dedup['index']
terms_export_dedup
# In[131]:
terms_export_dedup['source'] = ''
terms_export_dedup
# In[132]:
# group by licence
terms_export_dedup_licences = terms_export_dedup_licence[['licence', 'id_content_hash']]
terms_export_dedup_licences
# In[133]:
# concatenate the values that share the same id
terms_export_dedup_licences['licence'] = terms_export_dedup_licences['licence'].astype(str)
terms_export_dedup_licences = terms_export_dedup_licences.groupby('id_content_hash').agg({'licence': lambda x: ', '.join(x)})
terms_export_dedup_licences
# In[134]:
# check the multiple values
terms_export_dedup_licences.loc[terms_export_dedup_licences['licence'].str.contains(',')]
# In[135]:
# add the grouped licences
terms_export_dedup_fin = pd.merge(terms_export_dedup, terms_export_dedup_licences, on='id_content_hash', how='left')
terms_export_dedup_fin
# In[136]:
# merge with the dates to get the term ids
terms_export_dates = pd.merge(terms_export_dates, terms_export_dedup_fin[['id_content_hash', 'id']], on='id_content_hash')
terms_export_dates = terms_export_dates.rename(columns = {'id' : 'term'})
terms_export_dates
# In[137]:
# rename the licence fields
del terms_export_dedup_fin['licence_x']
terms_export_dedup_fin = terms_export_dedup_fin.rename(columns = {'licence_y' : 'licence'})
# In[138]:
terms_export_fin = terms_export_dedup_fin[['version', 'cost_factor', 'embargo_months', 'ir_archiving', 'licence', 'comment', 'id', 'source']]
terms_export_fin
# In[139]:
# export the table
result = terms_export_fin.to_json(orient='records', force_ascii=False)
parsed = json.loads(result)
with open('sample/term.json', 'w', encoding='utf-8') as file:
    json.dump(parsed, file, indent=2, ensure_ascii=False)
# In[140]:
# export csv
terms_export_fin.to_csv('sample/term.tsv', index=False)
# In[141]:
# export excel
terms_export_fin.to_excel('sample/term.xlsx', index=False)
# ## Table condition_type
# In[142]:
# Journal-only, Organization-only, Journal-organization agreement
col_names = ['id',
'condition_issuer'
]
condition_type = pd.DataFrame(columns = col_names)
condition_type = condition_type.append({'id' : 1, 'condition_issuer' : 'Journal-only'}, ignore_index=True)
condition_type = condition_type.append({'id' : 2, 'condition_issuer' : 'Organization-only'}, ignore_index=True)
condition_type = condition_type.append({'id' : 3, 'condition_issuer' : 'Journal-organization agreement'}, ignore_index=True)
condition_type
# In[143]:
# export the table
result = condition_type.to_json(orient='records', force_ascii=False)
parsed = json.loads(result)
with open('sample/condition_type.json', 'w', encoding='utf-8') as file:
    json.dump(parsed, file, indent=2, ensure_ascii=False)
# In[144]:
# export csv
condition_type.to_csv('sample/condition_type.tsv', index=False)
# In[145]:
# export excel
condition_type.to_excel('sample/condition_type.xlsx', index=False)
# ## Table organization
# In[146]:
# extract the organizations (funders)
sherpa
# In[147]:
sherpa.loc[sherpa['prerequisite_funders'].notna()]
# In[148]:
sherpa['prerequisite_funders'].value_counts()
# In[149]:
funders = sherpa.loc[sherpa['prerequisite_funders'].notna()][['prerequisite_funders_name', 'prerequisite_funders_fundref', 'prerequisite_funders_ror', 'prerequisite_funders_country', 'prerequisite_funders_url', 'prerequisite_funders_sherpa_id']]
funders
# In[150]:
funders_dedup = funders.drop_duplicates(subset='prerequisite_funders_ror')
funders_dedup
# In[151]:
funders_dedup.shape[0]
# In[152]:
# export excel
funders_dedup.to_excel('sample/funders.xlsx', index=False)
# In[153]:
# export csv
funders_dedup.to_csv('sample/funders.tsv', index=False)
# In[154]:
# create the DataFrame
organization_funders = funders_dedup
organization_funders = organization_funders.rename(columns = {'prerequisite_funders_name' : 'name',
'prerequisite_funders_fundref' : 'fundref',
'prerequisite_funders_ror' : 'ror',
'prerequisite_funders_country' : 'iso_code',
'prerequisite_funders_url' : 'website',
'prerequisite_funders_sherpa_id' : 'sherpa_id'
})
organization_funders
# In[155]:
# link with the countries
country = pd.read_csv('sample/country.tsv', encoding='utf-8', header=0, sep='\t')
country
# In[156]:
# merge with the countries
organization_funders['iso_code'] = organization_funders['iso_code'].str.upper()
organization_funders['is_funder'] = 1
organization_funders = pd.merge(organization_funders, country[['iso_code', 'id']], how='left', on='iso_code')
organization_funders
# In[157]:
organization_funders = organization_funders.rename(columns = {'id' : 'country'})
organization_funders
# In[158]:
# add the Swiss organizations
organization = pd.read_csv('ror/ror_ch_hei_export.tsv', encoding='utf-8', header=0, sep='\t', dtype={'fundref': str, 'orgref': str}, na_filter=False)
organization
# In[159]:
# sort by name
organization = organization.sort_values(by='name')
organization
# In[160]:
organization = organization.reset_index(drop=True)
organization
# In[161]:
# put EPFL in position 1 and UNIGE in position 2
target_row = 32
# Move target row to first element of list.
idx = [target_row] + [i for i in range(len(organization)) if i != target_row]
organization = organization.iloc[idx]
organization
# In[162]:
organization = organization.reset_index(drop=True)
organization
# In[163]:
# put EPFL in position 1 and UNIGE in position 2
target_row = 45
# Move target row to first element of list.
idx = [target_row] + [i for i in range(len(organization)) if i != target_row]
organization = organization.iloc[idx]
organization
# In[164]:
organization = organization.reset_index(drop=True)
organization
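# The two reorderings above rely on hard-coded positions (32 and 45). A more robust
# sketch would locate the rows by name instead; the patterns below ('EPFL', 'Genève')
# are placeholders and would need to match the labels actually present in
# ror_ch_hei_export.tsv, so this is left commented out:
# epfl_pos = organization.index[organization['name'].str.contains('EPFL', case=False, na=False)]
# unige_pos = organization.index[organization['name'].str.contains('Genève|Geneva', case=False, na=False)]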
# In[165]:
# append the funders
organization = organization.append(organization_funders, ignore_index=True)
organization
# In[166]:
# replace the prefix in the fundref id, whose URL only returns JSON
# current URL: http://data.crossref.org/fundingdata/funder/10.13039/[fundref id]
# e.g.: http://dx.doi.org/10.13039/501100007903
# redirected to: http://data.crossref.org/fundingdata/funder/10.13039/501100007903
# URL of the funded publications: https://search.crossref.org/funding?q=[fundref id]&from_ui=yes
# e.g.: https://search.crossref.org/funding?q=501100003006&from_ui=yes
organization['fundref'] = organization['fundref'].str.replace('http://dx.doi.org/10.13039/', '')
organization
# In[167]:
# DataFrame for the export
organization_export = organization[['name', 'website', 'country', 'starting_year', 'is_funder', 'ror', 'fundref']]
organization_export
# In[168]:
# fill in the empty values
organization_export['starting_year'] = organization_export['starting_year'].fillna(0)
organization_export['fundref'] = organization_export['fundref'].fillna('')
organization_export['ror'] = organization_export['ror'].fillna('')
organization_export
# In[169]:
# add the id as index + 1
organization_export['id'] = organization_export.index + 1
# del terms_export_dedup['index']
organization_export
# In[170]:
# export the table
result = organization_export.to_json(orient='records', force_ascii=False)
parsed = json.loads(result)
with open('sample/organization.json', 'w', encoding='utf-8') as file:
    json.dump(parsed, file, indent=2, ensure_ascii=False)
# In[171]:
# export excel
organization_export.to_excel('sample/organization.xlsx', index=False)
# In[172]:
# export csv
organization_export.to_csv('sample/organization.tsv', index=False)
# ## Table condition_set_term
# In[173]:
term_orig
# In[174]:
terms_export_dedup
# In[175]:
# merge the term ids
term_orig = pd.merge(term_orig, terms_export_dedup[['id_content_hash', 'id']], on='id_content_hash', how='left')
term_orig
# In[176]:
term_orig = term_orig.rename(columns = {'id' : 'term'})
term_orig
# In[177]:
condition_type
# In[178]:
# set the condition type (3 by default, 1 when there is no ror)
term_orig['condition_type'] = 3
term_orig.loc[term_orig['ror'].isna(), 'condition_type'] = 1
term_orig
# In[179]:
organization_export
# In[180]:
# merge the organizations
term_orig = pd.merge(term_orig, organization_export[['ror', 'id']], on='ror', how='left')
term_orig
# In[181]:
term_orig = term_orig.rename(columns = {'id' : 'organization'})
term_orig
# In[182]:
# concatenate the values that share the same id
condition_set_term_dedup_terms = term_orig[['term', 'id_content_hash']]
condition_set_term_dedup_terms_dedup = condition_set_term_dedup_terms.drop_duplicates()
condition_set_term_dedup_terms_dedup = condition_set_term_dedup_terms_dedup.loc[condition_set_term_dedup_terms_dedup['term'].notna()]
condition_set_term_dedup_terms_dedup['term'] = condition_set_term_dedup_terms_dedup['term'].astype(int)
condition_set_term_dedup_terms_dedup['term'] = condition_set_term_dedup_terms_dedup['term'].astype(str)
condition_set_term_dedup_terms_dedup = condition_set_term_dedup_terms_dedup.groupby('id_content_hash').agg({'term': lambda x: ', '.join(x)})
condition_set_term_dedup_terms_dedup
# In[183]:
# concatenate the values that share the same id
condition_set_term_dedup_journals = term_orig[['journal', 'id_content_hash']]
condition_set_term_dedup_journals_dedup = condition_set_term_dedup_journals.drop_duplicates()
condition_set_term_dedup_journals_dedup = condition_set_term_dedup_journals_dedup.loc[condition_set_term_dedup_journals_dedup['journal'].notna()]
condition_set_term_dedup_journals_dedup['journal'] = condition_set_term_dedup_journals_dedup['journal'].astype(int)
condition_set_term_dedup_journals_dedup['journal'] = condition_set_term_dedup_journals_dedup['journal'].astype(str)
condition_set_term_dedup_journals_dedup = condition_set_term_dedup_journals_dedup.groupby('id_content_hash').agg({'journal': lambda x: ', '.join(x)})
condition_set_term_dedup_journals_dedup
# In[184]:
# concatenate the values that share the same id
condition_set_term_dedup_organizations = term_orig[['organization', 'id_content_hash']]
condition_set_term_dedup_organizations_dedup = condition_set_term_dedup_organizations.drop_duplicates()
condition_set_term_dedup_organizations_dedup = condition_set_term_dedup_organizations_dedup.loc[condition_set_term_dedup_organizations_dedup['organization'].notna()]
condition_set_term_dedup_organizations_dedup['organization'] = condition_set_term_dedup_organizations_dedup['organization'].astype(int)
condition_set_term_dedup_organizations_dedup['organization'] = condition_set_term_dedup_organizations_dedup['organization'].astype(str)
condition_set_term_dedup_organizations_dedup = condition_set_term_dedup_organizations_dedup.groupby('id_content_hash').agg({'organization': lambda x: ', '.join(x)})
condition_set_term_dedup_organizations_dedup
# In[185]:
# concatenate the values that share the same id: not possible for condition_type
condition_set_term_dedup_condition_types = term_orig[['condition_type', 'id_content_hash']]
condition_set_term_dedup_condition_types_dedup = condition_set_term_dedup_condition_types.drop_duplicates()
condition_set_term_dedup_condition_types_dedup = condition_set_term_dedup_condition_types_dedup.loc[condition_set_term_dedup_condition_types_dedup['condition_type'].notna()]
# condition_set_term_dedup_condition_types_dedup['condition_type'] = condition_set_term_dedup_condition_types_dedup['condition_type'].astype(int)
# condition_set_term_dedup_condition_types_dedup['condition_type'] = condition_set_term_dedup_condition_types_dedup['condition_type'].astype(str)
# condition_set_term_dedup_condition_types_dedup = condition_set_term_dedup_condition_types_dedup.groupby('id_content_hash').agg({'condition_type': lambda x: ', '.join(x)})
condition_set_term_dedup_condition_types_dedup
# In[186]:
# retrieve the grouped ids
terms_export_dedup = pd.merge(terms_export_dedup, condition_set_term_dedup_terms_dedup, on='id_content_hash', how='left')
terms_export_dedup = pd.merge(terms_export_dedup, condition_set_term_dedup_journals_dedup, on='id_content_hash', how='left')
terms_export_dedup = pd.merge(terms_export_dedup, condition_set_term_dedup_organizations_dedup, on='id_content_hash', how='left')
terms_export_dedup = pd.merge(terms_export_dedup, condition_set_term_dedup_condition_types_dedup, on='id_content_hash', how='left')
terms_export_dedup
# In[187]:
condition_sets_orig = terms_export_dedup[['term', 'condition_type', 'organization', 'journal']]
condition_sets_orig
# In[188]:
# add a unique hash for each variant
condition_sets_orig['id_term_hash'] = condition_sets_orig.apply(lambda x: hash(tuple(x[['condition_type', 'organization', 'journal']])), axis = 1)
condition_sets_orig
# In[189]:
# group the terms that have the same values for the remaining fields
condition_sets_orig_terms = condition_sets_orig[['term', 'id_term_hash']]
condition_sets_orig_terms_dedup = condition_sets_orig_terms.drop_duplicates()
condition_sets_orig_terms_dedup = condition_sets_orig_terms_dedup.loc[condition_sets_orig_terms_dedup['term'].notna()]
condition_sets_orig_terms_dedup['term'] = condition_sets_orig_terms_dedup['term'].astype(int)
condition_sets_orig_terms_dedup['term'] = condition_sets_orig_terms_dedup['term'].astype(str)
condition_sets_orig_terms_dedup = condition_sets_orig_terms_dedup.groupby('id_term_hash').agg({'term': lambda x: ', '.join(x)})
condition_sets_orig_terms_dedup
# In[190]:
# add the grouped ids
condition_sets_orig_terms = pd.merge(condition_sets_orig, condition_sets_orig_terms_dedup, on='id_term_hash', how='left')
condition_sets_orig_terms
# In[191]:
# rename terms
del condition_sets_orig_terms['term_x']
condition_sets_orig_terms = condition_sets_orig_terms.rename(columns = {'term_y' : 'term'})
condition_sets_orig_terms
# In[192]:
# test duplicates
condition_sets_orig_terms.loc[condition_sets_orig_terms.duplicated()].sort_values(by='term')
# In[193]:
condition_sets_orig_terms.loc[condition_sets_orig_terms.duplicated()].shape[0]
# In[194]:
condition_sets_orig_terms_dedup = condition_sets_orig_terms.drop_duplicates()
condition_sets_orig_terms_dedup
# In[195]:
# add the missing fields
condition_sets_orig_terms_dedup['comment'] = ''
# In[196]:
# check the "nan" values
condition_sets_orig_terms_dedup.loc[condition_sets_orig_terms_dedup['journal'].isna()]
# In[197]:
# check the "nan" values
condition_sets_orig_terms_dedup.loc[condition_sets_orig_terms_dedup['term'].isna()]
# In[198]:
# check the "nan" values
condition_sets_orig_terms_dedup.loc[condition_sets_orig_terms_dedup['condition_type'].isna()]
# In[199]:
# check the "nan" values
condition_sets_orig_terms_dedup.loc[condition_sets_orig_terms_dedup['organization'].isna()]
# In[200]:
# replace the "nan" values
condition_sets_orig_terms_dedup['organization'] = condition_sets_orig_terms_dedup['organization'].fillna('')
condition_sets_orig_terms_dedup
# In[201]:
# convert the index into an id
condition_sets_orig_terms_dedup = condition_sets_orig_terms_dedup.reset_index()
# add the id as index + 1
condition_sets_orig_terms_dedup['id'] = condition_sets_orig_terms_dedup['index'] + 1
del condition_sets_orig_terms_dedup['index']
condition_sets_orig_terms_dedup
# In[203]:
# export the table
result = condition_sets_orig_terms_dedup[['id', 'condition_type', 'organization', 'journal', 'term', 'comment']].to_json(orient='records', force_ascii=False)
parsed = json.loads(result)
with open('sample/condition_set.json', 'w', encoding='utf-8') as file:
    json.dump(parsed, file, indent=2, ensure_ascii=False)
# In[204]:
# export excel
condition_sets_orig_terms_dedup[['id', 'condition_type', 'organization', 'journal', 'term', 'comment']].to_excel('sample/condition_set.xlsx', index=False)
# In[205]:
# export csv
condition_sets_orig_terms_dedup[['id', 'condition_type', 'organization', 'journal', 'term', 'comment']].to_csv('sample/condition_set.tsv', index=False)
# ## Table organization_condition_set
# In[206]:
condition_sets_orig_terms_dedup
# In[207]:
condition_sets_orig_terms_dedup.loc[(condition_sets_orig_terms_dedup['organization'].notna()) & (condition_sets_orig_terms_dedup['organization'] != '')]
# In[208]:
# create the DataFrame
# col_names = ['id',
# 'organization',
# 'condition_set',
# 'valid_from',
# 'valid_until'
# ]
# organization_condition = pd.DataFrame(columns = col_names)
organization_condition = condition_sets_orig_terms_dedup.loc[(condition_sets_orig_terms_dedup['organization'].notna()) & (condition_sets_orig_terms_dedup['organization'] != '')][['id', 'organization', 'term']]
organization_condition
# In[209]:
# extract the term ids
organization_condition_split = organization_condition.assign(term = organization_condition.term.str.split(',')).explode('term')
organization_condition_split
# In[210]:
organization_condition_split.loc[organization_condition_split['organization'].isna()]
# In[211]:
organization_condition_split.loc[organization_condition_split['term'].isna()]
# In[212]:
organization_condition_split['term'] = organization_condition_split['term'].astype(int)
organization_condition_split
# In[213]:
# add the ROR
terms_export_dates
# In[214]:
# merge to get the dates
organization_condition_split = pd.merge(organization_condition_split, terms_export_dates[['term', 'valid_from', 'valid_until']], on='term', how='left')
organization_condition_split
# In[215]:
# deduplicate
organization_condition_split_dedup = organization_condition_split.drop_duplicates()
organization_condition_split_dedup
# In[216]:
organization_condition = pd.merge(organization_condition, organization_condition_split_dedup[['id', 'valid_from', 'valid_until']], on='id', how='left')
organization_condition
# In[217]:
organization_condition = organization_condition.rename(columns = {'id' : 'condition_set'})
organization_condition['valid_from'] = organization_condition['valid_from'].fillna('')
organization_condition['valid_until'] = organization_condition['valid_until'].fillna('')
organization_condition
# In[218]:
# final split to get one row per organization
organization_condition_fin = organization_condition.assign(organization = organization_condition.organization.str.split(',')).explode('organization')
organization_condition_fin
# In[219]:
# add the id as index + 1
organization_condition_fin = organization_condition_fin.reset_index()
organization_condition_fin['id'] = organization_condition_fin.index + 1
del organization_condition_fin['index']
organization_condition_fin
# In[220]:
# export the table
result = organization_condition_fin[['id', 'condition_set', 'organization', 'valid_from', 'valid_until']].to_json(orient='records', force_ascii=False)
parsed = json.loads(result)
with open('sample/organization_condition.json', 'w', encoding='utf-8') as file:
    json.dump(parsed, file, indent=2, ensure_ascii=False)
# In[221]:
# export excel
organization_condition_fin[['id', 'condition_set', 'organization', 'valid_from', 'valid_until']].to_excel('sample/organization_condition.xlsx', index=False)
# In[222]:
# export csv
organization_condition_fin[['id', 'condition_set', 'organization', 'valid_from', 'valid_until']].to_csv('sample/organization_condition.tsv', index=False)
# ## Table journal_condition_set
# In[223]:
# create the DataFrame
# col_names = ['id',
# 'journal',
# 'condition_set',
# 'valid_from',
# 'valid_until'
# ]
# journal_condition = pd.DataFrame(columns = col_names)
journal_condition = condition_sets_orig_terms_dedup.loc[(condition_sets_orig_terms_dedup['journal'].notna()) & (condition_sets_orig_terms_dedup['journal'] != '')][['id', 'journal']]
journal_condition
# In[224]:
journal_condition = journal_condition.rename(columns = {'id' : 'condition_set'})
journal_condition['valid_from'] = ''
journal_condition['valid_until'] = ''
journal_condition
# In[225]:
# final split to get one row per journal
journal_condition_fin = journal_condition.assign(journal = journal_condition.journal.str.split(',')).explode('journal')
journal_condition_fin
# In[226]:
# add the id as index + 1
journal_condition_fin = journal_condition_fin.reset_index()
journal_condition_fin['id'] = journal_condition_fin.index + 1
del journal_condition_fin['index']
journal_condition_fin
# In[227]:
# export the table
result = journal_condition_fin.to_json(orient='records', force_ascii=False)
parsed = json.loads(result)
with open('sample/journal_condition.json', 'w', encoding='utf-8') as file:
    json.dump(parsed, file, indent=2, ensure_ascii=False)
# In[228]:
# export excel
journal_condition_fin.to_excel('sample/journal_condition.xlsx', index=False)
# In[229]:
# export csv
journal_condition_fin.to_csv('sample/journal_condition.tsv', index=False)
# In[ ]:
