Page MenuHomec4science

02_oacct_languages.py
No OneTemporary

File Metadata

Created
Fri, Nov 1, 23:52

02_oacct_languages.py

#!/usr/bin/env python
# coding: utf-8
# # Projet Open Access Compliance Check Tool (OACCT)
#
# Projet P5 de la bibliothèque de l'EPFL en collaboration avec les bibliothèques des Universités de Genève, Lausanne et Berne : https://www.swissuniversities.ch/themen/digitalisierung/p-5-wissenschaftliche-information/projekte/swiss-mooc-service-1-1-1-1
#
# Ce notebook permet d'extraire les données choisies parmi les sources obtenues par API et de les traiter pour les rendre exploitables dans l'application OACCT.
#
# Auteur : **Pablo Iriarte**, Université de Genève (pablo.iriarte@unige.ch)
# Date de dernière mise à jour : 16.07.2021
# In[1]:
import pandas as pd
import csv
import json
import numpy as np
# ## Table Language
# In[2]:
# Source: https://www.loc.gov/standards/iso639-2/php/code_list.php
# The input is a pipe-separated text file without a header row. Only the first
# column (kept as the language code) and the English name survive below; the
# remaining columns are read under working labels and then dropped.
# NOTE(review): the working labels may not match the real column layout of the
# LOC file (the middle columns look like alternate codes) -- harmless because
# those columns are discarded, but confirm before reusing them.
# na_filter=False keeps empty fields as '' instead of converting them to NaN.
language = pd.read_csv(
    'ISO-639-2_utf-8.txt',
    encoding='utf-8',
    header=None,
    sep='|',
    na_filter=False,
    names=[
        'ISO 639-2 Code',
        'ISO 639-1 Code',
        'ignore',
        'English name of Language',
        'French name of Language',
    ],
    index_col=False,
)
language
# In[3]:
# Sanity check: show rows whose code is missing (expected to be empty).
language.loc[language['ISO 639-2 Code'].isnull()]
# In[4]:
# Turn the 0-based row position into a 1-based numeric id (added as the last
# column so the exported column order stays code, name, id).
language['id'] = language.index + 1
# In[5]:
# Drop the columns that are not exported.
language = language.drop(
    columns=['ignore', 'French name of Language', 'ISO 639-1 Code']
)
language
# In[6]:
# Rename the remaining columns to the field names used in the exports.
language = language.rename(
    columns={'ISO 639-2 Code': 'iso_code', 'English name of Language': 'name'}
)
# In[7]:
language
# In[8]:
# The range code 'qaa-qtz' is too long for the code field; shorten it to 'qaa'.
language.loc[language['iso_code'] == 'qaa-qtz', 'iso_code'] = 'qaa'
# In[9]:
# Add a placeholder row for unknown languages.
# DataFrame.append was removed in pandas 2.0, so build a one-row frame and
# concatenate it instead; the result is the same on older pandas versions.
unknown_row = pd.DataFrame(
    [{'id': 999999, 'iso_code': '___', 'name': 'UNKNOWN'}]
)
language = pd.concat([language, unknown_row], ignore_index=True)
language
# In[10]:
# JSON export: serialise through pandas first so that values are converted to
# plain JSON types, then re-dump with indentation for a readable file.
records_json = language.to_json(orient='records', force_ascii=False)
records = json.loads(records_json)
with open('sample/language.json', 'w', encoding='utf-8') as json_file:
    # The dump must run inside the `with` block so the file is still open.
    json.dump(records, json_file, indent=2, ensure_ascii=False)
# In[11]:
# TSV export (tab-separated) next to the script.
language.to_csv('language.tsv', sep='\t', encoding='utf-8', index=False)
# In[12]:
# TSV export (tab-separated) into the sample folder.
language.to_csv('sample/language.tsv', sep='\t', encoding='utf-8', index=False)
# In[13]:
# Excel export into the sample folder.
language.to_excel('sample/language.xlsx', index=False)

Event Timeline