In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import lxml.etree as ET
from xml.dom import minidom
import pathlib
import glob
import datetime
import ipywidgets as widgets



In [2]:
def read_oaixml(xmlfile):
    """Extract Dublin Core (``dc:*``) metadata from one XML file.

    Parameters
    ----------
    xmlfile : str or file object
        Anything accepted by ``xml.dom.minidom.parse``.

    Returns
    -------
    dict
        Keys ``category``, ``date``, ``title``, ``authors``, ``abstract``,
        ``url``, ``keywords`` and ``license``.

        * ``authors`` / ``keywords`` list the text of every ``dc:creator`` /
          ``dc:subject`` element; elements without content are skipped.
        * ``category`` is the text of the *second* ``dc:type`` element, or
          ``[]`` when it is absent or empty.
        * ``license`` is the text of the *second* ``dc:rights`` element, or
          the string ``"None"`` when it is absent or empty.

    Raises
    ------
    IndexError
        If ``dc:date``, ``dc:title``, ``dc:description`` or ``dc:identifier``
        is missing from the document.
    AttributeError
        If one of those four elements is present but has no content.
    """
    doc = minidom.parse(xmlfile)

    def all_texts(tag):
        # An empty element (e.g. <dc:creator/>) has firstChild None; skip it
        # instead of crashing on ``None.data``.
        return [
            node.firstChild.data
            for node in doc.getElementsByTagName(tag)
            if node.firstChild is not None
        ]

    def nth_text(tag, index, default):
        # Only "element missing" (IndexError) and "element empty"
        # (AttributeError) fall back to the default; a bare ``except`` would
        # also swallow KeyboardInterrupt and hide real errors.
        try:
            return doc.getElementsByTagName(tag)[index].firstChild.data
        except (IndexError, AttributeError):
            return default

    output = {
        'category': nth_text("dc:type", 1, []),
        'date': doc.getElementsByTagName("dc:date")[0].firstChild.data,
        'title': doc.getElementsByTagName("dc:title")[0].firstChild.data,
        'authors': all_texts("dc:creator"),
        'abstract': doc.getElementsByTagName("dc:description")[0].firstChild.data,
        'url': doc.getElementsByTagName("dc:identifier")[0].firstChild.data,
        'keywords': all_texts("dc:subject"),
        'license': nth_text("dc:rights", 1, "None"),
    }
    return output

In [3]:
def makeprevurl(recordurl,filename):
    """Return the preview URL for `filename` under the record at `recordurl`.

    The result is `recordurl`, the literal "/preview/" and `filename`
    concatenated in that order; no escaping or normalisation is applied.
    """
    return "/preview/".join((recordurl, filename))

def getfiles(prevurl, timeout=60):
    """List the entries shown on an archive preview page.

    Parameters
    ----------
    prevurl : str
        URL of the preview page (see ``makeprevurl``).
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises; without a
        timeout a stalled connection would block forever.

    Returns
    -------
    pandas.DataFrame
        One row per element matching the ``.fa-file-o`` CSS selector, with
        columns ``filename``, ``extension`` and ``size``. Empty (but with the
        same columns) when nothing matches.
    """
    r = requests.get(prevurl, timeout=timeout)
    # Name the parser explicitly so results do not depend on which parsers
    # happen to be installed (and to silence bs4's "no parser" warning).
    prev = BeautifulSoup(r.text, "lxml")

    records = []
    for t in prev.select(".fa-file-o"):
        k = t.find_previous("li").span
        # str() detaches the text from the parse tree: a raw NavigableString
        # keeps the whole document alive and does not pickle cleanly.
        name = str(k.contents[1])
        records.append({
            'filename': name,
            'extension': pathlib.Path(name).suffix,
            'size': str(k.next_sibling.next_sibling.contents[0]),
        })

    # Build the frame once; DataFrame.append was removed in pandas 2.0 and
    # re-copied the whole frame on every call.
    return pd.DataFrame(records, columns=['filename', 'extension', 'size'])

def getarchives(recordurl, timeout=60):
    """List the files attached to a record page, expanding ``.zip`` archives.

    Parameters
    ----------
    recordurl : str
        URL of the record landing page.
    timeout : float, optional
        Seconds to wait for the record page before ``requests`` raises.

    Returns
    -------
    pandas.DataFrame
        One row per element matching the ``.filename`` CSS selector, with
        columns ``filename``, ``extension``, ``size``, ``contents`` (a
        DataFrame from ``getfiles`` for ``.zip`` files, otherwise an empty
        DataFrame with the same columns) and ``nfile_archive`` (row count of
        ``contents``).
    """
    r = requests.get(recordurl, timeout=timeout)
    # Name the parser explicitly so results do not depend on which parsers
    # happen to be installed (and to silence bs4's "no parser" warning).
    page = BeautifulSoup(r.text, "lxml")

    records = []
    for t in page.select(".filename"):
        k = t.find_previous("td")
        # str() detaches the text from the parse tree: a raw NavigableString
        # keeps the whole document alive and does not pickle cleanly.
        filename = str(k.a.contents[0])
        ext = pathlib.Path(filename).suffix
        if ext == ".zip":
            # Only zip archives get their preview page scraped.
            archive = getfiles(makeprevurl(recordurl, filename))
        else:
            archive = pd.DataFrame(columns=['filename', 'extension', 'size'])

        records.append({
            'filename': filename,
            'extension': ext,
            'size': str(k.next_sibling.next_sibling.contents[0]),
            'contents': archive,
            'nfile_archive': archive.shape[0],
        })

    # Build the frame once; DataFrame.append was removed in pandas 2.0 and
    # re-copied the whole frame on every call.
    return pd.DataFrame(
        records,
        columns=['filename', 'extension', 'size', 'contents', 'nfile_archive'],
    )


In [4]:
# Directory that is scanned for the XML files to process.
datadir='./xml-oai/'
# glob returns matches in arbitrary, filesystem-dependent order; sort so that
# repeated (or interrupted and resumed) runs visit the files in the same order.
fxml = sorted(glob.glob(datadir + "*.xml"))


In [5]:
def time_elapsed(start):
    """Format the wall-clock time elapsed since `start`.

    Parameters
    ----------
    start : datetime.datetime
        Naive timestamp taken with ``datetime.datetime.now()``.

    Returns
    -------
    str
        ``"Total runtime: <M> minutes, <S> seconds"`` where ``M`` is the total
        number of whole minutes elapsed (not capped at 59) and ``S`` the
        remaining whole seconds.
    """
    time_to_run = datetime.datetime.now() - start
    # total_seconds() includes whole days; timedelta.seconds alone wraps
    # around every 24 hours and would under-report long runs.
    minutes, seconds = divmod(int(time_to_run.total_seconds()), 60)
    return "Total runtime: " + str(minutes) + " minutes, " + str(seconds) + " seconds"


# Reference timestamp for the runtime reports printed by the processing loop.
start = datetime.datetime.now()

# Progress bar spanning the number of XML files found.
f = widgets.IntProgress(
    description='Loading:',
    orientation='horizontal',
    min=0,
    max=len(fxml),
    value=0,
)

# Read-only numeric counter with the same bounds as the progress bar.
n = widgets.BoundedIntText(
    description='n:',
    disabled=True,
    min=0,
    max=len(fxml),
    step=1,
    value=0,
)

# Render both widgets, bar first, counter second.
display(f, n)

IntProgress(value=0, description='Loading:', max=1391410)

BoundedIntText(value=0, description='n:', disabled=True, max=1391410)

In [6]:
# Collect one metadata dict per XML file, augmented with the file listing
# scraped from the record's landing page.
records = []
try:
    for xmlfile in fxml:
        row = read_oaixml(xmlfile)
        row.update({'files': getarchives(row['url'])})
        records.append(row)
        f.value += 1
        n.value += 1
        # `==`, not `is`: identity comparison against an int literal only
        # works by accident of small-int caching and is a SyntaxWarning.
        if len(records) % 1000 == 0:
            print(len(records), time_elapsed(start))
finally:
    # Build the frame exactly once (DataFrame.append was removed in pandas 2.0
    # and re-copied the whole frame per row). Doing it in `finally` keeps the
    # rows gathered so far available if the loop is interrupted or fails.
    zenododata = pd.DataFrame(
        records,
        columns=['category', 'date', 'title', 'authors', 'abstract',
                 'keywords', 'license', 'url', 'files'],
    )

1000 Total runtime: 16 minutes, 22 seconds
2000 Total runtime: 32 minutes, 39 seconds


KeyboardInterrupt: 

In [None]:
# Preview the first rows of the collected records (the bare name `zenodo`
# is not defined anywhere in this notebook and raised NameError).
zenododata.head()

In [None]:
# (rows, columns) of the frame built by the processing loop.
zenododata.shape

In [None]:
# Today's date as YYYYMMDD; used as the name of the output sub-directory.
date = datetime.datetime.now().strftime('%Y%m%d')

# Make sure processed_data/<date>/ exists (no error if it already does).
pathlib.Path("".join(("processed_data/", date))).mkdir(exist_ok=True, parents=True)

# Write the frame as a gzip-compressed pickle. The file name ends in ".pkl",
# not ".gz", so the compression is passed explicitly here.
zenododata.to_pickle(
    "".join(("processed_data/", date, "/zenododata.pkl")),
    compression="gzip",
)