# XML Opening

* Open the XML files created with ```oai-harvest https://zenodo.org/oai2d``` using [bloomonkey/oai-harvest](https://github.com/bloomonkey/oai-harvest)
* A ```zenododata.pkl``` file (gzipped) is created; it is a dataframe that contains most of the XML fields
* It's also possible to harvest the Zenodo webpage (not recommended) in order to get information about the files

In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import lxml.etree as ET
from xml.dom import minidom
import pathlib
import glob
import datetime
import sys

## Settings Var
* reload : for reusing a previous opening
* datereload : folder of the zenododata

In [3]:
# TODO (carried over from the original cell): reload support is still marked as unfinished.
# When True, the main cell loads a previously saved zenododata pickle before parsing.
reload = False

# Sub-folder of processed_data/ (named by date, YYYYMMDD) that holds the pickle to reload.
datereload = '20190819'

## Functions
### Read XML

In [4]:
def _node_text(element):
    """Return the data of *element*'s first child, or None when there is none."""
    # An empty element (<dc:creator/>) has no first child, and a nested element
    # has no ``data`` attribute; both cases yield None instead of raising.
    return getattr(element.firstChild, "data", None)


def _all_texts(doc, tag):
    """Return the text of every non-empty *tag* element of *doc*, in document order."""
    texts = (_node_text(element) for element in doc.getElementsByTagName(tag))
    return [text for text in texts if text is not None]


def _optional_text(doc, tag, index):
    """Return the text of the *index*-th *tag* element, or the string "None".

    The literal string "None" (not the ``None`` object) is the placeholder the
    resulting records use for a missing or empty optional field.
    """
    elements = doc.getElementsByTagName(tag)
    if index >= len(elements):
        return "None"
    text = _node_text(elements[index])
    return "None" if text is None else text


def read_oaixml(xmlfile):
    """Parse one harvested OAI Dublin Core record into a flat dictionary.

    Args:
        xmlfile: Path (or open file object) of the XML record, as accepted by
            ``xml.dom.minidom.parse``.

    Returns:
        dict with the keys ``category``, ``date``, ``title``, ``authors``,
        ``abstract``, ``url``, ``keywords`` and ``license``. ``authors`` and
        ``keywords`` are lists of strings (empty elements are skipped);
        ``category``, ``license`` and ``abstract`` fall back to the string
        "None" when the element is missing or empty.

    Raises:
        IndexError: if the record has no ``dc:date``, ``dc:title`` or
            ``dc:identifier`` element.
        AttributeError: if one of those three elements is empty.
    """
    # Using the document as a context manager unlinks the DOM on exit, so the
    # tree can be reclaimed promptly when many files are parsed in a row.
    with minidom.parse(xmlfile) as doc:
        return {
            # The second dc:type / dc:rights entry is the one that is read;
            # the first entry is ignored.
            'category': _optional_text(doc, "dc:type", 1),
            'date': doc.getElementsByTagName("dc:date")[0].firstChild.data,
            'title': doc.getElementsByTagName("dc:title")[0].firstChild.data,
            'authors': _all_texts(doc, "dc:creator"),
            'abstract': _optional_text(doc, "dc:description", 0),
            'url': doc.getElementsByTagName("dc:identifier")[0].firstChild.data,
            'keywords': _all_texts(doc, "dc:subject"),
            'license': _optional_text(doc, "dc:rights", 1),
        }

## Main
### Reading XML Files

In [6]:
def time_elapsed(start):
    """Describe how much wall-clock time has passed since *start*.

    Args:
        start: A naive ``datetime.datetime`` captured earlier with
            ``datetime.datetime.now()``.

    Returns:
        A string of the form "Total runtime: M minutes, S seconds", where M is
        the total number of whole minutes (it is not capped at 59).
    """
    time_to_run = datetime.datetime.now() - start
    # timedelta.seconds excludes whole days, so a run longer than 24 hours would
    # be under-reported; total_seconds() covers the full duration.
    minutes, seconds = divmod(int(time_to_run.total_seconds()), 60)
    return "Total runtime: " + str(minutes) + " minutes, " + str(seconds) + " seconds"

# Folder holding the harvested OAI records, and the list of XML files found in it.
datadir = './xml-oai/'
fxml = glob.glob(datadir + "*.xml")

# Empty frame that declares the columns of the final dataset.
zenododata = pd.DataFrame(
    columns=[
        'category',
        'date',
        'title',
        'authors',
        'abstract',
        'keywords',
        'license',
        'url',
        'files',
    ]
)

# Results are written under processed_data/<today as YYYYMMDD>/; create it up front.
date = datetime.datetime.now().strftime('%Y%m%d')
pathlib.Path("processed_data/" + date).mkdir(parents=True, exist_ok=True)

### Do the work

In [7]:
# Parse every harvested XML file into a list of row dictionaries, printing a
# progress line (count and elapsed time) every 50000 records.
start = datetime.datetime.now()
if reload:
    print("reolading from " + datereload)
    zenododata = pd.read_pickle(
        "processed_data/" + datereload + "/zenododata.pkl", compression='gzip'
    )
    # str() is required: concatenating the int directly raises TypeError.
    # NOTE(review): .size counts cells (rows x columns), not rows; given the
    # "from row:" label, len(zenododata) may be what was intended -- confirm.
    print("from row:" + str(zenododata.size))

print(len(fxml))
dicta = []
# TODO (ideas left as commented-out code in the original cell, not implemented):
#   - when reloading, skip records whose url is already present in zenododata;
#   - optionally fill the 'files' field via getarchives(row['url']) when
#     harvest_web is set (neither name is defined in the visible notebook);
#   - checkpoint zenododata to the pickle alongside each progress line.
for xmlfile in fxml:
    row = read_oaixml(xmlfile)
    dicta.append(row)
    # Compare by value: "is 0" tests object identity and only works by accident.
    if len(dicta) % 50000 == 0:
        print(len(dicta), time_elapsed(start))

1395378
50000 Total runtime: 5 minutes, 34 seconds
100000 Total runtime: 10 minutes, 52 seconds
150000 Total runtime: 16 minutes, 35 seconds
200000 Total runtime: 22 minutes, 51 seconds
250000 Total runtime: 29 minutes, 17 seconds
300000 Total runtime: 35 minutes, 11 seconds
350000 Total runtime: 40 minutes, 58 seconds
400000 Total runtime: 46 minutes, 47 seconds
450000 Total runtime: 52 minutes, 42 seconds
500000 Total runtime: 58 minutes, 35 seconds
550000 Total runtime: 64 minutes, 39 seconds
600000 Total runtime: 70 minutes, 49 seconds
650000 Total runtime: 76 minutes, 56 seconds
700000 Total runtime: 83 minutes, 1 seconds
750000 Total runtime: 89 minutes, 11 seconds
800000 Total runtime: 95 minutes, 19 seconds
850000 Total runtime: 101 minutes, 28 seconds
900000 Total runtime: 107 minutes, 20 seconds
950000 Total runtime: 113 minutes, 13 seconds
1000000 Total runtime: 119 minutes, 9 seconds
1050000 Total runtime: 125 minutes, 5 seconds
1100000 Total runtime: 131 minutes, 5 seconds

In [16]:
# Number of records parsed; the loop appends one row per XML file, so this
# should equal the file count printed before the loop started.
len(dicta)


1395378

## Save the work

In [17]:
# Turn the parsed rows into a dataframe and write it as a gzip-compressed pickle.
# DataFrame.append was removed in pandas 2.0, so the rows are converted with
# pd.DataFrame and, when a previous frame was reloaded, joined with pd.concat.
if zenododata.empty:
    # Keep the declared column order; columns absent from the rows are left empty.
    zenododata = pd.DataFrame(dicta, columns=zenododata.columns)
else:
    zenododata = pd.concat([zenododata, pd.DataFrame(dicta)], ignore_index=True)
zenododata.to_pickle("processed_data/" + date + "/zenododata.pkl", compression='gzip')

    