{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# XML Opening\n",
    "\n",
    "* Open the the xml files created with ```oai-harvest https://zenodo.org/oai2d``` using [bloomonkey/oai-harvest](https://github.com/bloomonkey/oai-harvest)\n",
    "* A ```zenododata.pkl``` (gziped) is created it's a dataframe that contains most of the XML fields \n",
    "* It's also possible to harvest the zenodo webpage (not recommanded) in order to have info about the files "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "import pandas as pd\n",
    "import lxml.etree as ET\n",
    "from xml.dom import minidom\n",
    "import pathlib\n",
    "import glob\n",
    "import datetime\n",
    "import sys"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Settings Var\n",
    "* reload : fo reusing a previous opening\n",
    "* datereload : folder of the zenododataa"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "reload=False #TODO`\n",
    "\n",
    "datereload='20190819'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Functions\n",
    "### Read XML"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_oaixml(xmlfile):\n",
    "    doc = minidom.parse(xmlfile)\n",
    "    authors=[]\n",
    "    for element in doc.getElementsByTagName(\"dc:creator\"):\n",
    "        authors.append(element.firstChild.data)\n",
    "  \n",
    "    keywords=[]\n",
    "    for element in doc.getElementsByTagName(\"dc:subject\"):\n",
    "        keywords.append(element.firstChild.data)\n",
    "    \n",
    "    try:\n",
    "        category=doc.getElementsByTagName(\"dc:type\")[1].firstChild.data\n",
    "    except :\n",
    "        category=\"None\"\n",
    "    \n",
    "    try:\n",
    "        lic=doc.getElementsByTagName(\"dc:rights\")[1].firstChild.data\n",
    "    except:\n",
    "        lic=\"None\"\n",
    "    \n",
    "    try:\n",
    "        abstract=doc.getElementsByTagName(\"dc:description\")[0].firstChild.data\n",
    "    except:\n",
    "        abstract=\"None\"\n",
    "\n",
    "    output={\n",
    "        'category':category,\n",
    "        'date':doc.getElementsByTagName(\"dc:date\")[0].firstChild.data,\n",
    "        'title':doc.getElementsByTagName(\"dc:title\")[0].firstChild.data,\n",
    "        'authors':authors,\n",
    "        'abstract':abstract,\n",
    "        'url':doc.getElementsByTagName(\"dc:identifier\")[0].firstChild.data,\n",
    "        'keywords':keywords,\n",
    "        'license':lic,\n",
    "    }\n",
    "    return output"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Main\n",
    "### Reading XML Files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def time_elapsed(start):\n",
    "    end = datetime.datetime.now()\n",
    "\n",
    "    time_to_run = end - start\n",
    "    minutes = int(time_to_run.seconds/60)\n",
    "    seconds = time_to_run.seconds % 60\n",
    "    return \"Total runtime: \" + str(minutes) + \" minutes, \" + str(seconds) + \" seconds\"\n",
    "\n",
    "datadir='./xml-oai/'\n",
    "fxml = glob.glob(datadir + \"*.xml\")\n",
    "zenododata=pd.DataFrame(columns=['category','date','title','authors', 'abstract','keywords','license','url','files'])\n",
    "date=datetime.datetime.now().strftime('%Y%m%d')\n",
    "pathlib.Path(\"processed_data/\" + date).mkdir(parents=True, exist_ok=True) "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Do the work"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1395378\n",
      "50000 Total runtime: 5 minutes, 34 seconds\n",
      "100000 Total runtime: 10 minutes, 52 seconds\n",
      "150000 Total runtime: 16 minutes, 35 seconds\n",
      "200000 Total runtime: 22 minutes, 51 seconds\n",
      "250000 Total runtime: 29 minutes, 17 seconds\n",
      "300000 Total runtime: 35 minutes, 11 seconds\n",
      "350000 Total runtime: 40 minutes, 58 seconds\n",
      "400000 Total runtime: 46 minutes, 47 seconds\n",
      "450000 Total runtime: 52 minutes, 42 seconds\n",
      "500000 Total runtime: 58 minutes, 35 seconds\n",
      "550000 Total runtime: 64 minutes, 39 seconds\n",
      "600000 Total runtime: 70 minutes, 49 seconds\n",
      "650000 Total runtime: 76 minutes, 56 seconds\n",
      "700000 Total runtime: 83 minutes, 1 seconds\n",
      "750000 Total runtime: 89 minutes, 11 seconds\n",
      "800000 Total runtime: 95 minutes, 19 seconds\n",
      "850000 Total runtime: 101 minutes, 28 seconds\n",
      "900000 Total runtime: 107 minutes, 20 seconds\n",
      "950000 Total runtime: 113 minutes, 13 seconds\n",
      "1000000 Total runtime: 119 minutes, 9 seconds\n",
      "1050000 Total runtime: 125 minutes, 5 seconds\n",
      "1100000 Total runtime: 131 minutes, 5 seconds\n",
      "1150000 Total runtime: 137 minutes, 1 seconds\n",
      "1200000 Total runtime: 142 minutes, 56 seconds\n",
      "1250000 Total runtime: 148 minutes, 46 seconds\n",
      "1300000 Total runtime: 154 minutes, 41 seconds\n",
      "1350000 Total runtime: 160 minutes, 35 seconds\n"
     ]
    }
   ],
   "source": [
    "start=datetime.datetime.now()\n",
    "if reload:\n",
    "    print(\"reolading from \" + datereload)\n",
    "    zenododata=pd.read_pickle(\"processed_data/\" + datereload + \"/zenododata.pkl\",compression='gzip')\n",
    "    print(\"from row:\" + zenododata.size)\n",
    "\n",
    "print(len(fxml))\n",
    "dicta=[]\n",
    "i=0\n",
    "for xmlfile in fxml:\n",
    "    row = read_oaixml(xmlfile)\n",
    "    #if reload and zenododata.url.isin(row['url']):\n",
    "    #    continue\n",
    "   # if harvest_web:\n",
    "    #    row.update({'files':getarchives(row['url'])})\n",
    "    #zenododata=zenododata.append(row,ignore_index=True)\n",
    "    dicta.append(row)\n",
    "    #i +=1\n",
    "    if len(dicta) % 50000 is 0:\n",
    "        print(len(dicta), time_elapsed(start))\n",
    "       # zenododata.to_pickle(\"processed_data/\" + date + \"/zenododata.pkl\",compression='gzip')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1395378"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(dicta)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save the work"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "zenododata=zenododata.append(dicta,ignore_index=True) \n",
    "zenododata\n",
    "zenododata.to_pickle(\"processed_data/\" + date + \"/zenododata.pkl\",compression='gzip')\n",
    "\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}