# %% Settings
# reload     : reuse the frame saved by a previous run instead of starting empty
# datereload : date-stamped folder (YYYYMMDD) under processed_data/ to reload from
reload = False  # TODO: reload support is still experimental
datereload = '20190819'


# %% Functions: read XML
def _element_text(element):
    """Return the text held directly by ``element``, or None if it has none.

    All direct text and CDATA children are concatenated, so a value that the
    parser split into several nodes (e.g. text around a CDATA section) is
    returned whole instead of being cut at the first node.
    """
    text_kinds = (minidom.Node.TEXT_NODE, minidom.Node.CDATA_SECTION_NODE)
    chunks = [child.data for child in element.childNodes if child.nodeType in text_kinds]
    return "".join(chunks) if chunks else None


def _all_texts(doc, tag):
    """Return the text of every ``tag`` element in ``doc``, skipping empty ones."""
    texts = (_element_text(element) for element in doc.getElementsByTagName(tag))
    return [text for text in texts if text is not None]


def _nth_text(doc, tag, index, default="None"):
    """Return the text of the ``index``-th ``tag`` element, or ``default``.

    ``default`` is returned both when there are too few ``tag`` elements and
    when the selected element carries no text.
    """
    elements = doc.getElementsByTagName(tag)
    if index < len(elements):
        text = _element_text(elements[index])
        if text is not None:
            return text
    return default


def _required_text(doc, tag, xmlfile):
    """Return the text of the first ``tag`` element or raise with the file name.

    Raises:
        IndexError: no ``tag`` element exists (same exception type as indexing
            an empty result list, but the message names the offending file).
        ValueError: the element exists but contains no text.
    """
    elements = doc.getElementsByTagName(tag)
    if not elements:
        raise IndexError(f"{xmlfile}: no <{tag}> element found")
    text = _element_text(elements[0])
    if text is None:
        raise ValueError(f"{xmlfile}: <{tag}> element is empty")
    return text


def read_oaixml(xmlfile):
    """Parse one harvested OAI Dublin Core record into a flat dict.

    Args:
        xmlfile: path (or open file object) of the XML record to parse.

    Returns:
        dict with the keys
        ``category`` - text of the *second* ``dc:type`` element, or the string
                       "None" when it is absent or empty,
        ``date``     - text of the first ``dc:date`` element,
        ``title``    - text of the first ``dc:title`` element,
        ``authors``  - list with the text of every ``dc:creator`` element,
        ``abstract`` - text of the first ``dc:description`` element, or "None",
        ``url``      - text of the first ``dc:identifier`` element,
        ``keywords`` - list with the text of every ``dc:subject`` element,
        ``license``  - text of the *second* ``dc:rights`` element, or "None".

    Raises:
        IndexError: ``dc:date``, ``dc:title`` or ``dc:identifier`` is missing.
        ValueError: one of those three elements is present but empty.
    """
    # The context manager unlinks the DOM on exit, which releases its memory
    # promptly; only plain strings leave this block.
    with minidom.parse(xmlfile) as doc:
        # NOTE(review): index 1 is kept from the original code - presumably the
        # first dc:type / dc:rights entry is a generic value; verify against
        # the harvested data.
        return {
            'category': _nth_text(doc, "dc:type", 1),
            'date': _required_text(doc, "dc:date", xmlfile),
            'title': _required_text(doc, "dc:title", xmlfile),
            'authors': _all_texts(doc, "dc:creator"),
            'abstract': _nth_text(doc, "dc:description", 0),
            'url': _required_text(doc, "dc:identifier", xmlfile),
            'keywords': _all_texts(doc, "dc:subject"),
            'license': _nth_text(doc, "dc:rights", 1),
        }


# %% Main: reading XML files
def time_elapsed(start, end=None):
    """Format the time elapsed since ``start`` as whole minutes and seconds.

    Args:
        start: ``datetime.datetime`` at which the measured work began.
        end: ``datetime.datetime`` at which it ended; defaults to now.

    Returns:
        A string of the form "Total runtime: M minutes, S seconds".
    """
    if end is None:
        end = datetime.datetime.now()
    # total_seconds() includes whole days; timedelta.seconds alone would wrap
    # around after 24 hours.
    minutes, seconds = divmod(int((end - start).total_seconds()), 60)
    return "Total runtime: " + str(minutes) + " minutes, " + str(seconds) + " seconds"


datadir = './xml-oai/'
# glob returns files in arbitrary order; sorting makes the row order reproducible.
fxml = sorted(glob.glob(datadir + "*.xml"))
zenododata = pd.DataFrame(
    columns=['category', 'date', 'title', 'authors', 'abstract', 'keywords', 'license', 'url', 'files']
)
# Results of this run are written to processed_data/<YYYYMMDD>/.
date = datetime.datetime.now().strftime('%Y%m%d')
pathlib.Path("processed_data/" + date).mkdir(parents=True, exist_ok=True)
# %% Do the work
# Parse every harvested XML file into a dict and collect the dicts in `dicta`.
PROGRESS_EVERY = 50000  # print a progress line after this many processed files

start = datetime.datetime.now()
if reload:
    print("reolading from " + datereload)
    # SECURITY: unpickling executes arbitrary code - only reload files that a
    # previous run of this notebook wrote.
    zenododata = pd.read_pickle("processed_data/" + datereload + "/zenododata.pkl", compression='gzip')
    # len() is the row count the message announces; the count is converted to
    # str because concatenating an int to a str raises TypeError.
    print("from row:" + str(len(zenododata)))

# Records already present in a reloaded frame are skipped below so that saving
# does not store them twice. A set gives constant-time lookups per file.
known_urls = set(zenododata['url']) if reload else set()

print(len(fxml))
dicta = []
for processed, xmlfile in enumerate(fxml, start=1):
    row = read_oaixml(xmlfile)
    if row['url'] not in known_urls:
        dicta.append(row)
    # `==`, not `is`: identity comparison of ints only works by accident of
    # CPython's small-integer cache.
    if processed % PROGRESS_EVERY == 0:
        print(processed, time_elapsed(start))

# %%
len(dicta)

# %% Save the work
# DataFrame.append was removed in pandas 2.0; build one frame from the list of
# dicts and concatenate it once.
# NOTE: this rebinds `zenododata`; re-running the cell appends `dicta` again.
zenododata = pd.concat([zenododata, pd.DataFrame(dicta)], ignore_index=True)
zenododata.to_pickle("processed_data/" + date + "/zenododata.pkl", compression='gzip')