diff --git a/notebooks/Paths_Tomas_preprocessed.md b/notebooks/Paths_Tomas_preprocessed.md index c341e25..1e2d418 100644 --- a/notebooks/Paths_Tomas_preprocessed.md +++ b/notebooks/Paths_Tomas_preprocessed.md @@ -1,56 +1,56 @@ # Paths to preproccesed file Tomas These paths are done in the pyspark, so the hdfs folder is under user=ebouille Array : All arrays are saved in .txt files ### Stop_times Path: -'data/lgpt_guys/stop_times_array_version3.txt' + Structure: [[arrival_time, departure time],[],...] Ordered by: route_id, trip_id, stop_id Table with the same order 'data/lgpt_guys/stop_times_ordered_like_array.csv' ### Routes -'data/lgpt_guys/routes_array_version3.txt' + Structure [[n_Trips, n_stops, Cumsum_trips ("pointer like function"), Cumsum_stops("pointer like function")]] Ordered by: "route_id" ### Route stops Path: -'data/lgpt_guys/route_stops_array_version3.txt' + Structure: [[route_int1], [route_int2],....] (Note: the functions to_numpy() always put brackets between elements of each row, this is the reason the structure is not [route_int1, route_int2,....] ) Ordered by : "route_id", "stop_sequence" ### Stops Path: 'data/lgpt_guys/stops_array_version3.txt' Structure: [[Cumsu transfer ("pointer like function"), cumsum routes ("pointer like function")]] Ordered by: stop_id ### Stop_routes Path: -'data/lgpt_guys/stop_routes_array_version4.txt' + Structure: [[route_int], [route_int],....] Ordered by : "stop_id", "route_id" ### Transfer Path: -'data/lgpt_guys/transfer_array_version3.txt' + Structure: [[stop_int2, Transfer_time_sec],....] Ordered by : "stop_id" \ No newline at end of file diff --git a/notebooks/to_begin_with.ipynb b/notebooks/to_begin_with.ipynb index 0cefeaa..61b25f5 100644 --- a/notebooks/to_begin_with.ipynb +++ b/notebooks/to_begin_with.ipynb @@ -1,4453 +1,3319 @@ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ - "# To Begin With...\n", + "# Preprocessing timetable data\n", "\n", "### Name your spark application as `GASPAR_final` or `GROUP_NAME_final`.\n", "\n", "
Any application without a proper name would be promptly killed.
" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "Current session configs: {'conf': {'spark.app.name': 'lgptguys_final'}, 'kind': 'pyspark'}
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", - "
IDYARN Application IDKindStateSpark UIDriver logCurrent session?
6849application_1589299642358_1346pysparkidleLinkLink
6852application_1589299642358_1349pysparkidleLinkLink
6858application_1589299642358_1352pysparkidleLinkLink
6861application_1589299642358_1355pysparkidleLinkLink
6866application_1589299642358_1360pysparkidleLinkLink
6867application_1589299642358_1361pysparkidleLinkLink
6869application_1589299642358_1363pysparkidleLinkLink
6871application_1589299642358_1365pysparkbusyLinkLink
6872application_1589299642358_1366pysparkidleLinkLink
6874application_1589299642358_1368pysparkidleLinkLink
" + "IDYARN Application IDKindStateSpark UIDriver logCurrent session?7044application_1589299642358_1538pysparkidleLinkLink7075application_1589299642358_1569pysparkbusyLinkLink7078application_1589299642358_1572pysparkidleLinkLink7083application_1589299642358_1577pysparkidleLinkLink7085application_1589299642358_1579pysparkidleLinkLink7093application_1589299642358_1587pysparkidleLinkLink7094application_1589299642358_1589pysparkbusyLinkLink7098application_1589299642358_1593pysparkidleLinkLink7099application_1589299642358_1595pysparkidleLinkLink7103application_1589299642358_1599pysparkidleLinkLink7104application_1589299642358_1600pysparkidleLinkLink7105application_1589299642358_1601pysparkidleLinkLink7107application_1589299642358_1603pysparkidleLinkLink7108application_1589299642358_1604pysparkidleLinkLink" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%%configure\n", "{\"conf\": {\n", " \"spark.app.name\": \"lgptguys_final\"\n", "}}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Start Spark" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Starting Spark application\n" ] }, { "data": { "text/html": [ "\n", - "
IDYARN Application IDKindStateSpark UIDriver logCurrent session?
6875application_1589299642358_1369pysparkidleLinkLink
" + "IDYARN Application IDKindStateSpark UIDriver logCurrent session?7110application_1589299642358_1606pysparkidleLinkLink✔" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "SparkSession available as 'spark'.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "An error was encountered:\n", "unknown magic command '%spark'\n", "UnknownMagic: unknown magic command '%spark'\n", "\n" ] } ], "source": [ "# Initialization\n", "%%spark" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Preprocessing" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from geopy.distance import great_circle\n", "from pyspark.sql.functions import *\n", "import numpy as np\n", "import pandas as pd\n", "from geopy.distance import great_circle\n", "from pyspark.sql.types import DoubleType\n", "from pyspark.sql.types import DateType\n", "\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create list of stops within 15 km" ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Zurich HB coordinates\n", "zurich_geo = (47.378177, 8.540192)" ] }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-------+--------------------+----------------+----------------+-------------+--------------+\n", "|stop_id| stop_name| stop_lat| stop_lon|location_type|parent_station|\n", "+-------+--------------------+----------------+----------------+-------------+--------------+\n", "|1322000| Altoggio|46.1672513851495| 8.345807131427| null| null|\n", "|1322001| Antronapiana| 46.060121674738|8.11361957990831| null| null|\n", "|1322002| Anzola|45.9898698225697|8.34571729989858| null| null|\n", "|1322003| Baceno|46.2614983591677|8.31925293162473| null| null|\n", "|1322004|Beura Cardezza, c...|46.0790618438814|8.29927439970313| null| null|\n", "+-------+--------------------+----------------+----------------+-------------+--------------+\n", "only showing top 5 rows" ] } ], "source": [ "stops = spark.read.csv(\"/data/sbb/timetables/csv/stops/2019/05/14/stops.txt\", header=True, sep = \",\")\n", "stops.show(5)" ] }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#defining udf function\n", "@udf(\"float\")\n", "def great_circle_udf(x, y):\n", " return great_circle(x, y).kilometers" ] }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-----------+--------------------+----------------+----------------+-------------+--------------+\n", "| stop_id| stop_name| stop_lat| stop_lon|location_type|parent_station|\n", "+-----------+--------------------+----------------+----------------+-------------+--------------+\n", "| 8500926|Oetwil a.d.L., Sc...|47.4236270123012| 8.4031825286317| null| null|\n", "| 8502186|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| null| 8502186P|\n", "|8502186:0:1|Dietikon Stoffelbach|47.3934666445388|8.39894248049007| null| 8502186P|\n", "|8502186:0:2|Dietikon Stoffelbach|47.3935274568464|8.39894248049007| null| 8502186P|\n", "| 8502186P|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| 1| null|\n", "+-----------+--------------------+----------------+----------------+-------------+--------------+\n", "only showing top 5 rows" ] } ], "source": [ "#transforming zurich coordinates in columsn\n", "zurich_geo_col = struct(lit(zurich_geo[0]), lit(zurich_geo[1]))\n", "#applying filter function based on distance\n", "stops_15km = stops.filter(great_circle_udf(zurich_geo_col, struct(stops.stop_lat, stops.stop_lon)) < 15)\n", "stops_15km.show(5)" ] }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stops_15km.write.csv('data/lgpt_guys/stops_15km.csv', header = True, mode=\"overwrite\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stops_15km = spark.read.csv('data/lgpt_guys/stops_15km.csv', header = True)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Precompute walking times between stations under 500m distance" ] }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- stop_id: string (nullable = true)\n", " |-- stop_name: string (nullable = true)\n", " |-- stop_lat: string (nullable = true)\n", " |-- stop_lon: string (nullable = true)\n", " |-- location_type: string (nullable = true)\n", - " |-- parent_station: string (nullable = true)\n", - " |-- stop_int: long (nullable = false)" + " |-- parent_station: string (nullable = true)" ] } ], "source": [ "stops_15km.printSchema()" ] }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ - "stops_15km2 = stops_15km.select(stops_15km.stop_id.alias(\"stop_id2\"), stops_15km.stop_int.alias(\"stop_int2\"), stops_15km.stop_lat.alias(\"stop_lat2\"),\\\n", + "#create second dataframe for crossjoin\n", + "stops_15km2 = stops_15km.select(stops_15km.stop_id.alias(\"stop_id2\"), stops_15km.stop_lat.alias(\"stop_lat2\"),\\\n", " stops_15km.stop_lon.alias(\"stop_lon2\") )" ] }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "+-------+--------------------+----------------+---------------+-------------+--------------+--------+-----------+---------+----------------+----------------+\n", - "|stop_id| stop_name| stop_lat| stop_lon|location_type|parent_station|stop_int| stop_id2|stop_int2| stop_lat2| stop_lon2|\n", - "+-------+--------------------+----------------+---------------+-------------+--------------+--------+-----------+---------+----------------+----------------+\n", - "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 0| 8500926| 0|47.4236270123012| 8.4031825286317|\n", - "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 0| 8502186| 1|47.3934058321612|8.39894248049007|\n", - "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 0|8502186:0:1| 2|47.3934666445388|8.39894248049007|\n", - "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 0|8502186:0:2| 3|47.3935274568464|8.39894248049007|\n", - "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 0| 8502186P| 4|47.3934058321612|8.39894248049007|\n", - "+-------+--------------------+----------------+---------------+-------------+--------------+--------+-----------+---------+----------------+----------------+\n", + "+-------+--------------------+----------------+---------------+-------------+--------------+-----------+----------------+----------------+\n", + "|stop_id| stop_name| stop_lat| stop_lon|location_type|parent_station| stop_id2| stop_lat2| stop_lon2|\n", + "+-------+--------------------+----------------+---------------+-------------+--------------+-----------+----------------+----------------+\n", + "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 8500926|47.4236270123012| 8.4031825286317|\n", + "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 8502186|47.3934058321612|8.39894248049007|\n", + "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null|8502186:0:1|47.3934666445388|8.39894248049007|\n", + "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null|8502186:0:2|47.3935274568464|8.39894248049007|\n", + "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 8502186P|47.3934058321612|8.39894248049007|\n", + "+-------+--------------------+----------------+---------------+-------------+--------------+-----------+----------------+----------------+\n", "only showing top 5 rows" ] } ], "source": [ "#creating all combinations of transfers\n", "combination_stops = stops_15km.crossJoin(stops_15km2)\n", "combination_stops.show(5)" ] }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "+-------+--------------------+----------------+---------------+-------------+--------------+--------+-----------+---------+----------------+----------------+\n", - "|stop_id| stop_name| stop_lat| stop_lon|location_type|parent_station|stop_int| stop_id2|stop_int2| stop_lat2| stop_lon2|\n", - "+-------+--------------------+----------------+---------------+-------------+--------------+--------+-----------+---------+----------------+----------------+\n", - "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 0| 8502186| 1|47.3934058321612|8.39894248049007|\n", - "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 0|8502186:0:1| 2|47.3934666445388|8.39894248049007|\n", - "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 0|8502186:0:2| 3|47.3935274568464|8.39894248049007|\n", - "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 0| 8502186P| 4|47.3934058321612|8.39894248049007|\n", - "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 0| 8502187| 5|47.3646945560768|8.37709545277724|\n", - "+-------+--------------------+----------------+---------------+-------------+--------------+--------+-----------+---------+----------------+----------------+\n", + "+-------+--------------------+----------------+---------------+-------------+--------------+-----------+----------------+----------------+\n", + "|stop_id| stop_name| stop_lat| stop_lon|location_type|parent_station| stop_id2| stop_lat2| stop_lon2|\n", + "+-------+--------------------+----------------+---------------+-------------+--------------+-----------+----------------+----------------+\n", + "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 8502186|47.3934058321612|8.39894248049007|\n", + "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null|8502186:0:1|47.3934666445388|8.39894248049007|\n", + "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null|8502186:0:2|47.3935274568464|8.39894248049007|\n", + "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 8502186P|47.3934058321612|8.39894248049007|\n", + "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 8502187|47.3646945560768|8.37709545277724|\n", + "+-------+--------------------+----------------+---------------+-------------+--------------+-----------+----------------+----------------+\n", "only showing top 5 rows" ] } ], "source": [ "#eliminating transfer to same station\n", "combination_stops = combination_stops.filter(combination_stops.stop_id!=combination_stops.stop_id2)\n", "combination_stops.show(5)" ] }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "+-------+--------------------+----------------+----------------+-------------+--------------+--------+-----------+---------+----------------+----------------+------------+\n", - "|stop_id| stop_name| stop_lat| stop_lon|location_type|parent_station|stop_int| stop_id2|stop_int2| stop_lat2| stop_lon2| distance|\n", - "+-------+--------------------+----------------+----------------+-------------+--------------+--------+-----------+---------+----------------+----------------+------------+\n", - "|8500926|Oetwil a.d.L., Sc...|47.4236270123012| 8.4031825286317| null| null| 0| 8590616| 1166|47.4228794433749|8.40437728795975| 0.122429974|\n", - "|8500926|Oetwil a.d.L., Sc...|47.4236270123012| 8.4031825286317| null| null| 0| 8590737| 1270|47.4253712985116|8.40013723981811| 0.30017462|\n", - "|8502186|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| null| 8502186P| 1|8502186:0:1| 2|47.3934666445388|8.39894248049007|0.0067620375|\n", - "|8502186|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| null| 8502186P| 1|8502186:0:2| 3|47.3935274568464|8.39894248049007| 0.013524067|\n", - "|8502186|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| null| 8502186P| 1| 8502186P| 4|47.3934058321612|8.39894248049007| 0.0|\n", - "+-------+--------------------+----------------+----------------+-------------+--------------+--------+-----------+---------+----------------+----------------+------------+\n", + "+-------+--------------------+----------------+----------------+-------------+--------------+-----------+----------------+----------------+------------+\n", + "|stop_id| stop_name| stop_lat| stop_lon|location_type|parent_station| stop_id2| stop_lat2| stop_lon2| distance|\n", + "+-------+--------------------+----------------+----------------+-------------+--------------+-----------+----------------+----------------+------------+\n", + "|8500926|Oetwil a.d.L., Sc...|47.4236270123012| 8.4031825286317| null| null| 8590616|47.4228794433749|8.40437728795975| 0.122429974|\n", + "|8500926|Oetwil a.d.L., Sc...|47.4236270123012| 8.4031825286317| null| null| 8590737|47.4253712985116|8.40013723981811| 0.30017462|\n", + "|8502186|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| null| 8502186P|8502186:0:1|47.3934666445388|8.39894248049007|0.0067620375|\n", + "|8502186|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| null| 8502186P|8502186:0:2|47.3935274568464|8.39894248049007| 0.013524067|\n", + "|8502186|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| null| 8502186P| 8502186P|47.3934058321612|8.39894248049007| 0.0|\n", + "+-------+--------------------+----------------+----------------+-------------+--------------+-----------+----------------+----------------+------------+\n", "only showing top 5 rows" ] } ], "source": [ + "# adding distance\n", "combination_stops = combination_stops.withColumn(\"distance\", great_circle_udf(struct(combination_stops.stop_lat, combination_stops.stop_lon), struct(combination_stops.stop_lat2, combination_stops.stop_lon2)))\n", "combination_stops = combination_stops.filter(combination_stops.distance < 0.5)\n", "combination_stops.show(5)" ] }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "+-------+--------+-----------+---------+-----------------+\n", - "|stop_id|stop_int| stop_id2|stop_int2|Transfer_time_sec|\n", - "+-------+--------+-----------+---------+-----------------+\n", - "|8500926| 0| 8590616| 1166| 146|\n", - "|8500926| 0| 8590737| 1270| 360|\n", - "|8502186| 1|8502186:0:1| 2| 8|\n", - "|8502186| 1|8502186:0:2| 3| 16|\n", - "|8502186| 1| 8502186P| 4| 0|\n", - "+-------+--------+-----------+---------+-----------------+\n", + "+-------+-----------+------------+-----------------+\n", + "|stop_id| stop_id2| distance|Transfer_time_sec|\n", + "+-------+-----------+------------+-----------------+\n", + "|8500926| 8590616| 0.122429974| 146|\n", + "|8500926| 8590737| 0.30017462| 360|\n", + "|8502186|8502186:0:1|0.0067620375| 8|\n", + "|8502186|8502186:0:2| 0.013524067| 16|\n", + "|8502186| 8502186P| 0.0| 0|\n", + "+-------+-----------+------------+-----------------+\n", "only showing top 5 rows" ] } ], "source": [ - "transfer = combination_stops.select(combination_stops.stop_id, combination_stops.stop_int, combination_stops.stop_id2,\\\n", - " combination_stops.stop_int2, ((combination_stops.distance/0.05*60).alias(\"Transfer_time_sec\")).cast(\"integer\"))\n", + "#adding transfer time in seconds\n", + "transfer = combination_stops.select(combination_stops.stop_id, combination_stops.stop_id2,\\\n", + " combination_stops.distance, ((combination_stops.distance/0.05*60).alias(\"Transfer_time_sec\")).cast(\"integer\"))\n", "transfer.show(5)" ] }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- stop_id: string (nullable = true)\n", - " |-- stop_int: long (nullable = false)\n", " |-- stop_id2: string (nullable = true)\n", - " |-- stop_int2: long (nullable = false)\n", + " |-- distance: float (nullable = true)\n", " |-- Transfer_time_sec: integer (nullable = true)" ] } ], "source": [ "transfer.printSchema()" ] }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "transfer.write.csv('data/lgpt_guys/transfers.csv', header = True, mode=\"overwrite\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Clean timetable data\n", "At the end of each step a file reader to be see the dataframe at this step without having to recompute" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Calendar.txt\n", "- drop services that are not on all business days" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "calendar = spark.read.csv(\"/data/sbb/timetables/csv/calendar/2019/05/14/calendar.txt\", header=True, sep = \",\")\n" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+----------+------+-------+---------+--------+------+--------+------+----------+--------+\n", "|service_id|monday|tuesday|wednesday|thursday|friday|saturday|sunday|start_date|end_date|\n", "+----------+------+-------+---------+--------+------+--------+------+----------+--------+\n", "| TA+b0nx9| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b03bf| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b0008| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b0nxg| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b08k4| 1| 0| 0| 0| 0| 0| 0| 20181209|20191214|\n", "+----------+------+-------+---------+--------+------+--------+------+----------+--------+\n", "only showing top 5 rows" ] } ], "source": [ "calendar.show(5)" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+----------+------+-------+---------+--------+------+--------+------+----------+--------+\n", "|service_id|monday|tuesday|wednesday|thursday|friday|saturday|sunday|start_date|end_date|\n", "+----------+------+-------+---------+--------+------+--------+------+----------+--------+\n", "| TA+b0nx9| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b03bf| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b0008| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b0nxg| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b0nxn| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "+----------+------+-------+---------+--------+------+--------+------+----------+--------+\n", "only showing top 5 rows" ] } ], "source": [ "calendar_business_days = calendar.filter((calendar.monday==1) & (calendar.tuesday==1) & (calendar.wednesday==1) & (calendar.thursday==1) & (calendar.friday==1))\n", "calendar_business_days.show(5)" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+----------+\n", "|service_id|\n", "+----------+\n", "| TA+b0nx9|\n", "| TA+b03bf|\n", "| TA+b0008|\n", "| TA+b0nxg|\n", "| TA+b0nxn|\n", "+----------+\n", "only showing top 5 rows" ] } ], "source": [ "service_id_business_days = calendar_business_days.select(calendar_business_days.service_id)\n", "service_id_business_days.show(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### trips.txt\n", "- Drop trips not in business days" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-----------+----------+--------------------+------------------+---------------+------------+\n", "| route_id|service_id| trip_id| trip_headsign|trip_short_name|direction_id|\n", "+-----------+----------+--------------------+------------------+---------------+------------+\n", "|1-1-C-j19-1| TA+b0001|5.TA.1-1-C-j19-1.3.R|Zofingen, Altachen| 108| 1|\n", "|1-1-C-j19-1| TA+b0001|7.TA.1-1-C-j19-1.3.R|Zofingen, Altachen| 112| 1|\n", "|1-1-C-j19-1| TA+b0001|9.TA.1-1-C-j19-1.3.R|Zofingen, Altachen| 116| 1|\n", "|1-1-C-j19-1| TA+b0001|11.TA.1-1-C-j19-1...|Zofingen, Altachen| 120| 1|\n", "|1-1-C-j19-1| TA+b0001|13.TA.1-1-C-j19-1...|Zofingen, Altachen| 124| 1|\n", "+-----------+----------+--------------------+------------------+---------------+------------+\n", "only showing top 5 rows" ] } ], "source": [ "trips = spark.read.csv(\"/data/sbb/timetables/csv/trips/2019/05/14/trips.txt\", header=True, sep = \",\")\n", "trips.show(5)" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+----------+-----------+--------------------+--------------------+---------------+------------+\n", "|service_id| route_id| trip_id| trip_headsign|trip_short_name|direction_id|\n", "+----------+-----------+--------------------+--------------------+---------------+------------+\n", "| TA+b0001|1-1-C-j19-1|46.TA.1-1-C-j19-1...|Aarburg-Oftringen...| 113| 0|\n", "| TA+b0001|1-1-C-j19-1|59.TA.1-1-C-j19-1...|Aarburg-Oftringen...| 139| 0|\n", "| TA+b0001|1-340-j19-1|2.TA.1-340-j19-1.1.H| Wohlen AG, Bahnhof| 105| 0|\n", "| TA+b0001|1-354-j19-1|36.TA.1-354-j19-1...|Kaiserstuhl AG, B...| 35435| 0|\n", "| TA+b0001|1-354-j19-1|47.TA.1-354-j19-1...|Kaiserstuhl AG, B...| 35467| 0|\n", "+----------+-----------+--------------------+--------------------+---------------+------------+\n", "only showing top 5 rows" ] } ], "source": [ "trips_businessdays = service_id_business_days.join(trips, how=\"inner\", on=\"service_id\").dropDuplicates()\n", "trips_businessdays.show(5)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "528368" ] } ], "source": [ "trips_businessdays.count()" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "trips_businessdays.write.csv('data/lgpt_guys/trips_businessdays.csv', header = True, mode=\"overwrite\")" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "trips_businessdays = spark.read.csv('data/lgpt_guys/trips_businessdays.csv', header = True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Stop_times.txt\n", "- Drop trips not in trips_business_days\n", "- Drop stops not inside 15 km radius\n", - "- Drop departures before 7 am and after 9 pm\n", - "- Add stop_int" + "- Drop trips with departures before 7 am and after 7 pm" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+------------+--------------+-----------+-------------+-----------+-------------+\n", "| trip_id|arrival_time|departure_time| stop_id|stop_sequence|pickup_type|drop_off_type|\n", "+--------------------+------------+--------------+-----------+-------------+-----------+-------------+\n", "|1.TA.1-1-B-j19-1.1.R| 04:20:00| 04:20:00|8500010:0:3| 1| 0| 0|\n", "|1.TA.1-1-B-j19-1.1.R| 04:24:00| 04:24:00|8500020:0:3| 2| 0| 0|\n", "|1.TA.1-1-B-j19-1.1.R| 04:28:00| 04:28:00|8500021:0:5| 3| 0| 0|\n", "|1.TA.1-1-B-j19-1.1.R| 04:30:00| 04:30:00|8517131:0:2| 4| 0| 0|\n", "|1.TA.1-1-B-j19-1.1.R| 04:32:00| 04:32:00|8500300:0:5| 5| 0| 0|\n", "+--------------------+------------+--------------+-----------+-------------+-----------+-------------+\n", "only showing top 5 rows" ] } ], "source": [ + "#read file\n", "stop_times = spark.read.csv(\"/data/sbb/timetables/csv/stop_times/2019/05/14/stop_times.txt\", header=True, sep = \",\")\n", "stop_times.show(5)" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+\n", "| trip_id|\n", "+--------------------+\n", "|65.TA.1-1-C-j19-1...|\n", "|34.TA.1-135-j19-1...|\n", "|50.TA.1-136-j19-1...|\n", "|42.TA.1-139-j19-1...|\n", "|39.TA.1-141-j19-1...|\n", "+--------------------+\n", "only showing top 5 rows" ] } ], "source": [ + "#dataframe for join\n", "trip_id_trips_businessdays = trips_businessdays.select(trips_businessdays.trip_id)\n", "trip_id_trips_businessdays.show(5)" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "+--------------------+------------+--------------+-----------+-------------+-----------+-------------+\n", - "| trip_id|arrival_time|departure_time| stop_id|stop_sequence|pickup_type|drop_off_type|\n", - "+--------------------+------------+--------------+-----------+-------------+-----------+-------------+\n", - "|93.TA.1-1-B-j19-1...| 17:24:00| 17:24:00|8500020:0:3| 2| 0| 0|\n", - "|143.TA.1-1-B-j19-...| 07:28:00| 07:28:00|8517131:0:1| 9| 0| 0|\n", - "|164.TA.1-1-B-j19-...| 10:21:00| 10:22:00|8500301:0:4| 6| 0| 0|\n", - "|31.TA.1-1-C-j19-1...| 19:39:00| 19:39:00| 8502384| 19| 0| 0|\n", - "|34.TA.1-1-C-j19-1...| 20:39:00| 20:39:00| 8502384| 19| 0| 0|\n", - "+--------------------+------------+--------------+-----------+-------------+-----------+-------------+\n", + "+--------------------+------------+--------------+-------+-------------+-----------+-------------+\n", + "| trip_id|arrival_time|departure_time|stop_id|stop_sequence|pickup_type|drop_off_type|\n", + "+--------------------+------------+--------------+-------+-------------+-----------+-------------+\n", + "|104.TA.8-6-j19-1.2.H| 07:13:00| 07:13:00|8592375| 1| 0| 0|\n", + "|149.TA.8-6-j19-1.2.H| 08:36:00| 08:36:00|8589151| 21| 0| 0|\n", + "|171.TA.8-6-j19-1.2.H| 16:45:00| 16:45:00|8591767| 14| 0| 0|\n", + "|223.TA.8-6-j19-1.2.H| 10:00:00| 10:00:00|8591767| 14| 0| 0|\n", + "|258.TA.8-6-j19-1.4.R| 17:07:00| 17:07:00|8592378| 20| 0| 0|\n", + "+--------------------+------------+--------------+-------+-------------+-----------+-------------+\n", "only showing top 5 rows" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "--- Logging error ---\n", - "Traceback (most recent call last):\n", - " File \"/opt/conda/lib/python3.7/site-packages/sparkmagic/livyclientlib/livysession.py\", line 54, in run\n", - " self.livy_session.refresh_status_and_info()\n", - " File \"/opt/conda/lib/python3.7/site-packages/sparkmagic/livyclientlib/livysession.py\", line 287, in refresh_status_and_info\n", - " response = self._http_client.get_session(self.id)\n", - " File \"/opt/conda/lib/python3.7/site-packages/sparkmagic/livyclientlib/livyreliablehttpclient.py\", line 39, in get_session\n", - " return self._http_client.get(self._session_url(session_id), [200]).json()\n", - " File \"/opt/conda/lib/python3.7/site-packages/sparkmagic/livyclientlib/reliablehttpclient.py\", line 47, in get\n", - " return self._send_request(relative_url, accepted_status_codes, self._session.get)\n", - " File \"/opt/conda/lib/python3.7/site-packages/sparkmagic/livyclientlib/reliablehttpclient.py\", line 58, in _send_request\n", - " return self._send_request_helper(self.compose_url(relative_url), accepted_status_codes, function, data, 0)\n", - " File \"/opt/conda/lib/python3.7/site-packages/sparkmagic/livyclientlib/reliablehttpclient.py\", line 96, in _send_request_helper\n", - " .format(status, url, text))\n", - "sparkmagic.livyclientlib.exceptions.HttpClientException: Invalid status code '404' from http://iccluster044.iccluster.epfl.ch:8998/sessions/6872 with error payload: \"Session '6872' not found.\"\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/conda/lib/python3.7/logging/__init__.py\", line 1029, in emit\n", - " self.flush()\n", - " File \"/opt/conda/lib/python3.7/logging/__init__.py\", line 1009, in flush\n", - " self.stream.flush()\n", - "OSError: [Errno 12] Cannot allocate memory\n", - "Call stack:\n", - " File \"/opt/conda/lib/python3.7/threading.py\", line 890, in _bootstrap\n", - " self._bootstrap_inner()\n", - " File \"/opt/conda/lib/python3.7/threading.py\", line 926, in _bootstrap_inner\n", - " self.run()\n", - " File \"/opt/conda/lib/python3.7/site-packages/sparkmagic/livyclientlib/livysession.py\", line 61, in run\n", - " self.livy_session.logger.error(u'{}'.format(e))\n", - " File \"/opt/conda/lib/python3.7/site-packages/hdijupyterutils/log.py\", line 24, in error\n", - " self.logger.error(self._transform_log_message(message))\n", - "Message: 'LivySession\\tInvalid status code \\'404\\' from http://iccluster044.iccluster.epfl.ch:8998/sessions/6872 with error payload: \"Session \\'6872\\' not found.\"'\n", - "Arguments: ()\n" - ] } ], "source": [ "#drop trips not in business days\n", "stop_times_business_days = stop_times.join(trip_id_trips_businessdays, how=\"inner\", on = \"trip_id\").dropDuplicates()\n", "stop_times_business_days.show(5)" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "+-----------+--------+\n", - "| stop_id|stop_int|\n", - "+-----------+--------+\n", - "| 8500926| 0|\n", - "| 8502186| 1|\n", - "|8502186:0:1| 2|\n", - "|8502186:0:2| 3|\n", - "| 8502186P| 4|\n", - "+-----------+--------+\n", + "+-----------+\n", + "| stop_id|\n", + "+-----------+\n", + "| 8500926|\n", + "| 8502186|\n", + "|8502186:0:1|\n", + "|8502186:0:2|\n", + "| 8502186P|\n", + "+-----------+\n", "only showing top 5 rows" ] } ], "source": [ - "stop_id_stops_15km = stops_15km.select(stops_15km.stop_id, stops_15km.stop_int)\n", + "stop_id_stops_15km = stops_15km.select(stops_15km.stop_id)\n", "stop_id_stops_15km.show(5)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "+------------+--------------------+------------+--------------+-------------+-----------+-------------+--------+\n", - "| stop_id| trip_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|stop_int|\n", - "+------------+--------------------+------------+--------------+-------------+-----------+-------------+--------+\n", - "|8503000:0:10|113.TA.21-75-j19-...| 07:35:00| 07:35:00| 1| 0| 0| 96|\n", - "| 8591361|2290.TA.26-31-j19...| 24:49:00| 24:49:00| 8| 0| 0| 1721|\n", - "| 8590318|2289.TA.26-31-j19...| 24:52:00| 24:52:00| 7| 0| 0| 1038|\n", - "| 8591219|2629.TA.26-31-j19...| 20:45:00| 20:45:00| 12| 0| 0| 1588|\n", - "| 8591137|2629.TA.26-31-j19...| 21:01:00| 21:01:00| 23| 0| 0| 1510|\n", - "+------------+--------------------+------------+--------------+-------------+-----------+-------------+--------+\n", + "+------------+--------------------+------------+--------------+-------------+-----------+-------------+\n", + "| stop_id| trip_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|\n", + "+------------+--------------------+------------+--------------+-------------+-----------+-------------+\n", + "| 8590314|441.TA.1-1-E-j19-...| 21:17:00| 21:17:00| 32| 0| 0|\n", + "| 8590317|610.TA.1-1-E-j19-...| 06:34:00| 06:34:00| 31| 0| 0|\n", + "|8503000:0:13| 4.TA.1-1-j19-1.4.H| 05:19:00| 05:19:00| 1| 0| 0|\n", + "| 8590276|23.TA.1-10-j19-1.1.R| 16:57:00| 16:57:00| 3| 0| 0|\n", + "| 8594307|36.TA.1-11-B-j19-...| 09:39:00| 09:39:00| 6| 0| 0|\n", + "+------------+--------------------+------------+--------------+-------------+-----------+-------------+\n", "only showing top 5 rows" ] } ], "source": [ "#drop trips not in 15 km radius\n", "stop_times_zurich = stop_times_business_days.join(stop_id_stops_15km, how=\"inner\", on = \"stop_id\").dropDuplicates()\n", "stop_times_zurich.show(5)" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "+-------+--------------------+------------+--------------+-------------+-----------+-------------+--------+\n", - "|stop_id| trip_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|stop_int|\n", - "+-------+--------------------+------------+--------------+-------------+-----------+-------------+--------+\n", - "|8591219|2629.TA.26-31-j19...| 20:45:00| 20:45:00| 12| 0| 0| 1588|\n", - "|8591262|2631.TA.26-31-j19...| 20:30:00| 20:30:00| 14| 0| 0| 1630|\n", - "|8591072|2638.TA.26-31-j19...| 19:59:00| 19:59:00| 30| 0| 0| 1452|\n", - "|8591445|2687.TA.26-31-j19...| 13:55:00| 13:55:00| 32| 0| 0| 1801|\n", - "|8591186|2695.TA.26-31-j19...| 12:21:00| 12:21:00| 7| 0| 0| 1555|\n", - "+-------+--------------------+------------+--------------+-------------+-----------+-------------+--------+\n", + "+------------+--------------------+------------+--------------+-------------+-----------+-------------+--------------+\n", + "| stop_id| trip_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|hour_departure|\n", + "+------------+--------------------+------------+--------------+-------------+-----------+-------------+--------------+\n", + "| 8590314|441.TA.1-1-E-j19-...| 21:17:00| 21:17:00| 32| 0| 0| 21|\n", + "| 8590317|610.TA.1-1-E-j19-...| 06:34:00| 06:34:00| 31| 0| 0| 6|\n", + "|8503000:0:13| 4.TA.1-1-j19-1.4.H| 05:19:00| 05:19:00| 1| 0| 0| 5|\n", + "| 8590276|23.TA.1-10-j19-1.1.R| 16:57:00| 16:57:00| 3| 0| 0| 16|\n", + "| 8594307|36.TA.1-11-B-j19-...| 09:39:00| 09:39:00| 6| 0| 0| 9|\n", + "+------------+--------------------+------------+--------------+-------------+-----------+-------------+--------------+\n", "only showing top 5 rows" ] } ], "source": [ - "#drop departures before 7 am and after 9 pm\n", - "stop_times_zurich_day = stop_times_zurich.filter(hour(stop_times_zurich.departure_time) < 21).filter(hour(stop_times_zurich.departure_time) > 7)\n", - "stop_times_zurich_day.show(5)" + "#create column with hour of departure for filtering\n", + "stop_times_zurich = stop_times_zurich.withColumn(\"hour_departure\", hour(stop_times_zurich[\"departure_time\"]))\n", + "stop_times_zurich.show(5)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "stop_times_zurich_pandas = stop_times_zurich.toPandas()" + ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 398630 entries, 0 to 398629\n", + "Data columns (total 8 columns):\n", + "stop_id 398630 non-null object\n", + "trip_id 398630 non-null object\n", + "arrival_time 398630 non-null object\n", + "departure_time 398630 non-null object\n", + "stop_sequence 398630 non-null object\n", + "pickup_type 398630 non-null object\n", + "drop_off_type 398630 non-null object\n", + "hour_departure 392157 non-null float64\n", + "dtypes: float64(1), object(7)\n", + "memory usage: 24.3+ MB" + ] + } + ], + "source": [ + "stop_times_zurich_pandas.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#dropping trip_ids with departure before 7 and after 7 pm\n", + "trip_id_drop = np.where(((stop_times_zurich_pandas.hour_departure > 19) |\\\n", + " (stop_times_zurich_pandas.hour_departure < 7)),\\\n", + " stop_times_zurich_pandas[\"trip_id\"] , None)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "stop_times_zurich_day = stop_times_zurich_pandas[~stop_times_zurich_pandas[\"trip_id\"].isin(trip_id_drop)]" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " trip_id departure_time\n", + "115309 961.TA.26-70-A-j19-1.5.H 07:00:00\n", + "317110 30.TA.26-817-j19-1.1.H 07:00:00\n", + "210269 117.TA.26-4-B-j19-1.3.R 07:00:00\n", + "382941 521.TA.26-140-j19-1.5.R 07:00:00\n", + "136103 1401.TA.26-140-j19-1.8.H 07:00:00" + ] + } + ], + "source": [ + "stop_times_zurich_day[[\"trip_id\", \"departure_time\"]].sort_values(\"departure_time\").head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "stop_times_zurich_day = spark.createDataFrame(stop_times_zurich_day)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_zurich_day.write.csv('data/lgpt_guys/stop_times_zurich_day.csv', header = True, mode=\"overwrite\")" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_zurich_day = spark.read.csv('data/lgpt_guys/stop_times_zurich_day.csv', header = True)" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "296583" + "300632" ] } ], "source": [ "stop_times_zurich_day.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### trips.txt\n", "- drop trips not inside zurich area\n", - "- drop trips active only before 7am and 9pm\n", - "- Add route_int" + "- drop trips active only before 7am and 9pm" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+\n", "| trip_id|\n", "+--------------------+\n", - "|247.TA.16-13-j19-...|\n", - "|246.TA.16-5-j19-1...|\n", - "|67.TA.17-4-j19-1....|\n", - "|32.TA.21-75-j19-1...|\n", - "|2667.TA.26-31-j19...|\n", + "|240.TA.25-75-j19-...|\n", + "|67.TA.26-811-j19-...|\n", + "|134.TA.26-811-j19...|\n", + "|154.TA.26-811-j19...|\n", + "|72.TA.26-812-j19-...|\n", "+--------------------+\n", "only showing top 5 rows" ] } ], "source": [ - "#trips inside zurich sarea and add route_int\n", + "#trips inside zurich sarea \n", "trips_zurich = stop_times_zurich_day.select(stop_times_zurich_day.trip_id)\n", "trips_zurich.show(5)" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+----------+-------------+------------------+---------------+------------+\n", "| trip_id|service_id| route_id| trip_headsign|trip_short_name|direction_id|\n", "+--------------------+----------+-------------+------------------+---------------+------------+\n", "|1349.TA.26-13-j19...| TA+b0003| 26-13-j19-1|Zürich, Frankental| 3781| 1|\n", "|902.TA.26-14-A-j1...| TA+b0003|26-14-A-j19-1| Zürich, Seebach| 3535| 1|\n", "|25.TA.26-25-A-j19...| TA+b0003|26-25-A-j19-1| Zürich, Dolder| 108| 1|\n", "|335.TA.26-25-A-j1...| TA+b0003|26-25-A-j19-1| Zürich, Römerhof| 54| 0|\n", "|161.TA.26-302-j19...| TA+b0003| 26-302-j19-1| Dietikon, Bahnhof| 8727| 1|\n", "+--------------------+----------+-------------+------------------+---------------+------------+\n", "only showing top 5 rows" ] } ], "source": [ "#drop trips not in zurich and during the day\n", "trips_zurich_day = trips_businessdays.join(trips_zurich, how=\"inner\", on=\"trip_id\").dropDuplicates()\n", "trips_zurich_day.show(5)" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "trips_zurich_day.write.csv('data/lgpt_guys/trips_zurich_day.csv', header = True, mode=\"overwrite\")" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "trips_zurich_day = spark.read.csv('data/lgpt_guys/trips_zurich_day.csv', header = True)" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "24386" + "24647" ] } ], "source": [ "trips_zurich_day.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### stop_times.txt\n", "- drop day speficic trip \n", "- Drop trips that are trimmed in 2 by 15 km radius" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+------------+\n", - "| trip_id|stop_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|stop_int| route_id|\n", - "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+------------+\n", - "|2680.TA.26-31-j19...|8588078| 14:28:00| 14:28:00| 16| 0| 0| 975| 26-31-j19-1|\n", - "|253.TA.26-62-j19-...|8591118| 12:26:00| 12:26:00| 7| 0| 0| 1493| 26-62-j19-1|\n", - "|71.TA.26-650-j19-...|8575937| 09:56:00| 09:56:00| 3| 0| 0| 642|26-650-j19-1|\n", - "|37.TA.26-720-j19-...|8576133| 12:26:00| 12:26:00| 9| 0| 0| 672|26-720-j19-1|\n", - "|68.TA.26-721-j19-...|8503155| 15:52:00| 15:52:00| 7| 0| 0| 271|26-721-j19-1|\n", - "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+------------+\n", + "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------------+------------+\n", + "| trip_id|stop_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|hour_departure| route_id|\n", + "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------------+------------+\n", + "|595.TA.26-140-j19...|8573162| 15:37:00| 15:37:00| 6| 0| 0| 15.0|26-140-j19-1|\n", + "|384.TA.26-5-B-j19...|8591329| 14:23:00| 14:23:00| 13| 0| 0| 14.0|26-5-B-j19-1|\n", + "|601.TA.26-67-j19-...|8591126| 14:28:00| 14:28:00| 7| 0| 0| 14.0| 26-67-j19-1|\n", + "|1367.TA.26-7-B-j1...|8591276| 16:58:00| 16:58:00| 22| 0| 0| 16.0|26-7-B-j19-1|\n", + "|40.TA.26-727-j19-...|8590640| 14:44:00| 14:44:00| 8| 0| 0| 14.0|26-727-j19-1|\n", + "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------------+------------+\n", "only showing top 5 rows" ] } ], "source": [ "#drop day specific trips\n", "stop_times_w_routes = stop_times_zurich_day.join(trips_zurich_day.select(trips_zurich_day.route_id, trips_zurich_day.trip_id), \\\n", " how=\"inner\", on=\"trip_id\").dropDuplicates()\n", "stop_times_w_routes.show(5)" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ - "unique_route_time = stop_times_w_routes.dropDuplicates(['route_id', 'arrival_time', 'departure_time', 'stop_sequence', 'stop_id', 'stop_int'])\n" + "# selecting relevant columns\n", + "unique_route_time = stop_times_w_routes.dropDuplicates(['route_id', 'arrival_time', 'departure_time', 'stop_sequence', 'stop_id'])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Dropping routes if routes have been trimmed in 2" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#creating pandas dataframe to use shift function\n", "stop_times_route_pandas1 = unique_route_time.orderBy(\"trip_id\", \"stop_sequence\").toPandas()" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ + "#saving in another variable to avoid making again pandas conversion\n", "stop_times_route_pandas = stop_times_route_pandas1" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ - "stop_times_route_pandas[\"sequence_1\"] = stop_times_route_pandas[\"stop_sequence\"].shift(-1)\n" + "#creating shift to compare sequence column\n", + "stop_times_route_pandas[\"sequence_1\"] = stop_times_route_pandas[\"stop_sequence\"].shift(-1, fill_value=0)\n" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - " stop_sequence sequence_1\n", - "0 1 10\n", - "1 10 11\n", - "2 11 12\n", - "3 12 13\n", - "4 13 14" + " stop_sequence sequence_1\n", + "0 1 10\n", + "1 10 11\n", + "2 11 12\n", + "3 12 13\n", + "4 13 14" ] } ], "source": [ "stop_times_route_pandas[[\"stop_sequence\", \"sequence_1\"]].head(5)" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ + "#transforming in int\n", "stop_times_route_pandas[\"stop_sequence\"] = pd.to_numeric(stop_times_route_pandas[\"stop_sequence\"])\n", "stop_times_route_pandas[\"sequence_1\"] = pd.to_numeric(stop_times_route_pandas[\"sequence_1\"])" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ - "stop_times_route_pandas[\"trip_1\"] = stop_times_route_pandas[\"trip_id\"].shift(-1)\n" + "#creating shift to compare trips column\n", + "\n", + "stop_times_route_pandas[\"trip_1\"] = stop_times_route_pandas[\"trip_id\"].shift(-1, fill_value=0)\n" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ " trip_id trip_1 stop_sequence sequence_1\n", - "0 1.TA.1-231-j19-1.1.H 1.TA.1-231-j19-1.1.H 1 10.0\n", - "1 1.TA.1-231-j19-1.1.H 1.TA.1-231-j19-1.1.H 10 11.0\n", - "2 1.TA.1-231-j19-1.1.H 1.TA.1-231-j19-1.1.H 11 12.0\n", - "3 1.TA.1-231-j19-1.1.H 1.TA.1-231-j19-1.1.H 12 13.0\n", - "4 1.TA.1-231-j19-1.1.H 1.TA.1-231-j19-1.1.H 13 14.0" + "0 1.TA.1-231-j19-1.1.H 1.TA.1-231-j19-1.1.H 1 10\n", + "1 1.TA.1-231-j19-1.1.H 1.TA.1-231-j19-1.1.H 10 11\n", + "2 1.TA.1-231-j19-1.1.H 1.TA.1-231-j19-1.1.H 11 12\n", + "3 1.TA.1-231-j19-1.1.H 1.TA.1-231-j19-1.1.H 12 13\n", + "4 1.TA.1-231-j19-1.1.H 1.TA.1-231-j19-1.1.H 13 14" ] } ], "source": [ "stop_times_route_pandas[[\"trip_id\",\"trip_1\",\"stop_sequence\", \"sequence_1\"]].head(5)" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ + "#dropping trips that has been trimmed\n", "trip_id_drop = np.where(((stop_times_route_pandas[\"trip_id\"] == stop_times_route_pandas[\"trip_1\"]) &\\\n", " ((stop_times_route_pandas[\"stop_sequence\"]+1) < stop_times_route_pandas[\"sequence_1\"])),\\\n", " stop_times_route_pandas[\"trip_id\"] , None)" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_route_pandas_clean = stop_times_route_pandas[~stop_times_route_pandas[\"trip_id\"].isin(trip_id_drop)]" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_route_pandas = stop_times_route_pandas.astype(str)" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ + "#recreate dataframe in spark to write the csv\n", "stop_times_route = spark.createDataFrame(stop_times_route_pandas)" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+-----------+----------+--------------------+\n", - "| trip_id|stop_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|stop_int| route_id|sequence_1| trip_1|\n", - "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+-----------+----------+--------------------+\n", - "|1.TA.1-231-j19-1.1.H|8572747| 09:37:00| 09:37:00| 1| 0| 0| 500|1-231-j19-1| 10.0|1.TA.1-231-j19-1.1.H|\n", - "|1.TA.1-231-j19-1.1.H|8573721| 09:50:00| 09:50:00| 10| 0| 0| 599|1-231-j19-1| 11.0|1.TA.1-231-j19-1.1.H|\n", - "|1.TA.1-231-j19-1.1.H|8503598| 09:53:00| 09:53:00| 11| 0| 0| 401|1-231-j19-1| 12.0|1.TA.1-231-j19-1.1.H|\n", - "|1.TA.1-231-j19-1.1.H|8573720| 09:55:00| 09:59:00| 12| 0| 0| 598|1-231-j19-1| 13.0|1.TA.1-231-j19-1.1.H|\n", - "|1.TA.1-231-j19-1.1.H|8503598| 10:00:00| 10:00:00| 13| 0| 0| 401|1-231-j19-1| 14.0|1.TA.1-231-j19-1.1.H|\n", - "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+-----------+----------+--------------------+\n", + "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------------+-----------+----------+--------------------+\n", + "| trip_id|stop_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|hour_departure| route_id|sequence_1| trip_1|\n", + "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------------+-----------+----------+--------------------+\n", + "|1.TA.1-231-j19-1.1.H|8572747| 09:37:00| 09:37:00| 1| 0| 0| 9.0|1-231-j19-1| 10|1.TA.1-231-j19-1.1.H|\n", + "|1.TA.1-231-j19-1.1.H|8573721| 09:50:00| 09:50:00| 10| 0| 0| 9.0|1-231-j19-1| 11|1.TA.1-231-j19-1.1.H|\n", + "|1.TA.1-231-j19-1.1.H|8503598| 09:53:00| 09:53:00| 11| 0| 0| 9.0|1-231-j19-1| 12|1.TA.1-231-j19-1.1.H|\n", + "|1.TA.1-231-j19-1.1.H|8573720| 09:55:00| 09:59:00| 12| 0| 0| 9.0|1-231-j19-1| 13|1.TA.1-231-j19-1.1.H|\n", + "|1.TA.1-231-j19-1.1.H|8503598| 10:00:00| 10:00:00| 13| 0| 0| 10.0|1-231-j19-1| 14|1.TA.1-231-j19-1.1.H|\n", + "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------------+-----------+----------+--------------------+\n", "only showing top 5 rows" ] } ], "source": [ "stop_times_route.show(5)" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_route.write.csv('data/lgpt_guys/stop_times_route.csv', header = True, mode=\"overwrite\")" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_route = spark.read.csv('data/lgpt_guys/stop_times_route.csv', header = True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Trips.txt\n", "- dropping trips who have been trimmed" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+\n", "| trip_id|\n", "+--------------------+\n", - "|292.TA.26-304-j19...|\n", - "|3.TA.26-23-j19-1.3.H|\n", - "|302.TA.26-652-j19...|\n", - "|304.TA.26-61-j19-...|\n", - "|305.TA.26-912-j19...|\n", + "|103.TA.26-925-j19...|\n", + "|104.TA.26-733-j19...|\n", + "|1087.TA.26-5-B-j1...|\n", + "|109.TA.1-1-E-j19-...|\n", + "|109.TA.79-24-j19-...|\n", "+--------------------+\n", "only showing top 5 rows" ] } ], "source": [ + "#creating dataframe\n", "trips_zurich_day_clean = stop_times_route.select(\"trip_id\").dropDuplicates()\n", "trips_zurich_day_clean.show(5)\n" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "22984" + ] + } + ], + "source": [ + "#chack difference in number of trips\n", + "trips_zurich_day_clean.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24647" + ] + } + ], + "source": [ + "trips_zurich_day.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "+--------------------+----------+-------------+--------------------+---------------+------------+\n", - "| trip_id|service_id| route_id| trip_headsign|trip_short_name|direction_id|\n", - "+--------------------+----------+-------------+--------------------+---------------+------------+\n", - "|644.TA.26-67-j19-...| TA+b0003| 26-67-j19-1|Zürich, Dunkelhölzli| 4388| 1|\n", - "|1223.TA.26-72-j19...| TA+b0003| 26-72-j19-1| Zürich, Milchbuck| 4912| 1|\n", - "|337.TA.26-145-j19...| TA+b0fal| 26-145-j19-1| Thalwil, Zentrum| 3900| 1|\n", - "|3717.TA.26-8-C-j1...| TA+b0fe2| 26-8-C-j19-1| Zürich, Klusplatz| 105| 0|\n", - "|607.TA.26-75-A-j1...| TA+b0d7h|26-75-A-j19-1|Zürich, Seebacher...| 3298| 1|\n", - "+--------------------+----------+-------------+--------------------+---------------+------------+\n", + "+--------------------+----------+------------+--------------------+---------------+------------+\n", + "| trip_id|service_id| route_id| trip_headsign|trip_short_name|direction_id|\n", + "+--------------------+----------+------------+--------------------+---------------+------------+\n", + "|1223.TA.26-72-j19...| TA+b0003| 26-72-j19-1| Zürich, Milchbuck| 4912| 1|\n", + "|644.TA.26-67-j19-...| TA+b0003| 26-67-j19-1|Zürich, Dunkelhölzli| 4388| 1|\n", + "|632.TA.26-67-j19-...| TA+b0ig5| 26-67-j19-1|Zürich, Dunkelhölzli| 4293| 1|\n", + "|286.TA.26-11-j19-...| TA+b0s5d| 26-11-j19-1| Sennhof-Kyburg| 19139| 0|\n", + "|337.TA.26-145-j19...| TA+b0fal|26-145-j19-1| Thalwil, Zentrum| 3900| 1|\n", + "+--------------------+----------+------------+--------------------+---------------+------------+\n", "only showing top 5 rows" ] } ], "source": [ + "#join to delete trimmed trips\n", "trips_zurich_day_clean_complete = trips_zurich_day.join(trips_zurich_day_clean, on=\"trip_id\", how=\"inner\" ).dropDuplicates()\n", "trips_zurich_day_clean_complete.show(5)" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "22807" + "22984" ] } ], "source": [ "trips_zurich_day_clean_complete.count()" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "trips_zurich_day_clean_complete.write.csv('data/lgpt_guys/trips_zurich_day_clean_complete.csv', header = True, mode=\"overwrite\")" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "trips_zurich_day_clean_complete = spark.read.csv('data/lgpt_guys/trips_zurich_day_clean_complete.csv', header = True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### routes.txt\n", "- Drop routes not used during all business days and outside radius 15 km \n", "- drop routes trimmed\n", "- Give directionality to routes\n" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-----------+---------+----------------+---------------+----------+----------+\n", "| route_id|agency_id|route_short_name|route_long_name|route_desc|route_type|\n", "+-----------+---------+----------------+---------------+----------+----------+\n", "|11-40-j19-1| 801| 040| null| Bus| 700|\n", "|11-61-j19-1| 7031| 061| null| Bus| 700|\n", "|11-62-j19-1| 7031| 062| null| Bus| 700|\n", "|24-64-j19-1| 801| 064| null| Bus| 700|\n", "|11-83-j19-1| 801| 083| null| Bus| 700|\n", "+-----------+---------+----------------+---------------+----------+----------+\n", "only showing top 5 rows" ] } ], "source": [ "routes = spark.read.csv(\"/data/sbb/timetables/csv/routes/2019/05/14/routes.txt\", header=True, sep = \",\")\n", "routes.show(5)" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+------------+------------+\n", "| route_id|direction_id|\n", "+------------+------------+\n", - "| 26-69-j19-1| 0|\n", - "|26-3-A-j19-1| 1|\n", - "|26-301-j19-1| 0|\n", - "|26-140-j19-1| 0|\n", - "| 26-80-j19-1| 0|\n", + "| 26-E-j19-1| 0|\n", + "| 26-13-j19-1| 1|\n", + "|26-33E-j19-1| 0|\n", + "|26-650-j19-1| 1|\n", + "|26-533-j19-1| 0|\n", "+------------+------------+\n", "only showing top 5 rows" ] } ], "source": [ "#select routes within 15 km, active on business days and not trimmed\n", "route_zurich_clean = trips_zurich_day_clean_complete.select(trips_zurich_day_clean_complete.route_id, trips_zurich_day_clean_complete.direction_id)\n", "route_zurich_clean.show(5)" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+------------+---------+----------------+---------------+----------+----------+------------+\n", "| route_id|agency_id|route_short_name|route_long_name|route_desc|route_type|direction_id|\n", "+------------+---------+----------------+---------------+----------+----------+------------+\n", "|26-6-A-j19-1| 11| 6| null| S-Bahn| 400| 1|\n", "|54-4-Y-j19-1| 11| 4-Y| null| ICE| 102| 0|\n", "|26-842-j19-1| 838| 842| null| Bus| 700| 0|\n", "|12-E03-j19-1| 767| E03| null| Bus| 700| 1|\n", "|26-912-j19-1| 849| 912| null| Bus| 700| 0|\n", "+------------+---------+----------------+---------------+----------+----------+------------+\n", "only showing top 5 rows" ] } ], "source": [ "#Drop routes not within 15km and active on business days\n", "routes_non_directional = routes.join(route_zurich_clean, how=\"inner\", on=\"route_id\").dropDuplicates()\n", "routes_non_directional.show(5)" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "+-------------+------------+--------------------+---------+----------+----------------------------+---------------------------+\n", - "| route_id|direction_id|route_id_directional|agency_id|route_type|route_short_name_directional|route_long_name_directional|\n", - "+-------------+------------+--------------------+---------+----------+----------------------------+---------------------------+\n", - "| 26-6-A-j19-1| 1| 26-6-A-j19-1-1| 11| 400| 6-1| null|\n", - "| 54-4-Y-j19-1| 0| 54-4-Y-j19-1-0| 11| 102| 4-Y-0| null|\n", - "| 26-842-j19-1| 0| 26-842-j19-1-0| 838| 700| 842-0| null|\n", - "| 26-912-j19-1| 0| 26-912-j19-1-0| 849| 700| 912-0| null|\n", - "|90-71-Y-j19-1| 0| 90-71-Y-j19-1-0| 180| 1000| 71-Y-0| null|\n", - "+-------------+------------+--------------------+---------+----------+----------------------------+---------------------------+\n", + "+------------+------------+--------------------+---------+----------+----------------------------+---------------------------+\n", + "| route_id|direction_id|route_id_directional|agency_id|route_type|route_short_name_directional|route_long_name_directional|\n", + "+------------+------------+--------------------+---------+----------+----------------------------+---------------------------+\n", + "|26-6-A-j19-1| 1| 26-6-A-j19-1-1| 11| 400| 6-1| null|\n", + "|54-4-Y-j19-1| 0| 54-4-Y-j19-1-0| 11| 102| 4-Y-0| null|\n", + "|26-842-j19-1| 0| 26-842-j19-1-0| 838| 700| 842-0| null|\n", + "|12-E03-j19-1| 1| 12-E03-j19-1-1| 767| 700| E03-1| null|\n", + "|26-912-j19-1| 0| 26-912-j19-1-0| 849| 700| 912-0| null|\n", + "+------------+------------+--------------------+---------+----------+----------------------------+---------------------------+\n", "only showing top 5 rows" ] } ], "source": [ "#Give directionality to routes\n", "routes_directional = routes_non_directional.select(routes_non_directional.route_id, routes_non_directional.direction_id, concat(routes_non_directional.route_id, lit(\"-\"), routes_non_directional.direction_id).alias(\"route_id_directional\"),\\\n", " routes_non_directional.agency_id, routes_non_directional.route_type,\\\n", " concat(routes_non_directional.route_short_name, lit(\"-\"), routes_non_directional.direction_id).alias(\"route_short_name_directional\"), \\\n", " concat(routes_non_directional.route_long_name, lit(\"-\"), routes_non_directional.direction_id).alias(\"route_long_name_directional\"))\n", "routes_directional.show(5)" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+-------------+------------+--------------------+---------+----------+----------------------------+---------------------------+----------+\n", - "| route_id|direction_id|route_id_directional|agency_id|route_type|route_short_name_directional|route_long_name_directional| route_int|\n", - "+-------------+------------+--------------------+---------+----------+----------------------------+---------------------------+----------+\n", - "| 26-6-A-j19-1| 1| 26-6-A-j19-1-1| 11| 400| 6-1| null| 0|\n", - "| 54-4-Y-j19-1| 0| 54-4-Y-j19-1-0| 11| 102| 4-Y-0| null| 1|\n", - "| 26-842-j19-1| 0| 26-842-j19-1-0| 838| 700| 842-0| null|8589934592|\n", - "| 26-912-j19-1| 0| 26-912-j19-1-0| 849| 700| 912-0| null|8589934593|\n", - "|90-71-Y-j19-1| 0| 90-71-Y-j19-1-0| 180| 1000| 71-Y-0| null|8589934594|\n", - "+-------------+------------+--------------------+---------+----------+----------------------------+---------------------------+----------+\n", - "only showing top 5 rows" - ] } ], "source": [ - "routes_curated = routes_directional.withColumn(\"route_int\", monotonically_increasing_id() )\n", - "routes_curated.orderBy(\"route_int\").show(5)" + "routes_curated = routes_directional" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "routes_curated.write.csv('data/lgpt_guys/routes_curated.csv', header = True, mode=\"overwrite\")" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "routes_curated = spark.read.csv('data/lgpt_guys/routes_curated.csv', header = True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### trip.txt\n", - "- add route_int" + "- add route information" ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "+------------+------------+--------------------+----------+--------------------+---------------+-------------+\n", - "| route_id|direction_id| trip_id|service_id| trip_headsign|trip_short_name| route_int|\n", - "+------------+------------+--------------------+----------+--------------------+---------------+-------------+\n", - "| 26-69-j19-1| 0|1201.TA.26-69-j19...| TA+b0003| Zürich, Milchbuck| 1255|1451698946048|\n", - "|26-3-A-j19-1| 1|1513.TA.26-3-A-j1...| TA+b0j3n| Zürich, Albisrieden| 3206| 386547056640|\n", - "|26-301-j19-1| 0|246.TA.26-301-j19...| TA+b0003| Dietikon, Bahnhof| 3608|1477468749824|\n", - "|26-140-j19-1| 0|988.TA.26-140-j19...| TA+b0nfk|Langnau am Albis,...| 542|1176821039104|\n", - "| 26-80-j19-1| 0|529.TA.26-80-j19-...| TA+b0003|Zürich, Triemlisp...| 637|1554778161152|\n", - "+------------+------------+--------------------+----------+--------------------+---------------+-------------+\n", + "+------------+------------+--------------------+----------+--------------------+---------------+\n", + "| route_id|direction_id| trip_id|service_id| trip_headsign|trip_short_name|\n", + "+------------+------------+--------------------+----------+--------------------+---------------+\n", + "| 26-E-j19-1| 0|163.TA.26-E-j19-1...| TA+b0d73|Zürich, ETH/Unive...| 1850|\n", + "| 26-13-j19-1| 1|1365.TA.26-13-j19...| TA+b0003| Zürich, Frankental| 3862|\n", + "|26-33E-j19-1| 0|641.TA.26-33E-j19...| TA+b0003|Zürich, Kirche Fl...| 1234|\n", + "|26-650-j19-1| 1|36.TA.26-650-j19-...| TA+b0003|Tagelswangen, Buc...| 3752|\n", + "|26-533-j19-1| 0|65.TA.26-533-j19-...| TA+b0003|Niederhasli, Nass...| 3545|\n", + "+------------+------------+--------------------+----------+--------------------+---------------+\n", "only showing top 5 rows" ] } ], "source": [ - "trips_curated = trips_zurich_day_clean_complete.join(routes_curated.select(\"route_id\", \"direction_id\", \"route_int\"), how=\"left\", on=[\"route_id\", \"direction_id\"])\n", + "trips_curated = trips_zurich_day_clean_complete.join(routes_curated.select(\"route_id\", \"direction_id\"), how=\"left\", on=[\"route_id\", \"direction_id\"])\n", "trips_curated.show(5)" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "trips_curated.write.csv('data/lgpt_guys/trips_curated.csv', header = True, mode=\"overwrite\")" ] }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "trips_curated = spark.read.csv('data/lgpt_guys/trips_curated.csv', header = True)" ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "22807" + "22984" ] } ], "source": [ "trips_curated.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Stop_times.txt\n", - "- add route_int" + "- save as stop_times_curated" ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+-----------+----------+--------------------+------------+\n", - "| trip_id|stop_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|stop_int| route_id|sequence_1| trip_1| route_int|\n", - "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+-----------+----------+--------------------+------------+\n", - "|1.TA.1-231-j19-1.1.H|8572747| 09:37:00| 09:37:00| 1| 0| 0| 500|1-231-j19-1| 10.0|1.TA.1-231-j19-1.1.H|592705486850|\n", - "|1.TA.1-231-j19-1.1.H|8573721| 09:50:00| 09:50:00| 10| 0| 0| 599|1-231-j19-1| 11.0|1.TA.1-231-j19-1.1.H|592705486850|\n", - "|1.TA.1-231-j19-1.1.H|8503598| 09:53:00| 09:53:00| 11| 0| 0| 401|1-231-j19-1| 12.0|1.TA.1-231-j19-1.1.H|592705486850|\n", - "|1.TA.1-231-j19-1.1.H|8573720| 09:55:00| 09:59:00| 12| 0| 0| 598|1-231-j19-1| 13.0|1.TA.1-231-j19-1.1.H|592705486850|\n", - "|1.TA.1-231-j19-1.1.H|8503598| 10:00:00| 10:00:00| 13| 0| 0| 401|1-231-j19-1| 14.0|1.TA.1-231-j19-1.1.H|592705486850|\n", - "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+-----------+----------+--------------------+------------+\n", + "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------------+-----------+----------+--------------------+\n", + "| trip_id|stop_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|hour_departure| route_id|sequence_1| trip_1|\n", + "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------------+-----------+----------+--------------------+\n", + "|1.TA.1-231-j19-1.1.H|8572747| 09:37:00| 09:37:00| 1| 0| 0| 9.0|1-231-j19-1| 10|1.TA.1-231-j19-1.1.H|\n", + "|1.TA.1-231-j19-1.1.H|8573721| 09:50:00| 09:50:00| 10| 0| 0| 9.0|1-231-j19-1| 11|1.TA.1-231-j19-1.1.H|\n", + "|1.TA.1-231-j19-1.1.H|8503598| 09:53:00| 09:53:00| 11| 0| 0| 9.0|1-231-j19-1| 12|1.TA.1-231-j19-1.1.H|\n", + "|1.TA.1-231-j19-1.1.H|8573720| 09:55:00| 09:59:00| 12| 0| 0| 9.0|1-231-j19-1| 13|1.TA.1-231-j19-1.1.H|\n", + "|1.TA.1-231-j19-1.1.H|8503598| 10:00:00| 10:00:00| 13| 0| 0| 10.0|1-231-j19-1| 14|1.TA.1-231-j19-1.1.H|\n", + "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------------+-----------+----------+--------------------+\n", "only showing top 5 rows" ] } ], "source": [ - "stop_times_curated = stop_times_route.join(trips_curated.select(\"trip_id\", \"route_int\"), how=\"left\", on=\"trip_id\")\n", + "stop_times_curated = stop_times_route\n", "stop_times_curated.show(5)" ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_curated.write.csv('data/lgpt_guys/stop_times_curated.csv', header = True, mode=\"overwrite\")" ] }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_curated = spark.read.csv('data/lgpt_guys/stop_times_curated.csv', header = True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read preprocessed files" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Transfer time walking distance between zurich stops" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "+-------+--------+-----------+---------+-----------------+\n", - "|stop_id|stop_int| stop_id2|stop_int2|Transfer_time_sec|\n", - "+-------+--------+-----------+---------+-----------------+\n", - "|8500926| 0| 8590616| 1166| 146|\n", - "|8500926| 0| 8590737| 1270| 360|\n", - "|8502186| 1|8502186:0:1| 2| 8|\n", - "|8502186| 1|8502186:0:2| 3| 16|\n", - "|8502186| 1| 8502186P| 4| 0|\n", - "+-------+--------+-----------+---------+-----------------+\n", + "+-------+-----------+------------+-----------------+\n", + "|stop_id| stop_id2| distance|Transfer_time_sec|\n", + "+-------+-----------+------------+-----------------+\n", + "|8500926| 8590616| 0.122429974| 146|\n", + "|8500926| 8590737| 0.30017462| 360|\n", + "|8502186|8502186:0:1|0.0067620375| 8|\n", + "|8502186|8502186:0:2| 0.013524067| 16|\n", + "|8502186| 8502186P| 0.0| 0|\n", + "+-------+-----------+------------+-----------------+\n", "only showing top 5 rows" ] } ], "source": [ "transfer = spark.read.csv('data/lgpt_guys/transfers.csv', header=True)\n", "transfer.show(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Stops within a 15 km radius of Zurich" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "+-----------+--------------------+----------------+----------------+-------------+--------------+--------+\n", - "| stop_id| stop_name| stop_lat| stop_lon|location_type|parent_station|stop_int|\n", - "+-----------+--------------------+----------------+----------------+-------------+--------------+--------+\n", - "| 8500926|Oetwil a.d.L., Sc...|47.4236270123012| 8.4031825286317| null| null| 0|\n", - "| 8502186|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| null| 8502186P| 1|\n", - "|8502186:0:1|Dietikon Stoffelbach|47.3934666445388|8.39894248049007| null| 8502186P| 2|\n", - "|8502186:0:2|Dietikon Stoffelbach|47.3935274568464|8.39894248049007| null| 8502186P| 3|\n", - "| 8502186P|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| 1| null| 4|\n", - "+-----------+--------------------+----------------+----------------+-------------+--------------+--------+\n", + "+-----------+--------------------+----------------+----------------+-------------+--------------+\n", + "| stop_id| stop_name| stop_lat| stop_lon|location_type|parent_station|\n", + "+-----------+--------------------+----------------+----------------+-------------+--------------+\n", + "| 8500926|Oetwil a.d.L., Sc...|47.4236270123012| 8.4031825286317| null| null|\n", + "| 8502186|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| null| 8502186P|\n", + "|8502186:0:1|Dietikon Stoffelbach|47.3934666445388|8.39894248049007| null| 8502186P|\n", + "|8502186:0:2|Dietikon Stoffelbach|47.3935274568464|8.39894248049007| null| 8502186P|\n", + "| 8502186P|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| 1| null|\n", + "+-----------+--------------------+----------------+----------------+-------------+--------------+\n", "only showing top 5 rows" ] } ], "source": [ "stops_15km = spark.read.csv('data/lgpt_guys/stops_15km.csv', header = True)\n", "stops_15km.show(5)" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "1883" ] } ], "source": [ "stops_15km.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Trips during business days and in 15 km radius of zurich" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "+------------+------------+--------------------+----------+--------------------+---------------+-------------+\n", - "| route_id|direction_id| trip_id|service_id| trip_headsign|trip_short_name| route_int|\n", - "+------------+------------+--------------------+----------+--------------------+---------------+-------------+\n", - "| 26-69-j19-1| 0|1201.TA.26-69-j19...| TA+b0003| Zürich, Milchbuck| 1255|1451698946048|\n", - "|26-3-A-j19-1| 1|1513.TA.26-3-A-j1...| TA+b0j3n| Zürich, Albisrieden| 3206| 386547056640|\n", - "|26-301-j19-1| 0|246.TA.26-301-j19...| TA+b0003| Dietikon, Bahnhof| 3608|1477468749824|\n", - "|26-140-j19-1| 0|988.TA.26-140-j19...| TA+b0nfk|Langnau am Albis,...| 542|1176821039104|\n", - "| 26-80-j19-1| 0|529.TA.26-80-j19-...| TA+b0003|Zürich, Triemlisp...| 637|1554778161152|\n", - "+------------+------------+--------------------+----------+--------------------+---------------+-------------+\n", + "+------------+------------+--------------------+----------+--------------------+---------------+\n", + "| route_id|direction_id| trip_id|service_id| trip_headsign|trip_short_name|\n", + "+------------+------------+--------------------+----------+--------------------+---------------+\n", + "| 26-E-j19-1| 0|163.TA.26-E-j19-1...| TA+b0d73|Zürich, ETH/Unive...| 1850|\n", + "| 26-13-j19-1| 1|1365.TA.26-13-j19...| TA+b0003| Zürich, Frankental| 3862|\n", + "|26-33E-j19-1| 0|641.TA.26-33E-j19...| TA+b0003|Zürich, Kirche Fl...| 1234|\n", + "|26-650-j19-1| 1|36.TA.26-650-j19-...| TA+b0003|Tagelswangen, Buc...| 3752|\n", + "|26-533-j19-1| 0|65.TA.26-533-j19-...| TA+b0003|Niederhasli, Nass...| 3545|\n", + "+------------+------------+--------------------+----------+--------------------+---------------+\n", "only showing top 5 rows" ] } ], "source": [ "trips_curated = spark.read.csv('data/lgpt_guys/trips_curated.csv', header = True)\n", "trips_curated.show(5)" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "22807" + "22984" ] } ], "source": [ "trips_curated.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Stop_times of stops inside 15 km and with trips on business days" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+-----------+----------+--------------------+------------+\n", - "| trip_id|stop_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|stop_int| route_id|sequence_1| trip_1| route_int|\n", - "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+-----------+----------+--------------------+------------+\n", - "|1.TA.1-231-j19-1.1.H|8572747| 09:37:00| 09:37:00| 1| 0| 0| 500|1-231-j19-1| 10.0|1.TA.1-231-j19-1.1.H|592705486850|\n", - "|1.TA.1-231-j19-1.1.H|8573721| 09:50:00| 09:50:00| 10| 0| 0| 599|1-231-j19-1| 11.0|1.TA.1-231-j19-1.1.H|592705486850|\n", - "|1.TA.1-231-j19-1.1.H|8503598| 09:53:00| 09:53:00| 11| 0| 0| 401|1-231-j19-1| 12.0|1.TA.1-231-j19-1.1.H|592705486850|\n", - "|1.TA.1-231-j19-1.1.H|8573720| 09:55:00| 09:59:00| 12| 0| 0| 598|1-231-j19-1| 13.0|1.TA.1-231-j19-1.1.H|592705486850|\n", - "|1.TA.1-231-j19-1.1.H|8503598| 10:00:00| 10:00:00| 13| 0| 0| 401|1-231-j19-1| 14.0|1.TA.1-231-j19-1.1.H|592705486850|\n", - "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+-----------+----------+--------------------+------------+\n", + "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------------+-----------+----------+--------------------+\n", + "| trip_id|stop_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|hour_departure| route_id|sequence_1| trip_1|\n", + "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------------+-----------+----------+--------------------+\n", + "|1.TA.1-231-j19-1.1.H|8572747| 09:37:00| 09:37:00| 1| 0| 0| 9.0|1-231-j19-1| 10|1.TA.1-231-j19-1.1.H|\n", + "|1.TA.1-231-j19-1.1.H|8573721| 09:50:00| 09:50:00| 10| 0| 0| 9.0|1-231-j19-1| 11|1.TA.1-231-j19-1.1.H|\n", + "|1.TA.1-231-j19-1.1.H|8503598| 09:53:00| 09:53:00| 11| 0| 0| 9.0|1-231-j19-1| 12|1.TA.1-231-j19-1.1.H|\n", + "|1.TA.1-231-j19-1.1.H|8573720| 09:55:00| 09:59:00| 12| 0| 0| 9.0|1-231-j19-1| 13|1.TA.1-231-j19-1.1.H|\n", + "|1.TA.1-231-j19-1.1.H|8503598| 10:00:00| 10:00:00| 13| 0| 0| 10.0|1-231-j19-1| 14|1.TA.1-231-j19-1.1.H|\n", + "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------------+-----------+----------+--------------------+\n", "only showing top 5 rows" ] } ], "source": [ "stop_times_curated = spark.read.csv('data/lgpt_guys/stop_times_curated.csv', header = True)\n", "stop_times_curated.show(5)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- trip_id: string (nullable = true)\n", " |-- stop_id: string (nullable = true)\n", " |-- arrival_time: string (nullable = true)\n", " |-- departure_time: string (nullable = true)\n", " |-- stop_sequence: string (nullable = true)\n", " |-- pickup_type: string (nullable = true)\n", " |-- drop_off_type: string (nullable = true)\n", - " |-- stop_int: string (nullable = true)\n", + " |-- hour_departure: string (nullable = true)\n", " |-- route_id: string (nullable = true)\n", " |-- sequence_1: string (nullable = true)\n", - " |-- trip_1: string (nullable = true)\n", - " |-- route_int: string (nullable = true)" + " |-- trip_1: string (nullable = true)" ] } ], "source": [ "stop_times_curated.printSchema()" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "247920" + "250446" ] } ], "source": [ "stop_times_curated.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Routes inside 15 km and active during business days" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "+------------+------------+--------------------+---------+----------+----------------------------+---------------------------+-------------+\n", - "| route_id|direction_id|route_id_directional|agency_id|route_type|route_short_name_directional|route_long_name_directional| route_int|\n", - "+------------+------------+--------------------+---------+----------+----------------------------+---------------------------+-------------+\n", - "|26-33E-j19-1| 0| 26-33E-j19-1-0| 849| 700| 33E-0| null|1657857376256|\n", - "|26-734-j19-1| 1| 26-734-j19-1-1| 773| 700| 734-1| null|1657857376257|\n", - "|26-132-j19-1| 0| 26-132-j19-1-0| 807| 700| 132-0| null|1657857376258|\n", - "| 26-91-j19-1| 0| 26-91-j19-1-0| 849| 700| 91-0| null|1657857376259|\n", - "|26-142-j19-1| 1| 26-142-j19-1-1| 807| 700| 142-1| null|1657857376260|\n", - "+------------+------------+--------------------+---------+----------+----------------------------+---------------------------+-------------+\n", + "+------------+------------+--------------------+---------+----------+----------------------------+---------------------------+\n", + "| route_id|direction_id|route_id_directional|agency_id|route_type|route_short_name_directional|route_long_name_directional|\n", + "+------------+------------+--------------------+---------+----------+----------------------------+---------------------------+\n", + "|26-33E-j19-1| 0| 26-33E-j19-1-0| 849| 700| 33E-0| null|\n", + "| 26-91-j19-1| 0| 26-91-j19-1-0| 849| 700| 91-0| null|\n", + "|26-734-j19-1| 1| 26-734-j19-1-1| 773| 700| 734-1| null|\n", + "|26-132-j19-1| 0| 26-132-j19-1-0| 807| 700| 132-0| null|\n", + "|26-745-j19-1| 0| 26-745-j19-1-0| 849| 700| 745-0| null|\n", + "+------------+------------+--------------------+---------+----------+----------------------------+---------------------------+\n", "only showing top 5 rows" ] } ], "source": [ "routes_curated = spark.read.csv('data/lgpt_guys/routes_curated.csv', header = True)\n", "routes_curated.show(5)" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "507" + "521" ] } ], "source": [ "routes_curated.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Raptor specific preprocessing" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "StopTimes: [[departure_route0_trip0_stop0, arrival_route0_trip0_stop_0], [departure_route0_trip0_stop1, arrival_route0_trip0_stop_1], …], [[departure_route0_trip1_stop0, arrival_route0_trip1_stop_0], …], ….], [[[departure_route1_trip0_stop0, arrival_route1_trip0_stop_0], …], [[departure_route1_trip1_stop0, arrival_route0_trip1_stop_0], …], ….], …]" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "stop_times_curated = stop_times_curated.withColumn(\"arrival_time_stamp\", to_timestamp(stop_times_curated.arrival_time, 'HH:mm:ss')).withColumn(\"departure_time_stamp\", to_timestamp(stop_times_curated.departure_time, 'HH:mm:ss'))" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+-----------+----------+--------------------+------------+-------------------+--------------------+\n", - "| trip_id|stop_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|stop_int| route_id|sequence_1| trip_1| route_int| arrival_time_stamp|departure_time_stamp|\n", - "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+-----------+----------+--------------------+------------+-------------------+--------------------+\n", - "|100.TA.1-1-E-j19-...|8578679| 08:49:00| 08:49:00| 1| 0| 0| 791|1-1-E-j19-1| 2.0|100.TA.1-1-E-j19-...|884763262977|1970-01-01 08:49:00| 1970-01-01 08:49:00|\n", - "|100.TA.1-1-E-j19-...|8590314| 08:50:00| 08:50:00| 2| 0| 0| 1036|1-1-E-j19-1| 3.0|100.TA.1-1-E-j19-...|884763262977|1970-01-01 08:50:00| 1970-01-01 08:50:00|\n", - "|100.TA.1-1-E-j19-...|8590317| 08:51:00| 08:51:00| 3| 0| 0| 1037|1-1-E-j19-1| 19.0|100.TA.1-321-j19-...|884763262977|1970-01-01 08:51:00| 1970-01-01 08:51:00|\n", - "|101.TA.1-1-E-j19-...|8578679| 09:04:00| 09:04:00| 1| 0| 0| 791|1-1-E-j19-1| 2.0|101.TA.1-1-E-j19-...|884763262977|1970-01-01 09:04:00| 1970-01-01 09:04:00|\n", - "|101.TA.1-1-E-j19-...|8590314| 09:05:00| 09:05:00| 2| 0| 0| 1036|1-1-E-j19-1| 3.0|101.TA.1-1-E-j19-...|884763262977|1970-01-01 09:05:00| 1970-01-01 09:05:00|\n", - "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+-----------+----------+--------------------+------------+-------------------+--------------------+\n", - "only showing top 5 rows" - ] - } - ], - "source": [ - "stop_times_ordered = stop_times_curated.orderBy(\"route_id\", \"trip_id\", \"stop_sequence\")\n", - "stop_times_ordered.show(5)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "stop_times_ordered.write.csv('data/lgpt_guys/stop_times_ordered_like_array.csv', header = True, mode=\"overwrite\")" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "stop_times_ordered = stop_times_ordered.toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "stop_times_ordered_pandas = stop_times_ordered" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " arrival_time arrival_time_stamp\n", - "0 08:49:00 1970-01-01 08:49:00\n", - "1 08:50:00 1970-01-01 08:50:00\n", - "2 08:51:00 1970-01-01 08:51:00\n", - "3 09:04:00 1970-01-01 09:04:00\n", - "4 09:05:00 1970-01-01 09:05:00" - ] - } - ], - "source": [ - "stop_times_ordered_pandas[[\"arrival_time\", \"arrival_time_stamp\"]].head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "stop_times_ordered_pandas['arrival_time'] = pd.to_datetime(stop_times_ordered_pandas['arrival_time'])\n", - "stop_times_ordered_pandas['departure_time'] = pd.to_datetime(stop_times_ordered_pandas['departure_time'])" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " arrival_time arrival_time_stamp\n", - "0 2020-05-19 08:49:00 1970-01-01 08:49:00\n", - "1 2020-05-19 08:50:00 1970-01-01 08:50:00\n", - "2 2020-05-19 08:51:00 1970-01-01 08:51:00\n", - "3 2020-05-19 09:04:00 1970-01-01 09:04:00\n", - "4 2020-05-19 09:05:00 1970-01-01 09:05:00" - ] - } - ], - "source": [ - "stop_times_ordered_pandas[[\"arrival_time\", \"arrival_time_stamp\"]].head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Adding None to first arrival time and last departure time" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "stop_times_ordered_pandas['departure_time'] = np.where((stop_times_ordered_pandas[\"stop_sequence\"] > stop_times_ordered_pandas[\"sequence_1\"]), None, stop_times_ordered_pandas['departure_time'])" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "stop_times_ordered_pandas[\"arrival_time\"] = np.where((stop_times_ordered_pandas[\"stop_sequence\"] == 1), None, stop_times_ordered_pandas['arrival_time'])" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " arrival_time departure_time stop_sequence sequence_1\n", - "0 1589878140000000000 1589878140000000000 1 2.0\n", - "1 1589878200000000000 1589878200000000000 2 3.0\n", - "2 1589878260000000000 None 3 19.0\n", - "3 1589879040000000000 1589879040000000000 1 2.0\n", - "4 1589879100000000000 1589879100000000000 2 3.0" - ] - } - ], - "source": [ - "stop_times_ordered_pandas[[\"arrival_time\",\"departure_time\", \"stop_sequence\", \"sequence_1\"]].head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "stop_times_ordered_pandas['arrival_time'] = pd.to_datetime(stop_times_ordered_pandas['arrival_time'])\n", - "stop_times_ordered_pandas['departure_time'] = pd.to_datetime(stop_times_ordered_pandas['departure_time'])" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " arrival_time arrival_time_stamp\n", - "0 2020-05-19 08:49:00 1970-01-01 08:49:00\n", - "1 2020-05-19 08:50:00 1970-01-01 08:50:00\n", - "2 2020-05-19 08:51:00 1970-01-01 08:51:00\n", - "3 2020-05-19 09:04:00 1970-01-01 09:04:00\n", - "4 2020-05-19 09:05:00 1970-01-01 09:05:00" - ] - } - ], - "source": [ - "stop_times_ordered_pandas[[\"arrival_time\", \"arrival_time_stamp\"]].head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 247920 entries, 0 to 247919\n", - "Data columns (total 14 columns):\n", - "trip_id 247920 non-null object\n", - "stop_id 247920 non-null object\n", - "arrival_time 247920 non-null datetime64[ns]\n", - "departure_time 226565 non-null datetime64[ns]\n", - "stop_sequence 247920 non-null object\n", - "pickup_type 247920 non-null object\n", - "drop_off_type 247920 non-null object\n", - "stop_int 247920 non-null object\n", - "route_id 247920 non-null object\n", - "sequence_1 247920 non-null object\n", - "trip_1 247920 non-null object\n", - "route_int 247920 non-null object\n", - "arrival_time_stamp 247920 non-null datetime64[ns]\n", - "departure_time_stamp 247920 non-null datetime64[ns]\n", - "dtypes: datetime64[ns](4), object(10)\n", - "memory usage: 26.5+ MB" - ] - } - ], - "source": [ - "stop_times_ordered.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "array([['2020-05-19T08:49:00.000000000', '2020-05-19T08:49:00.000000000'],\n", - " ['2020-05-19T08:50:00.000000000', '2020-05-19T08:50:00.000000000'],\n", - " ['2020-05-19T08:51:00.000000000', 'NaT'],\n", - " ...,\n", - " ['2020-05-19T16:07:00.000000000', '2020-05-19T16:07:00.000000000'],\n", - " ['2020-05-19T16:11:00.000000000', '2020-05-19T16:15:00.000000000'],\n", - " ['2020-05-19T16:19:00.000000000', 'NaT']],\n", - " dtype='datetime64[ns]')" - ] - } - ], - "source": [ - "stop_times_array = stop_times_ordered_pandas[[\"arrival_time\", \"departure_time\"]].to_numpy()\n", - "stop_times_array" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "247920" - ] - } - ], - "source": [ - "np.size(stop_times_array,0)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sc.parallelize(stop_times_array).saveAsTextFile('data/lgpt_guys/stop_times_array_version3.txt')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Routes: [[route0_nr. Stops, route0_nr.Trips, route0_pointerStops, route0_pointerTrip],[ route1_nr. Stops, route1_nr. Trips, route1_pointerStops, route1_pointerTrip],…]" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+------------+-------+-------+\n", - "| route_id|n_Trips|n_stops|\n", - "+------------+-------+-------+\n", - "| 1-1-E-j19-1| 140| 3|\n", - "| 1-10-j19-1| 37| 4|\n", - "|1-11-B-j19-1| 48| 5|\n", - "| 1-16-j19-1| 24| 8|\n", - "|1-17-A-j19-1| 106| 19|\n", - "+------------+-------+-------+\n", - "only showing top 5 rows" - ] - } - ], - "source": [ - "distinct_trips_stops = stop_times_curated.groupBy(\"route_id\").agg(countDistinct(\"trip_id\").alias(\"n_Trips\"), countDistinct(\"stop_id\").alias(\"n_stops\")).orderBy(\"route_id\")\n", - "distinct_trips_stops.show(5)" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "distinct_trips_stops_pandas = distinct_trips_stops.toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " route_id n_Trips n_stops Cumsum_trips Cumsum_stops\n", - "0 1-1-E-j19-1 140 3 140 3\n", - "1 1-10-j19-1 37 4 177 7\n", - "2 1-11-B-j19-1 48 5 225 12\n", - "3 1-16-j19-1 24 8 249 20\n", - "4 1-17-A-j19-1 106 19 355 39" - ] - } - ], - "source": [ - "distinct_trips_stops_pandas['Cumsum_trips'] = distinct_trips_stops_pandas.n_Trips.cumsum()\n", - "distinct_trips_stops_pandas['Cumsum_stops'] = distinct_trips_stops_pandas.n_stops.cumsum()\n", - "distinct_trips_stops_pandas.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "array([[ 140, 3, 140, 3],\n", - " [ 37, 4, 177, 7],\n", - " [ 48, 5, 225, 12],\n", - " ...,\n", - " [ 84, 2, 22709, 3275],\n", - " [ 85, 2, 22794, 3277],\n", - " [ 13, 5, 22807, 3282]])" - ] - } - ], - "source": [ - "routes_array = distinct_trips_stops_pandas[['n_Trips', 'n_stops', 'Cumsum_trips', 'Cumsum_stops']].to_numpy()\n", - "routes_array" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "279" - ] - } - ], - "source": [ - "np.size(routes_array, 0)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sc.parallelize(routes_array).saveAsTextFile('data/lgpt_guys/routes_array_version3.txt')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "RouteStops: [route0_stop0, route0_stop1,…, route1_stop0, route1_stop1,…, …]\n" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+-------------+--------+\n", - "| route_int|stop_int|\n", - "+-------------+--------+\n", - "|1460288880640| 783|\n", - "| 25769803776| 476|\n", - "|1176821039104| 510|\n", - "|1297080123392| 1094|\n", - "|1297080123392| 1817|\n", - "+-------------+--------+\n", - "only showing top 5 rows" - ] - } - ], - "source": [ - "route_stops = stop_times_curated.orderBy(\"route_id\", \"stop_sequence\").select('route_int', 'stop_int').dropDuplicates()\n", - "route_stops.show(5)" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "array([[1678],\n", - " [ 510],\n", - " [1482],\n", - " ...,\n", - " [1551],\n", - " [ 718],\n", - " [ 963]])" - ] - } - ], - "source": [ - "route_stops_array = np.array(route_stops.select(route_stops.stop_int.cast(\"integer\")).collect())\n", - "route_stops_array" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "5545" - ] - } - ], - "source": [ - "np.size(route_stops_array, 0)" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "sc.parallelize(route_stops_array).saveAsTextFile('data/lgpt_guys/route_stops_array_version3.txt')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Stops: [[stop0_pointerRoutes, stop0_pointerTransfer], [stop1_pointerRoutes, stop1_pointerTransfer], …]" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+--------+-------------+-------+--------+---------+-----------------+\n", - "|stop_int| route_int|stop_id|stop_id2|stop_int2|Transfer_time_sec|\n", - "+--------+-------------+-------+--------+---------+-----------------+\n", - "| 1015| 523986010115|8590204| 8590203| 1014| 277|\n", - "| 1015| 523986010115|8590204| 8590201| 1012| 296|\n", - "| 1015| 523986010115|8590204| 8590200| 1011| 167|\n", - "| 1482|1640677507072|8591105| 8591384| 1744| 495|\n", - "| 1482|1640677507072|8591105| 8591299| 1665| 367|\n", - "+--------+-------------+-------+--------+---------+-----------------+\n", - "only showing top 5 rows" - ] - } - ], - "source": [ - "stops_join = route_stops.join(transfer, how=\"inner\", on=\"stop_int\").dropDuplicates()\n", - "stops_join.show(5)" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+-----------+--------+--------+-----------+\n", - "| stop_id|stop_int|n_Routes|n_Transfers|\n", - "+-----------+--------+--------+-----------+\n", - "| 8500926| 0| 1| 2|\n", - "|8502186:0:1| 2| 1| 7|\n", - "|8502186:0:2| 3| 1| 7|\n", - "|8502187:0:1| 6| 1| 3|\n", - "|8502187:0:2| 7| 1| 3|\n", - "+-----------+--------+--------+-----------+\n", - "only showing top 5 rows" - ] - } - ], - "source": [ - "distinct_route_transfers = stops_join.groupBy(\"stop_id\", \"stop_int\").agg(countDistinct(\"route_int\").alias(\"n_Routes\"), countDistinct(\"stop_int2\").alias(\"n_Transfers\")).orderBy(\"stop_id\")\n", - "distinct_route_transfers.show(5)" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "distinct_route_transfers_pandas = distinct_route_transfers.toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " stop_id stop_int n_Routes n_Transfers Cumsum_routes Cumsum_transfers\n", - "0 8500926 0 1 2 1 2\n", - "1 8502186:0:1 2 1 7 2 9\n", - "2 8502186:0:2 3 1 7 3 16\n", - "3 8502187:0:1 6 1 3 4 19\n", - "4 8502187:0:2 7 1 3 5 22" - ] - } - ], - "source": [ - "distinct_route_transfers_pandas['Cumsum_routes'] = distinct_route_transfers_pandas.n_Routes.cumsum()\n", - "distinct_route_transfers_pandas['Cumsum_transfers'] = distinct_route_transfers_pandas.n_Transfers.cumsum()\n", - "distinct_route_transfers_pandas.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "array([[ 1, 2],\n", - " [ 2, 9],\n", - " [ 3, 16],\n", - " ...,\n", - " [ 5387, 10654],\n", - " [ 5389, 10669],\n", - " [ 5398, 10702]])" - ] - } - ], - "source": [ - "stops_array = distinct_route_transfers_pandas[['Cumsum_routes', 'Cumsum_transfers']].to_numpy()\n", - "stops_array" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1512" - ] - } - ], - "source": [ - "np.size(stops_array, 0)" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "sc.parallelize(stops_array).saveAsTextFile('data/lgpt_guys/stops_array_version3.txt')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "StopRoutes: [stop0_route1, stop0_route3, stop1_route1, stop2_route1, stop1_route4, …]" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "array([[1460288880641],\n", - " [ 180388626432],\n", - " [ 481036337153],\n", - " ...,\n", - " [ 317827579907],\n", - " [ 618475290627],\n", - " [ 317827579906]])" - ] - } - ], - "source": [ - "stop_routes_array = np.array(route_stops.orderBy(\"stop_id\", \"route_id\").select(route_stops.route_int).collect()).astype(int)\n", - "stop_routes_array" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "sc.parallelize(stop_routes_array).saveAsTextFile('data/lgpt_guys/stop_routes_array_version4.txt')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Transfer: [[[stop0_nameTargetStop1, transferTime1], [stop0_nameTargetStop2, transferTime2],….], [stop1_nameTargetStop1, transferTime1], [stop1_nameTargetStop2, transferTime2],….],…]" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "root\n", - " |-- stop_id: string (nullable = true)\n", - " |-- stop_int: string (nullable = true)\n", - " |-- stop_id2: string (nullable = true)\n", - " |-- stop_int2: string (nullable = true)\n", - " |-- Transfer_time_sec: string (nullable = true)" - ] - } - ], - "source": [ - "transfer.printSchema()" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "array([[1166, 146],\n", - " [1270, 360],\n", - " [ 2, 8],\n", - " ...,\n", - " [1650, 255],\n", - " [1728, 161],\n", - " [1739, 519]])" - ] - } - ], - "source": [ - "transfer_array = np.array(transfer.orderBy(\"stop_id\").select(transfer.stop_int2.cast(\"integer\"), transfer.Transfer_time_sec.cast(\"integer\")).collect())\n", - "transfer_array" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "12564" - ] - } - ], - "source": [ - "np.size(transfer_array, 0)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 65, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "sc.parallelize(transfer_array).saveAsTextFile('data/lgpt_guys/transfer_array_version3.txt')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Open question\n" - ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "PySpark", "language": "", "name": "pysparkkernel" }, "language_info": { "codemirror_mode": { "name": "python", "version": 3 }, "mimetype": "text/x-python", "name": "pyspark", "pygments_lexer": "python3" } }, "nbformat": 4, "nbformat_minor": 4 }