diff --git a/notebooks/old_notebooks/OLD_Arrays_to_pickle.ipynb b/notebooks/old_notebooks/OLD_Arrays_to_pickle.ipynb
deleted file mode 100644
index 490e7a9..0000000
--- a/notebooks/old_notebooks/OLD_Arrays_to_pickle.ipynb
+++ /dev/null
@@ -1,3924 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Preprocessing part 2: preparing the arrays\n",
- "In this notebook we take 2 datasets prepared in spark: stop_times and transfers, and prepare them into the array format needed to run RAPTOR"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Outline\n",
- "In this notebook the following actions are performed:\n",
- "- Transform stop_ids with platform information into the parent station stop_id\n",
- "- Keep only trips with a departure after 7 am and before 7 pm\n",
- "- Delete trips which only have 1 stop\n",
- "- Create integer IDs for routes, trips and stops, following the definition of the RAPTOR algorithm in Stop_times\n",
- "- Add integer IDs to transfers and keep only stops that are inside the stop_times dataset"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Import packages"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "import numpy as np\n",
- "import pickle\n",
- "import itertools"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Read files\n",
- "Before running make sure the .csv files are in /data . If not run notebook \"transfer_to_local\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Unnamed: 0 | \n",
- " route_id | \n",
- " stop_id_general | \n",
- " trip_id | \n",
- " stop_id | \n",
- " arrival_time | \n",
- " departure_time | \n",
- " stop_sequence | \n",
- " stop_name | \n",
- " stop_lat | \n",
- " stop_lon | \n",
- " trip_headsign | \n",
- " trip_short_name | \n",
- " direction_id | \n",
- " departure_first_stop | \n",
- " route_int | \n",
- " stop_count | \n",
- " stop_int | \n",
- " route_desc | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 0 | \n",
- " 26-66-j19-1 | \n",
- " 8591205 | \n",
- " 17.TA.26-66-j19-1.1.H | \n",
- " 8591205 | \n",
- " 17:00:00 | \n",
- " 17:00:00 | \n",
- " 3 | \n",
- " Zürich, Hürlimannplatz | \n",
- " 47.365066 | \n",
- " 8.526539 | \n",
- " Zürich, Neubühl | \n",
- " 3870 | \n",
- " 0 | \n",
- " 16:55:00 | \n",
- " 1225 | \n",
- " 12 | \n",
- " 1317 | \n",
- " Bus | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 1 | \n",
- " 26-66-j19-1 | \n",
- " 8591415 | \n",
- " 17.TA.26-66-j19-1.1.H | \n",
- " 8591415 | \n",
- " 17:02:00 | \n",
- " 17:02:00 | \n",
- " 4 | \n",
- " Zürich, Waffenplatzstrasse | \n",
- " 47.361482 | \n",
- " 8.525749 | \n",
- " Zürich, Neubühl | \n",
- " 3870 | \n",
- " 0 | \n",
- " 16:55:00 | \n",
- " 1225 | \n",
- " 12 | \n",
- " 1267 | \n",
- " Bus | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 2 | \n",
- " 26-66-j19-1 | \n",
- " 8591204 | \n",
- " 17.TA.26-66-j19-1.1.H | \n",
- " 8591204 | \n",
- " 17:03:00 | \n",
- " 17:03:00 | \n",
- " 5 | \n",
- " Zürich, Hügelstrasse | \n",
- " 47.358543 | \n",
- " 8.526997 | \n",
- " Zürich, Neubühl | \n",
- " 3870 | \n",
- " 0 | \n",
- " 16:55:00 | \n",
- " 1225 | \n",
- " 12 | \n",
- " 67 | \n",
- " Bus | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 3 | \n",
- " 26-66-j19-1 | \n",
- " 8591098 | \n",
- " 17.TA.26-66-j19-1.1.H | \n",
- " 8591098 | \n",
- " 17:04:00 | \n",
- " 17:04:00 | \n",
- " 6 | \n",
- " Zürich, Brunau/Mutschellenstr. | \n",
- " 47.355147 | \n",
- " 8.527141 | \n",
- " Zürich, Neubühl | \n",
- " 3870 | \n",
- " 0 | \n",
- " 16:55:00 | \n",
- " 1225 | \n",
- " 12 | \n",
- " 512 | \n",
- " Bus | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 4 | \n",
- " 26-66-j19-1 | \n",
- " 8591392 | \n",
- " 17.TA.26-66-j19-1.1.H | \n",
- " 8591392 | \n",
- " 17:05:00 | \n",
- " 17:05:00 | \n",
- " 7 | \n",
- " Zürich, Thujastrasse | \n",
- " 47.350187 | \n",
- " 8.527806 | \n",
- " Zürich, Neubühl | \n",
- " 3870 | \n",
- " 0 | \n",
- " 16:55:00 | \n",
- " 1225 | \n",
- " 12 | \n",
- " 403 | \n",
- " Bus | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Unnamed: 0 route_id stop_id_general trip_id stop_id \\\n",
- "0 0 26-66-j19-1 8591205 17.TA.26-66-j19-1.1.H 8591205 \n",
- "1 1 26-66-j19-1 8591415 17.TA.26-66-j19-1.1.H 8591415 \n",
- "2 2 26-66-j19-1 8591204 17.TA.26-66-j19-1.1.H 8591204 \n",
- "3 3 26-66-j19-1 8591098 17.TA.26-66-j19-1.1.H 8591098 \n",
- "4 4 26-66-j19-1 8591392 17.TA.26-66-j19-1.1.H 8591392 \n",
- "\n",
- " arrival_time departure_time stop_sequence stop_name \\\n",
- "0 17:00:00 17:00:00 3 Zürich, Hürlimannplatz \n",
- "1 17:02:00 17:02:00 4 Zürich, Waffenplatzstrasse \n",
- "2 17:03:00 17:03:00 5 Zürich, Hügelstrasse \n",
- "3 17:04:00 17:04:00 6 Zürich, Brunau/Mutschellenstr. \n",
- "4 17:05:00 17:05:00 7 Zürich, Thujastrasse \n",
- "\n",
- " stop_lat stop_lon trip_headsign trip_short_name direction_id \\\n",
- "0 47.365066 8.526539 Zürich, Neubühl 3870 0 \n",
- "1 47.361482 8.525749 Zürich, Neubühl 3870 0 \n",
- "2 47.358543 8.526997 Zürich, Neubühl 3870 0 \n",
- "3 47.355147 8.527141 Zürich, Neubühl 3870 0 \n",
- "4 47.350187 8.527806 Zürich, Neubühl 3870 0 \n",
- "\n",
- " departure_first_stop route_int stop_count stop_int route_desc \n",
- "0 16:55:00 1225 12 1317 Bus \n",
- "1 16:55:00 1225 12 1267 Bus \n",
- "2 16:55:00 1225 12 67 Bus \n",
- "3 16:55:00 1225 12 512 Bus \n",
- "4 16:55:00 1225 12 403 Bus "
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#stop_times\n",
- "stop_times_curated = pd.read_csv(\"../data/stop_times_final_cyril.csv\")\n",
- "stop_times_curated.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 59,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Unnamed: 0 | \n",
- " route_id | \n",
- " stop_id | \n",
- " trip_id | \n",
- " arrival_time | \n",
- " departure_time | \n",
- " stop_sequence | \n",
- " direction_id | \n",
- " stop_name | \n",
- " route_desc | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 0 | \n",
- " 26-759-j19-1 | \n",
- " 8573205:0:K | \n",
- " 1330.TA.26-759-j19-1.7.R | \n",
- " 05:28:00 | \n",
- " 05:28:00 | \n",
- " 1 | \n",
- " 1 | \n",
- " Zürich Flughafen, Bahnhof | \n",
- " Bus | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 1 | \n",
- " 26-67-j19-1 | \n",
- " 8591341 | \n",
- " 46.TA.26-67-j19-1.1.R | \n",
- " 05:33:00 | \n",
- " 05:33:00 | \n",
- " 1 | \n",
- " 1 | \n",
- " Zürich, Schmiede Wiedikon | \n",
- " Bus | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 2 | \n",
- " 26-325-j19-1 | \n",
- " 8587020:0:D | \n",
- " 265.TA.26-325-j19-1.2.H | \n",
- " 05:34:00 | \n",
- " 05:34:00 | \n",
- " 1 | \n",
- " 0 | \n",
- " Dietikon, Bahnhof | \n",
- " Bus | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 3 | \n",
- " 26-11-A-j19-1 | \n",
- " 8591382 | \n",
- " 1266.TA.26-11-A-j19-1.21.H | \n",
- " 05:37:00 | \n",
- " 05:37:00 | \n",
- " 1 | \n",
- " 0 | \n",
- " Zürich, Sternen Oerlikon | \n",
- " Tram | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 4 | \n",
- " 26-302-j19-1 | \n",
- " 8590844 | \n",
- " 162.TA.26-302-j19-1.4.R | \n",
- " 05:49:00 | \n",
- " 05:49:00 | \n",
- " 1 | \n",
- " 1 | \n",
- " Urdorf, Oberurdorf | \n",
- " Bus | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Unnamed: 0 route_id stop_id trip_id \\\n",
- "0 0 26-759-j19-1 8573205:0:K 1330.TA.26-759-j19-1.7.R \n",
- "1 1 26-67-j19-1 8591341 46.TA.26-67-j19-1.1.R \n",
- "2 2 26-325-j19-1 8587020:0:D 265.TA.26-325-j19-1.2.H \n",
- "3 3 26-11-A-j19-1 8591382 1266.TA.26-11-A-j19-1.21.H \n",
- "4 4 26-302-j19-1 8590844 162.TA.26-302-j19-1.4.R \n",
- "\n",
- " arrival_time departure_time stop_sequence direction_id \\\n",
- "0 05:28:00 05:28:00 1 1 \n",
- "1 05:33:00 05:33:00 1 1 \n",
- "2 05:34:00 05:34:00 1 0 \n",
- "3 05:37:00 05:37:00 1 0 \n",
- "4 05:49:00 05:49:00 1 1 \n",
- "\n",
- " stop_name route_desc \n",
- "0 Zürich Flughafen, Bahnhof Bus \n",
- "1 Zürich, Schmiede Wiedikon Bus \n",
- "2 Dietikon, Bahnhof Bus \n",
- "3 Zürich, Sternen Oerlikon Tram \n",
- "4 Urdorf, Oberurdorf Bus "
- ]
- },
- "execution_count": 59,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#stop_times\n",
- "stop_times_curated = pd.read_csv(\"../data/stop_times_curated.csv\")\n",
- "stop_times_curated.head(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We drop columns not useful to us"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 60,
- "metadata": {},
- "outputs": [],
- "source": [
- "stop_times_curated = stop_times_curated.drop(columns=[\"Unnamed: 0\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Unnamed: 0 | \n",
- " stop_id | \n",
- " stop_id2 | \n",
- " distance | \n",
- " Transfer_time_sec | \n",
- " stop_name | \n",
- " stop_name2 | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 0 | \n",
- " 8500926 | \n",
- " 8590616 | \n",
- " 0.122430 | \n",
- " 146 | \n",
- " Oetwil a.d.L., Schweizäcker | \n",
- " Geroldswil, Schweizäcker | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 1 | \n",
- " 8500926 | \n",
- " 8590737 | \n",
- " 0.300175 | \n",
- " 360 | \n",
- " Oetwil a.d.L., Schweizäcker | \n",
- " Oetwil an der Limmat, Halde | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 2 | \n",
- " 8502186 | \n",
- " 8502186:0:1 | \n",
- " 0.006762 | \n",
- " 8 | \n",
- " Dietikon Stoffelbach | \n",
- " Dietikon Stoffelbach | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 3 | \n",
- " 8502186 | \n",
- " 8502186:0:2 | \n",
- " 0.013524 | \n",
- " 16 | \n",
- " Dietikon Stoffelbach | \n",
- " Dietikon Stoffelbach | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 4 | \n",
- " 8502186 | \n",
- " 8502186P | \n",
- " 0.000000 | \n",
- " 0 | \n",
- " Dietikon Stoffelbach | \n",
- " Dietikon Stoffelbach | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Unnamed: 0 stop_id stop_id2 distance Transfer_time_sec \\\n",
- "0 0 8500926 8590616 0.122430 146 \n",
- "1 1 8500926 8590737 0.300175 360 \n",
- "2 2 8502186 8502186:0:1 0.006762 8 \n",
- "3 3 8502186 8502186:0:2 0.013524 16 \n",
- "4 4 8502186 8502186P 0.000000 0 \n",
- "\n",
- " stop_name stop_name2 \n",
- "0 Oetwil a.d.L., Schweizäcker Geroldswil, Schweizäcker \n",
- "1 Oetwil a.d.L., Schweizäcker Oetwil an der Limmat, Halde \n",
- "2 Dietikon Stoffelbach Dietikon Stoffelbach \n",
- "3 Dietikon Stoffelbach Dietikon Stoffelbach \n",
- "4 Dietikon Stoffelbach Dietikon Stoffelbach "
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#transfers\n",
- "transfers = pd.read_csv(\"../data/transfers.csv\")\n",
- "transfers.head(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Create stop_id same for all platforms\n",
- "In the algorithm we make the simplifying assumptions that each time there is a change is the same station there is a 2 min change time. Due to this assumptions we can keep only the parent station name\n",
- "The parent id is contained in the first 7 characters, so we can take the substring to create the parent stop_id"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 62,
- "metadata": {},
- "outputs": [],
- "source": [
- "#copy information stop_id with platform in stop_id_raw\n",
- "stop_times_curated[\"stop_id_raw\"] = stop_times_curated[\"stop_id\"]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 63,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " route_id | \n",
- " stop_id | \n",
- " trip_id | \n",
- " arrival_time | \n",
- " departure_time | \n",
- " stop_sequence | \n",
- " direction_id | \n",
- " stop_name | \n",
- " route_desc | \n",
- " stop_id_raw | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 26-759-j19-1 | \n",
- " 8573205 | \n",
- " 1330.TA.26-759-j19-1.7.R | \n",
- " 05:28:00 | \n",
- " 05:28:00 | \n",
- " 1 | \n",
- " 1 | \n",
- " Zürich Flughafen, Bahnhof | \n",
- " Bus | \n",
- " 8573205:0:K | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 26-67-j19-1 | \n",
- " 8591341 | \n",
- " 46.TA.26-67-j19-1.1.R | \n",
- " 05:33:00 | \n",
- " 05:33:00 | \n",
- " 1 | \n",
- " 1 | \n",
- " Zürich, Schmiede Wiedikon | \n",
- " Bus | \n",
- " 8591341 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 26-325-j19-1 | \n",
- " 8587020 | \n",
- " 265.TA.26-325-j19-1.2.H | \n",
- " 05:34:00 | \n",
- " 05:34:00 | \n",
- " 1 | \n",
- " 0 | \n",
- " Dietikon, Bahnhof | \n",
- " Bus | \n",
- " 8587020:0:D | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 26-11-A-j19-1 | \n",
- " 8591382 | \n",
- " 1266.TA.26-11-A-j19-1.21.H | \n",
- " 05:37:00 | \n",
- " 05:37:00 | \n",
- " 1 | \n",
- " 0 | \n",
- " Zürich, Sternen Oerlikon | \n",
- " Tram | \n",
- " 8591382 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 26-302-j19-1 | \n",
- " 8590844 | \n",
- " 162.TA.26-302-j19-1.4.R | \n",
- " 05:49:00 | \n",
- " 05:49:00 | \n",
- " 1 | \n",
- " 1 | \n",
- " Urdorf, Oberurdorf | \n",
- " Bus | \n",
- " 8590844 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " route_id stop_id trip_id arrival_time \\\n",
- "0 26-759-j19-1 8573205 1330.TA.26-759-j19-1.7.R 05:28:00 \n",
- "1 26-67-j19-1 8591341 46.TA.26-67-j19-1.1.R 05:33:00 \n",
- "2 26-325-j19-1 8587020 265.TA.26-325-j19-1.2.H 05:34:00 \n",
- "3 26-11-A-j19-1 8591382 1266.TA.26-11-A-j19-1.21.H 05:37:00 \n",
- "4 26-302-j19-1 8590844 162.TA.26-302-j19-1.4.R 05:49:00 \n",
- "\n",
- " departure_time stop_sequence direction_id stop_name \\\n",
- "0 05:28:00 1 1 Zürich Flughafen, Bahnhof \n",
- "1 05:33:00 1 1 Zürich, Schmiede Wiedikon \n",
- "2 05:34:00 1 0 Dietikon, Bahnhof \n",
- "3 05:37:00 1 0 Zürich, Sternen Oerlikon \n",
- "4 05:49:00 1 1 Urdorf, Oberurdorf \n",
- "\n",
- " route_desc stop_id_raw \n",
- "0 Bus 8573205:0:K \n",
- "1 Bus 8591341 \n",
- "2 Bus 8587020:0:D \n",
- "3 Tram 8591382 \n",
- "4 Bus 8590844 "
- ]
- },
- "execution_count": 63,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#Use only first 7 characters for stop_id\n",
- "stop_times_curated[\"stop_id\"] = stop_times_curated[\"stop_id_raw\"].str.slice(0, 7)\n",
- "stop_times_curated[\"stop_id\"] = pd.to_numeric(stop_times_curated[\"stop_id\"])\n",
- "stop_times_curated.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 64,
- "metadata": {},
- "outputs": [],
- "source": [
- "#copy information stop_id with platform in stop_id_raw\n",
- "transfers[\"stop_id_raw\"] = transfers[\"stop_id\"]\n",
- "transfers[\"stop_id2_raw\"] = transfers[\"stop_id2\"]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We do the operation also on the transfers dataset"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 65,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Unnamed: 0 | \n",
- " stop_id | \n",
- " stop_id2 | \n",
- " distance | \n",
- " Transfer_time_sec | \n",
- " stop_name | \n",
- " stop_name2 | \n",
- " stop_id_raw | \n",
- " stop_id2_raw | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 0 | \n",
- " 8500926 | \n",
- " 8590616 | \n",
- " 0.122430 | \n",
- " 146 | \n",
- " Oetwil a.d.L., Schweizäcker | \n",
- " Geroldswil, Schweizäcker | \n",
- " 8500926 | \n",
- " 8590616 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 1 | \n",
- " 8500926 | \n",
- " 8590737 | \n",
- " 0.300175 | \n",
- " 360 | \n",
- " Oetwil a.d.L., Schweizäcker | \n",
- " Oetwil an der Limmat, Halde | \n",
- " 8500926 | \n",
- " 8590737 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 2 | \n",
- " 8502186 | \n",
- " 8502186 | \n",
- " 0.006762 | \n",
- " 8 | \n",
- " Dietikon Stoffelbach | \n",
- " Dietikon Stoffelbach | \n",
- " 8502186 | \n",
- " 8502186:0:1 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 3 | \n",
- " 8502186 | \n",
- " 8502186 | \n",
- " 0.013524 | \n",
- " 16 | \n",
- " Dietikon Stoffelbach | \n",
- " Dietikon Stoffelbach | \n",
- " 8502186 | \n",
- " 8502186:0:2 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 4 | \n",
- " 8502186 | \n",
- " 8502186 | \n",
- " 0.000000 | \n",
- " 0 | \n",
- " Dietikon Stoffelbach | \n",
- " Dietikon Stoffelbach | \n",
- " 8502186 | \n",
- " 8502186P | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Unnamed: 0 stop_id stop_id2 distance Transfer_time_sec \\\n",
- "0 0 8500926 8590616 0.122430 146 \n",
- "1 1 8500926 8590737 0.300175 360 \n",
- "2 2 8502186 8502186 0.006762 8 \n",
- "3 3 8502186 8502186 0.013524 16 \n",
- "4 4 8502186 8502186 0.000000 0 \n",
- "\n",
- " stop_name stop_name2 stop_id_raw \\\n",
- "0 Oetwil a.d.L., Schweizäcker Geroldswil, Schweizäcker 8500926 \n",
- "1 Oetwil a.d.L., Schweizäcker Oetwil an der Limmat, Halde 8500926 \n",
- "2 Dietikon Stoffelbach Dietikon Stoffelbach 8502186 \n",
- "3 Dietikon Stoffelbach Dietikon Stoffelbach 8502186 \n",
- "4 Dietikon Stoffelbach Dietikon Stoffelbach 8502186 \n",
- "\n",
- " stop_id2_raw \n",
- "0 8590616 \n",
- "1 8590737 \n",
- "2 8502186:0:1 \n",
- "3 8502186:0:2 \n",
- "4 8502186P "
- ]
- },
- "execution_count": 65,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#Use only first 7 characters for stop_id\n",
- "transfers[\"stop_id\"] = transfers[\"stop_id_raw\"].str.slice(0, 7)\n",
- "transfers[\"stop_id2\"] = transfers[\"stop_id2_raw\"].str.slice(0, 7)\n",
- "transfers[\"stop_id\"] = pd.to_numeric(transfers[\"stop_id\"])\n",
- "transfers[\"stop_id2\"] = pd.to_numeric(transfers[\"stop_id2\"])\n",
- "transfers.head(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Keep only trips during the day\n",
- "Our model will only consider trips during business days and normal hours, so we can delete all departures before 7 am and after 7 pm"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We can get the hour of departure using str.slice , and explore the hours we have in the dataset. Then we convert these hours in integers in order to filter."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 66,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array(['05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15',\n",
- " '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '00',\n",
- " '04', '01'], dtype=object)"
- ]
- },
- "execution_count": 66,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "stop_times_curated.departure_time.str.slice(0,2).unique()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 67,
- "metadata": {},
- "outputs": [],
- "source": [
- "stop_times_curated[\"hour_departure\"] = pd.to_numeric(stop_times_curated.departure_time.str.slice(0,2))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Check if well converted to int"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 68,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([ 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,\n",
- " 22, 23, 24, 25, 0, 4, 1])"
- ]
- },
- "execution_count": 68,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "stop_times_curated[\"hour_departure\"].unique()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We drop the trips before 7 am and after 7 pm using np.where function"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 69,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "331751"
- ]
- },
- "execution_count": 69,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "stop_times_curated.trip_id.count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 70,
- "metadata": {},
- "outputs": [],
- "source": [
- "trip_id_drop = np.where(((stop_times_curated.hour_departure > 19) |\\\n",
- " (stop_times_curated.hour_departure < 7)),\\\n",
- " stop_times_curated[\"trip_id\"] , None)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 71,
- "metadata": {},
- "outputs": [],
- "source": [
- "stop_times_curated = stop_times_curated[~stop_times_curated[\"trip_id\"].isin(trip_id_drop)]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 72,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "246576"
- ]
- },
- "execution_count": 72,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "stop_times_curated.trip_id.count()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "With this operation we have decreased the size of stop_times by about 90k lines"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Delete trips with 1 stop\n",
- "Trips with only 1 stop are useless in our dataset and will only pollute the algorithm. For this reason we dete these"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We start by counting the stops of each trip"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 73,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " route_id | \n",
- " stop_id | \n",
- " trip_id | \n",
- " arrival_time | \n",
- " departure_time | \n",
- " stop_sequence | \n",
- " direction_id | \n",
- " stop_name | \n",
- " route_desc | \n",
- " stop_id_raw | \n",
- " hour_departure | \n",
- "
\n",
- " \n",
- " trip_id | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 1.TA.1-231-j19-1.1.H | \n",
- " 1 | \n",
- " 15 | \n",
- " 1 | \n",
- " 17 | \n",
- " 17 | \n",
- " 18 | \n",
- " 1 | \n",
- " 15 | \n",
- " 1 | \n",
- " 15 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " 1.TA.1-44-j19-1.1.R | \n",
- " 1 | \n",
- " 3 | \n",
- " 1 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 1 | \n",
- " 3 | \n",
- " 1 | \n",
- " 3 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " 1.TA.1-444-j19-1.1.H | \n",
- " 1 | \n",
- " 9 | \n",
- " 1 | \n",
- " 9 | \n",
- " 9 | \n",
- " 9 | \n",
- " 1 | \n",
- " 9 | \n",
- " 1 | \n",
- " 9 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " 1.TA.12-E03-j19-1.1.H | \n",
- " 1 | \n",
- " 2 | \n",
- " 1 | \n",
- " 2 | \n",
- " 2 | \n",
- " 2 | \n",
- " 1 | \n",
- " 2 | \n",
- " 1 | \n",
- " 2 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " 1.TA.18-46-j19-1.1.H | \n",
- " 1 | \n",
- " 1 | \n",
- " 1 | \n",
- " 1 | \n",
- " 1 | \n",
- " 1 | \n",
- " 1 | \n",
- " 1 | \n",
- " 1 | \n",
- " 1 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " route_id stop_id trip_id arrival_time \\\n",
- "trip_id \n",
- "1.TA.1-231-j19-1.1.H 1 15 1 17 \n",
- "1.TA.1-44-j19-1.1.R 1 3 1 3 \n",
- "1.TA.1-444-j19-1.1.H 1 9 1 9 \n",
- "1.TA.12-E03-j19-1.1.H 1 2 1 2 \n",
- "1.TA.18-46-j19-1.1.H 1 1 1 1 \n",
- "\n",
- " departure_time stop_sequence direction_id stop_name \\\n",
- "trip_id \n",
- "1.TA.1-231-j19-1.1.H 17 18 1 15 \n",
- "1.TA.1-44-j19-1.1.R 3 3 1 3 \n",
- "1.TA.1-444-j19-1.1.H 9 9 1 9 \n",
- "1.TA.12-E03-j19-1.1.H 2 2 1 2 \n",
- "1.TA.18-46-j19-1.1.H 1 1 1 1 \n",
- "\n",
- " route_desc stop_id_raw hour_departure \n",
- "trip_id \n",
- "1.TA.1-231-j19-1.1.H 1 15 2 \n",
- "1.TA.1-44-j19-1.1.R 1 3 1 \n",
- "1.TA.1-444-j19-1.1.H 1 9 1 \n",
- "1.TA.12-E03-j19-1.1.H 1 2 2 \n",
- "1.TA.18-46-j19-1.1.H 1 1 1 "
- ]
- },
- "execution_count": 73,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "number_stop = stop_times_curated.groupby('trip_id').nunique()\n",
- "number_stop.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 74,
- "metadata": {},
- "outputs": [],
- "source": [
- "#get trips with 1 stop\n",
- "trip_with_1_stop = np.where((number_stop.stop_id == 1), number_stop.index, None)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Check number of trips before cleaning"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 75,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "20261"
- ]
- },
- "execution_count": 75,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "stop_times_curated.trip_id.nunique()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We drop the rows with a unique stop per trip"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 76,
- "metadata": {},
- "outputs": [],
- "source": [
- "#drop trips with only 1 stop\n",
- "stop_times_curated = stop_times_curated[~stop_times_curated[\"trip_id\"].isin(trip_with_1_stop)]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "And we check how many trips there still. About 900 trips with only 1 stop have been deleted"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 77,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "19390"
- ]
- },
- "execution_count": 77,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "stop_times_curated.trip_id.nunique()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Create route_int, trip_int and stop_int as consecutive integer IDs\n",
- "This operation is needed for sorting the routes, trips and stops in the right order. Additionally integers are lighter than strings so the algorithm will need fewer memory to work with the arrays."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Route_int\n",
- "The route_int Id is given in an abitrary order"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We start creating a tuple with all the stops in a trip"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 78,
- "metadata": {},
- "outputs": [],
- "source": [
- "stop_times_curated = stop_times_curated.sort_values([\"trip_id\", \"stop_sequence\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 79,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " stop_id | \n",
- "
\n",
- " \n",
- " trip_id | \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 1.TA.1-231-j19-1.1.H | \n",
- " (8572747, 8582462, 8572600, 8572601, 8502553, ... | \n",
- "
\n",
- " \n",
- " 1.TA.1-44-j19-1.1.R | \n",
- " (8590275, 8591891, 8590279) | \n",
- "
\n",
- " \n",
- " 1.TA.1-444-j19-1.1.H | \n",
- " (8572747, 8580847, 8581346, 8502894, 8502979, ... | \n",
- "
\n",
- " \n",
- " 1.TA.12-E03-j19-1.1.H | \n",
- " (8573205, 8596126) | \n",
- "
\n",
- " \n",
- " 1.TA.21-23-j19-1.1.R | \n",
- " (8503000, 8503003) | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " stop_id\n",
- "trip_id \n",
- "1.TA.1-231-j19-1.1.H (8572747, 8582462, 8572600, 8572601, 8502553, ...\n",
- "1.TA.1-44-j19-1.1.R (8590275, 8591891, 8590279)\n",
- "1.TA.1-444-j19-1.1.H (8572747, 8580847, 8581346, 8502894, 8502979, ...\n",
- "1.TA.12-E03-j19-1.1.H (8573205, 8596126)\n",
- "1.TA.21-23-j19-1.1.R (8503000, 8503003)"
- ]
- },
- "execution_count": 79,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#group stops into a sequence\n",
- "tuple_stops = stop_times_curated.groupby('trip_id')['stop_id'].apply(tuple).to_frame()\n",
- "tuple_stops.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 80,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "19390"
- ]
- },
- "execution_count": 80,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "tuple_stops.index.nunique()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "And we can group all these sequences in unique groups"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 81,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- "
\n",
- " \n",
- " stop_id | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " (8502208, 8502209, 8503201, 8503010, 8503011, 8503000, 8503006, 8503016) | \n",
- "
\n",
- " \n",
- " (8502208, 8502209, 8503201, 8503200, 8503010, 8503011, 8503016) | \n",
- "
\n",
- " \n",
- " (8502208, 8502209, 8503202) | \n",
- "
\n",
- " \n",
- " (8502208, 8502209, 8503202, 8503009, 8503010, 8503011, 8503000, 8503006, 8503016, 8503307) | \n",
- "
\n",
- " \n",
- " (8502208, 8502209, 8503202, 8503200, 8503009, 8503000, 8503015, 8503016, 8503307, 8503305) | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- "Empty DataFrame\n",
- "Columns: []\n",
- "Index: [(8502208, 8502209, 8503201, 8503010, 8503011, 8503000, 8503006, 8503016), (8502208, 8502209, 8503201, 8503200, 8503010, 8503011, 8503016), (8502208, 8502209, 8503202), (8502208, 8502209, 8503202, 8503009, 8503010, 8503011, 8503000, 8503006, 8503016, 8503307), (8502208, 8502209, 8503202, 8503200, 8503009, 8503000, 8503015, 8503016, 8503307, 8503305)]"
- ]
- },
- "execution_count": 81,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#group to get unique stop sequences\n",
- "unique_stop_sequence = tuple_stops.groupby(\"stop_id\").count()\n",
- "unique_stop_sequence.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 82,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "2555"
- ]
- },
- "execution_count": 82,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "unique_stop_sequence.index.nunique()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "These unique sequences of stops are our routes. We can create a unique ID, an integer, for each route"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 83,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " stop_id | \n",
- " route_int | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " (8502208, 8502209, 8503201, 8503010, 8503011, ... | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " (8502208, 8502209, 8503201, 8503200, 8503010, ... | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " (8502208, 8502209, 8503202) | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " (8502208, 8502209, 8503202, 8503009, 8503010, ... | \n",
- " 3 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " (8502208, 8502209, 8503202, 8503200, 8503009, ... | \n",
- " 4 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " stop_id route_int\n",
- "0 (8502208, 8502209, 8503201, 8503010, 8503011, ... 0\n",
- "1 (8502208, 8502209, 8503201, 8503200, 8503010, ... 1\n",
- "2 (8502208, 8502209, 8503202) 2\n",
- "3 (8502208, 8502209, 8503202, 8503009, 8503010, ... 3\n",
- "4 (8502208, 8502209, 8503202, 8503200, 8503009, ... 4"
- ]
- },
- "execution_count": 83,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#create dataframe and route_int\n",
- "df_unique_stop_sequence = unique_stop_sequence.reset_index()\n",
- "df_unique_stop_sequence[\"route_int\"] = df_unique_stop_sequence.index\n",
- "df_unique_stop_sequence.head(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We add the route information to the trip"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 84,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " stop_id | \n",
- " route_int | \n",
- "
\n",
- " \n",
- " trip_id | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 403.TA.26-24-j19-1.220.R | \n",
- " (8502208, 8502209, 8503201, 8503010, 8503011, ... | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " 425.TA.26-24-j19-1.220.R | \n",
- " (8502208, 8502209, 8503201, 8503200, 8503010, ... | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " 22.TA.30-57-Y-j19-1.1.H | \n",
- " (8502208, 8502209, 8503202) | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " 11.TA.30-57-Y-j19-1.1.H | \n",
- " (8502208, 8502209, 8503202) | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " 14.TA.30-57-Y-j19-1.1.H | \n",
- " (8502208, 8502209, 8503202) | \n",
- " 2 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " stop_id \\\n",
- "trip_id \n",
- "403.TA.26-24-j19-1.220.R (8502208, 8502209, 8503201, 8503010, 8503011, ... \n",
- "425.TA.26-24-j19-1.220.R (8502208, 8502209, 8503201, 8503200, 8503010, ... \n",
- "22.TA.30-57-Y-j19-1.1.H (8502208, 8502209, 8503202) \n",
- "11.TA.30-57-Y-j19-1.1.H (8502208, 8502209, 8503202) \n",
- "14.TA.30-57-Y-j19-1.1.H (8502208, 8502209, 8503202) \n",
- "\n",
- " route_int \n",
- "trip_id \n",
- "403.TA.26-24-j19-1.220.R 0 \n",
- "425.TA.26-24-j19-1.220.R 1 \n",
- "22.TA.30-57-Y-j19-1.1.H 2 \n",
- "11.TA.30-57-Y-j19-1.1.H 2 \n",
- "14.TA.30-57-Y-j19-1.1.H 2 "
- ]
- },
- "execution_count": 84,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#join with trip information\n",
- "trip_with_routes = tuple_stops.join(df_unique_stop_sequence.set_index(\"stop_id\"), on=\"stop_id\", how=\"left\").sort_values(\"route_int\")\n",
- "trip_with_routes.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 85,
- "metadata": {},
- "outputs": [],
- "source": [
- "trip_with_routes = trip_with_routes.rename(columns={\"stop_id\" : \"all_stops\"})"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Check if wrong manipulations cause to have the same, or higher, number of routes than trips. It is not the case"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 86,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "19390"
- ]
- },
- "execution_count": 86,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#check if routes and trips do not have the same number\n",
- "trip_with_routes.index.nunique()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 87,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "2555"
- ]
- },
- "execution_count": 87,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "trip_with_routes.route_int.nunique()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We add the rout_int column to stop_times dataframe"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 88,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "245705"
- ]
- },
- "execution_count": 88,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "stop_times_curated.trip_id.count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 89,
- "metadata": {},
- "outputs": [],
- "source": [
- "#join to get route_int in stop_times\n",
- "stop_times_routes = stop_times_curated.join(trip_with_routes, how=\"left\", on=\"trip_id\" , lsuffix='_left', rsuffix='_right').drop_duplicates()\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 90,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "245705"
- ]
- },
- "execution_count": 90,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "stop_times_routes.trip_id.count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 91,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " route_id | \n",
- " stop_id | \n",
- " trip_id | \n",
- " arrival_time | \n",
- " departure_time | \n",
- " stop_sequence | \n",
- " direction_id | \n",
- " stop_name | \n",
- " route_desc | \n",
- " stop_id_raw | \n",
- " hour_departure | \n",
- " all_stops | \n",
- " route_int | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 81914 | \n",
- " 1-231-j19-1 | \n",
- " 8572747 | \n",
- " 1.TA.1-231-j19-1.1.H | \n",
- " 09:37:00 | \n",
- " 09:37:00 | \n",
- " 1 | \n",
- " 0 | \n",
- " Bremgarten AG, Bahnhof | \n",
- " Bus | \n",
- " 8572747 | \n",
- " 9 | \n",
- " (8572747, 8582462, 8572600, 8572601, 8502553, ... | \n",
- " 618 | \n",
- "
\n",
- " \n",
- " 181281 | \n",
- " 1-231-j19-1 | \n",
- " 8582462 | \n",
- " 1.TA.1-231-j19-1.1.H | \n",
- " 09:38:00 | \n",
- " 09:38:00 | \n",
- " 3 | \n",
- " 0 | \n",
- " Bremgarten AG, Zelgli | \n",
- " Bus | \n",
- " 8582462 | \n",
- " 9 | \n",
- " (8572747, 8582462, 8572600, 8572601, 8502553, ... | \n",
- " 618 | \n",
- "
\n",
- " \n",
- " 42460 | \n",
- " 1-231-j19-1 | \n",
- " 8572600 | \n",
- " 1.TA.1-231-j19-1.1.H | \n",
- " 09:39:00 | \n",
- " 09:39:00 | \n",
- " 4 | \n",
- " 0 | \n",
- " Zufikon, Emaus | \n",
- " Bus | \n",
- " 8572600 | \n",
- " 9 | \n",
- " (8572747, 8582462, 8572600, 8572601, 8502553, ... | \n",
- " 618 | \n",
- "
\n",
- " \n",
- " 224454 | \n",
- " 1-231-j19-1 | \n",
- " 8572601 | \n",
- " 1.TA.1-231-j19-1.1.H | \n",
- " 09:39:00 | \n",
- " 09:39:00 | \n",
- " 5 | \n",
- " 0 | \n",
- " Zufikon, Algier | \n",
- " Bus | \n",
- " 8572601 | \n",
- " 9 | \n",
- " (8572747, 8582462, 8572600, 8572601, 8502553, ... | \n",
- " 618 | \n",
- "
\n",
- " \n",
- " 11836 | \n",
- " 1-231-j19-1 | \n",
- " 8502553 | \n",
- " 1.TA.1-231-j19-1.1.H | \n",
- " 09:43:00 | \n",
- " 09:43:00 | \n",
- " 6 | \n",
- " 0 | \n",
- " Unterlunkhofen, Breitenäcker | \n",
- " Bus | \n",
- " 8502553 | \n",
- " 9 | \n",
- " (8572747, 8582462, 8572600, 8572601, 8502553, ... | \n",
- " 618 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " route_id stop_id trip_id arrival_time \\\n",
- "81914 1-231-j19-1 8572747 1.TA.1-231-j19-1.1.H 09:37:00 \n",
- "181281 1-231-j19-1 8582462 1.TA.1-231-j19-1.1.H 09:38:00 \n",
- "42460 1-231-j19-1 8572600 1.TA.1-231-j19-1.1.H 09:39:00 \n",
- "224454 1-231-j19-1 8572601 1.TA.1-231-j19-1.1.H 09:39:00 \n",
- "11836 1-231-j19-1 8502553 1.TA.1-231-j19-1.1.H 09:43:00 \n",
- "\n",
- " departure_time stop_sequence direction_id \\\n",
- "81914 09:37:00 1 0 \n",
- "181281 09:38:00 3 0 \n",
- "42460 09:39:00 4 0 \n",
- "224454 09:39:00 5 0 \n",
- "11836 09:43:00 6 0 \n",
- "\n",
- " stop_name route_desc stop_id_raw hour_departure \\\n",
- "81914 Bremgarten AG, Bahnhof Bus 8572747 9 \n",
- "181281 Bremgarten AG, Zelgli Bus 8582462 9 \n",
- "42460 Zufikon, Emaus Bus 8572600 9 \n",
- "224454 Zufikon, Algier Bus 8572601 9 \n",
- "11836 Unterlunkhofen, Breitenäcker Bus 8502553 9 \n",
- "\n",
- " all_stops route_int \n",
- "81914 (8572747, 8582462, 8572600, 8572601, 8502553, ... 618 \n",
- "181281 (8572747, 8582462, 8572600, 8572601, 8502553, ... 618 \n",
- "42460 (8572747, 8582462, 8572600, 8572601, 8502553, ... 618 \n",
- "224454 (8572747, 8582462, 8572600, 8572601, 8502553, ... 618 \n",
- "11836 (8572747, 8582462, 8572600, 8572601, 8502553, ... 618 "
- ]
- },
- "execution_count": 91,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "stop_times_routes.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 92,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "2554"
- ]
- },
- "execution_count": 92,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#check if route_int is correct\n",
- "stop_times_routes.route_int.max()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Trip_int\n",
- "The trip_int number needs to be ordered by route_int and time"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 93,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "19390"
- ]
- },
- "execution_count": 93,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#check number trips in stop_times\n",
- "stop_times_routes.trip_id.nunique()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 94,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " route_id | \n",
- " stop_id | \n",
- " trip_id | \n",
- " arrival_time | \n",
- " departure_time | \n",
- " stop_sequence | \n",
- " direction_id | \n",
- " stop_name | \n",
- " route_desc | \n",
- " stop_id_raw | \n",
- " hour_departure | \n",
- " all_stops | \n",
- " route_int | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 181290 | \n",
- " 26-24-j19-1 | \n",
- " 8502208 | \n",
- " 403.TA.26-24-j19-1.220.R | \n",
- " 10:44:00 | \n",
- " 10:45:00 | \n",
- " 3 | \n",
- " 1 | \n",
- " Horgen Oberdorf | \n",
- " S-Bahn | \n",
- " 8502208:0:4 | \n",
- " 10 | \n",
- " (8502208, 8502209, 8503201, 8503010, 8503011, ... | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " 261974 | \n",
- " 26-24-j19-1 | \n",
- " 8502209 | \n",
- " 403.TA.26-24-j19-1.220.R | \n",
- " 10:47:00 | \n",
- " 10:47:00 | \n",
- " 4 | \n",
- " 1 | \n",
- " Oberrieden Dorf | \n",
- " S-Bahn | \n",
- " 8502209:0:1 | \n",
- " 10 | \n",
- " (8502208, 8502209, 8503201, 8503010, 8503011, ... | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " 130162 | \n",
- " 26-24-j19-1 | \n",
- " 8503201 | \n",
- " 403.TA.26-24-j19-1.220.R | \n",
- " 10:53:00 | \n",
- " 10:53:00 | \n",
- " 6 | \n",
- " 1 | \n",
- " Rüschlikon | \n",
- " S-Bahn | \n",
- " 8503201:0:2 | \n",
- " 10 | \n",
- " (8502208, 8502209, 8503201, 8503010, 8503011, ... | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " 173670 | \n",
- " 26-24-j19-1 | \n",
- " 8503010 | \n",
- " 403.TA.26-24-j19-1.220.R | \n",
- " 11:02:00 | \n",
- " 11:03:00 | \n",
- " 9 | \n",
- " 1 | \n",
- " Zürich Enge | \n",
- " S-Bahn | \n",
- " 8503010:0:2 | \n",
- " 11 | \n",
- " (8502208, 8502209, 8503201, 8503010, 8503011, ... | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " 238129 | \n",
- " 26-24-j19-1 | \n",
- " 8503011 | \n",
- " 403.TA.26-24-j19-1.220.R | \n",
- " 11:04:00 | \n",
- " 11:04:00 | \n",
- " 10 | \n",
- " 1 | \n",
- " Zürich Wiedikon | \n",
- " S-Bahn | \n",
- " 8503011:0:2 | \n",
- " 11 | \n",
- " (8502208, 8502209, 8503201, 8503010, 8503011, ... | \n",
- " 0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " route_id stop_id trip_id arrival_time \\\n",
- "181290 26-24-j19-1 8502208 403.TA.26-24-j19-1.220.R 10:44:00 \n",
- "261974 26-24-j19-1 8502209 403.TA.26-24-j19-1.220.R 10:47:00 \n",
- "130162 26-24-j19-1 8503201 403.TA.26-24-j19-1.220.R 10:53:00 \n",
- "173670 26-24-j19-1 8503010 403.TA.26-24-j19-1.220.R 11:02:00 \n",
- "238129 26-24-j19-1 8503011 403.TA.26-24-j19-1.220.R 11:04:00 \n",
- "\n",
- " departure_time stop_sequence direction_id stop_name \\\n",
- "181290 10:45:00 3 1 Horgen Oberdorf \n",
- "261974 10:47:00 4 1 Oberrieden Dorf \n",
- "130162 10:53:00 6 1 Rüschlikon \n",
- "173670 11:03:00 9 1 Zürich Enge \n",
- "238129 11:04:00 10 1 Zürich Wiedikon \n",
- "\n",
- " route_desc stop_id_raw hour_departure \\\n",
- "181290 S-Bahn 8502208:0:4 10 \n",
- "261974 S-Bahn 8502209:0:1 10 \n",
- "130162 S-Bahn 8503201:0:2 10 \n",
- "173670 S-Bahn 8503010:0:2 11 \n",
- "238129 S-Bahn 8503011:0:2 11 \n",
- "\n",
- " all_stops route_int \n",
- "181290 (8502208, 8502209, 8503201, 8503010, 8503011, ... 0 \n",
- "261974 (8502208, 8502209, 8503201, 8503010, 8503011, ... 0 \n",
- "130162 (8502208, 8502209, 8503201, 8503010, 8503011, ... 0 \n",
- "173670 (8502208, 8502209, 8503201, 8503010, 8503011, ... 0 \n",
- "238129 (8502208, 8502209, 8503201, 8503010, 8503011, ... 0 "
- ]
- },
- "execution_count": 94,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "stop_times_routes.sort_values([\"route_int\", \"arrival_time\"]).head(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Generate sequential trip_int, ordered by route and by time"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 95,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " 0 | \n",
- " trip_int | \n",
- " trip_id | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 403.TA.26-24-j19-1.220.R | \n",
- " 0 | \n",
- " 403.TA.26-24-j19-1.220.R | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 425.TA.26-24-j19-1.220.R | \n",
- " 1 | \n",
- " 425.TA.26-24-j19-1.220.R | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 4.TA.30-57-Y-j19-1.1.H | \n",
- " 2 | \n",
- " 4.TA.30-57-Y-j19-1.1.H | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 5.TA.30-57-Y-j19-1.1.H | \n",
- " 3 | \n",
- " 5.TA.30-57-Y-j19-1.1.H | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 6.TA.30-57-Y-j19-1.1.H | \n",
- " 4 | \n",
- " 6.TA.30-57-Y-j19-1.1.H | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " 0 trip_int trip_id\n",
- "0 403.TA.26-24-j19-1.220.R 0 403.TA.26-24-j19-1.220.R\n",
- "1 425.TA.26-24-j19-1.220.R 1 425.TA.26-24-j19-1.220.R\n",
- "2 4.TA.30-57-Y-j19-1.1.H 2 4.TA.30-57-Y-j19-1.1.H\n",
- "3 5.TA.30-57-Y-j19-1.1.H 3 5.TA.30-57-Y-j19-1.1.H\n",
- "4 6.TA.30-57-Y-j19-1.1.H 4 6.TA.30-57-Y-j19-1.1.H"
- ]
- },
- "execution_count": 95,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "trip_df = pd.DataFrame(stop_times_routes.sort_values([\"route_int\", \"arrival_time\"]).trip_id.unique())\n",
- "trip_df[\"trip_int\"] = trip_df.index\n",
- "trip_df[\"trip_id\"] = trip_df.iloc[:,0]\n",
- "trip_df.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 96,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "19390"
- ]
- },
- "execution_count": 96,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#check number trip_id\n",
- "trip_df.trip_id.nunique()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We join trip_id to stop_times dataframe"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#join to get trip_int in stop_times\n",
- "stop_times_routes_trip = stop_times_routes.join(trip_df.set_index(\"trip_id\"), how=\"inner\", on=\"trip_id\" , lsuffix='_left', rsuffix='_right').drop_duplicates()\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#save ordered stop_times\n",
- "stop_times_routes_trip = stop_times_routes_trip.sort_values([\"route_int\", \"trip_int\", \"stop_sequence\"])\n",
- "stop_times_routes_trip.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#check if manipulations did not destroy trips\n",
- "stop_times_routes_trip.trip_id.nunique()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Stop_int\n",
- "Stop_int id needs to ordered by route, trip and stop sequence"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#check number stops at entry\n",
- "stop_times_routes_trip.stop_id.nunique()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "stop_times_routes_trip is already in the right order. We create dataframe to create stop_int"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stops_df = pd.DataFrame(stop_times_routes_trip.stop_id.unique())\n",
- "stops_df[\"stop_int\"] = stops_df.index\n",
- "stops_df[\"stop_id\"] = stops_df.iloc[:,0]\n",
- "stops_df.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#check if number stop_int correct\n",
- "stops_df.stop_int.nunique()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We add stop_int information to stop_times"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#join to get stop_int\n",
- "stop_times_routes_trip_stop = stop_times_routes_trip.join(stops_df.set_index(\"stop_id\"), how=\"inner\", on=\"stop_id\", lsuffix='_left', rsuffix='_right').drop_duplicates()\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stop_times_routes_trip_stop.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#check if no stops deleted during manipulation\n",
- "stop_times_routes_trip_stop.stop_id.nunique()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stop_times_routes_trip_stop.stop_int.max()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#keep only useful columns \n",
- "stop_times_int = stop_times_routes_trip_stop[[\"route_int\", \"trip_int\", \"stop_int\", \"stop_sequence\", \"arrival_time\", \"departure_time\",\\\n",
- " \"route_id\", \"trip_id\", \"stop_id\", \\\n",
- " \"route_desc\", \"stop_id_raw\", \"stop_name\"]].sort_values([\"route_int\", \"trip_int\", \"stop_sequence\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stop_times_int = stop_times_int.reset_index(drop=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stop_times_int.loc[100:150].head(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "An overview of number of routes, trips and stops"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stop_times_int.route_int.nunique()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stop_times_int.trip_int.nunique()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stop_times_int.stop_int.nunique()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stop_times_int.stop_int.count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Transfer: delete transfer to same stop & get stop_int & stop_int2\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "12564"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#check number stops transfers\n",
- "transfers.stop_id.count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Unnamed: 0 | \n",
- " stop_id | \n",
- " stop_id2 | \n",
- " distance | \n",
- " Transfer_time_sec | \n",
- " stop_name | \n",
- " stop_name2 | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 0 | \n",
- " 8500926 | \n",
- " 8590616 | \n",
- " 0.122430 | \n",
- " 146 | \n",
- " Oetwil a.d.L., Schweizäcker | \n",
- " Geroldswil, Schweizäcker | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 1 | \n",
- " 8500926 | \n",
- " 8590737 | \n",
- " 0.300175 | \n",
- " 360 | \n",
- " Oetwil a.d.L., Schweizäcker | \n",
- " Oetwil an der Limmat, Halde | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 2 | \n",
- " 8502186 | \n",
- " 8502186:0:1 | \n",
- " 0.006762 | \n",
- " 8 | \n",
- " Dietikon Stoffelbach | \n",
- " Dietikon Stoffelbach | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 3 | \n",
- " 8502186 | \n",
- " 8502186:0:2 | \n",
- " 0.013524 | \n",
- " 16 | \n",
- " Dietikon Stoffelbach | \n",
- " Dietikon Stoffelbach | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 4 | \n",
- " 8502186 | \n",
- " 8502186P | \n",
- " 0.000000 | \n",
- " 0 | \n",
- " Dietikon Stoffelbach | \n",
- " Dietikon Stoffelbach | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Unnamed: 0 stop_id stop_id2 distance Transfer_time_sec \\\n",
- "0 0 8500926 8590616 0.122430 146 \n",
- "1 1 8500926 8590737 0.300175 360 \n",
- "2 2 8502186 8502186:0:1 0.006762 8 \n",
- "3 3 8502186 8502186:0:2 0.013524 16 \n",
- "4 4 8502186 8502186P 0.000000 0 \n",
- "\n",
- " stop_name stop_name2 \n",
- "0 Oetwil a.d.L., Schweizäcker Geroldswil, Schweizäcker \n",
- "1 Oetwil a.d.L., Schweizäcker Oetwil an der Limmat, Halde \n",
- "2 Dietikon Stoffelbach Dietikon Stoffelbach \n",
- "3 Dietikon Stoffelbach Dietikon Stoffelbach \n",
- "4 Dietikon Stoffelbach Dietikon Stoffelbach "
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "transfers.head(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We delete transfers to the same stop"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [],
- "source": [
- "transfers_df = transfers[transfers['stop_id'] != transfers['stop_id2']]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "12564"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "transfers_df.stop_id.count()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We create the stop_int column in transfers. This action eliminates stops not in stop_times"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [],
- "source": [
- "stop_times_int = stop_times_curated"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "transfers_df = transfers_df.merge(stop_times_int[[\"stop_id\", \"stop_int\"]].set_index(\"stop_id\"), how=\"inner\", on = \"stop_id\").drop_duplicates()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "transfers_df.stop_id.count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "transfers_df.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#create dataframe with stops\n",
- "df_stop_int2 = stop_times_int[[\"stop_id\", \"stop_int\"]].rename(columns={\"stop_id\": \"stop_id2\", \"stop_int\" : \"stop_int_2\"})\n",
- "df_stop_int2.head(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We add the the stop id for the arrival destination, stop_int2"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "transfers_df_int = transfers_df.merge(df_stop_int2.set_index(\"stop_id2\"), how=\"inner\", on = \"stop_id2\").drop_duplicates()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "transfers_df_int.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "transfers_df_int.stop_id.count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "transfers = transfers_df_int"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#check number unique stops2 in transfers\n",
- "transfers.stop_id2.nunique()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "transfers.stop_id.nunique()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Unnamed: 0 | \n",
- " route_id | \n",
- " stop_id_general | \n",
- " trip_id | \n",
- " stop_id | \n",
- " arrival_time | \n",
- " departure_time | \n",
- " stop_sequence | \n",
- " stop_name | \n",
- " stop_lat | \n",
- " stop_lon | \n",
- " trip_headsign | \n",
- " trip_short_name | \n",
- " direction_id | \n",
- " departure_first_stop | \n",
- " route_int | \n",
- " stop_count | \n",
- " stop_int | \n",
- " route_desc | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 0 | \n",
- " 26-66-j19-1 | \n",
- " 8591205 | \n",
- " 17.TA.26-66-j19-1.1.H | \n",
- " 8591205 | \n",
- " 17:00:00 | \n",
- " 17:00:00 | \n",
- " 3 | \n",
- " Zürich, Hürlimannplatz | \n",
- " 47.365066 | \n",
- " 8.526539 | \n",
- " Zürich, Neubühl | \n",
- " 3870 | \n",
- " 0 | \n",
- " 16:55:00 | \n",
- " 1225 | \n",
- " 12 | \n",
- " 1317 | \n",
- " Bus | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 1 | \n",
- " 26-66-j19-1 | \n",
- " 8591415 | \n",
- " 17.TA.26-66-j19-1.1.H | \n",
- " 8591415 | \n",
- " 17:02:00 | \n",
- " 17:02:00 | \n",
- " 4 | \n",
- " Zürich, Waffenplatzstrasse | \n",
- " 47.361482 | \n",
- " 8.525749 | \n",
- " Zürich, Neubühl | \n",
- " 3870 | \n",
- " 0 | \n",
- " 16:55:00 | \n",
- " 1225 | \n",
- " 12 | \n",
- " 1267 | \n",
- " Bus | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 2 | \n",
- " 26-66-j19-1 | \n",
- " 8591204 | \n",
- " 17.TA.26-66-j19-1.1.H | \n",
- " 8591204 | \n",
- " 17:03:00 | \n",
- " 17:03:00 | \n",
- " 5 | \n",
- " Zürich, Hügelstrasse | \n",
- " 47.358543 | \n",
- " 8.526997 | \n",
- " Zürich, Neubühl | \n",
- " 3870 | \n",
- " 0 | \n",
- " 16:55:00 | \n",
- " 1225 | \n",
- " 12 | \n",
- " 67 | \n",
- " Bus | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 3 | \n",
- " 26-66-j19-1 | \n",
- " 8591098 | \n",
- " 17.TA.26-66-j19-1.1.H | \n",
- " 8591098 | \n",
- " 17:04:00 | \n",
- " 17:04:00 | \n",
- " 6 | \n",
- " Zürich, Brunau/Mutschellenstr. | \n",
- " 47.355147 | \n",
- " 8.527141 | \n",
- " Zürich, Neubühl | \n",
- " 3870 | \n",
- " 0 | \n",
- " 16:55:00 | \n",
- " 1225 | \n",
- " 12 | \n",
- " 512 | \n",
- " Bus | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 4 | \n",
- " 26-66-j19-1 | \n",
- " 8591392 | \n",
- " 17.TA.26-66-j19-1.1.H | \n",
- " 8591392 | \n",
- " 17:05:00 | \n",
- " 17:05:00 | \n",
- " 7 | \n",
- " Zürich, Thujastrasse | \n",
- " 47.350187 | \n",
- " 8.527806 | \n",
- " Zürich, Neubühl | \n",
- " 3870 | \n",
- " 0 | \n",
- " 16:55:00 | \n",
- " 1225 | \n",
- " 12 | \n",
- " 403 | \n",
- " Bus | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Unnamed: 0 route_id stop_id_general trip_id stop_id \\\n",
- "0 0 26-66-j19-1 8591205 17.TA.26-66-j19-1.1.H 8591205 \n",
- "1 1 26-66-j19-1 8591415 17.TA.26-66-j19-1.1.H 8591415 \n",
- "2 2 26-66-j19-1 8591204 17.TA.26-66-j19-1.1.H 8591204 \n",
- "3 3 26-66-j19-1 8591098 17.TA.26-66-j19-1.1.H 8591098 \n",
- "4 4 26-66-j19-1 8591392 17.TA.26-66-j19-1.1.H 8591392 \n",
- "\n",
- " arrival_time departure_time stop_sequence stop_name \\\n",
- "0 17:00:00 17:00:00 3 Zürich, Hürlimannplatz \n",
- "1 17:02:00 17:02:00 4 Zürich, Waffenplatzstrasse \n",
- "2 17:03:00 17:03:00 5 Zürich, Hügelstrasse \n",
- "3 17:04:00 17:04:00 6 Zürich, Brunau/Mutschellenstr. \n",
- "4 17:05:00 17:05:00 7 Zürich, Thujastrasse \n",
- "\n",
- " stop_lat stop_lon trip_headsign trip_short_name direction_id \\\n",
- "0 47.365066 8.526539 Zürich, Neubühl 3870 0 \n",
- "1 47.361482 8.525749 Zürich, Neubühl 3870 0 \n",
- "2 47.358543 8.526997 Zürich, Neubühl 3870 0 \n",
- "3 47.355147 8.527141 Zürich, Neubühl 3870 0 \n",
- "4 47.350187 8.527806 Zürich, Neubühl 3870 0 \n",
- "\n",
- " departure_first_stop route_int stop_count stop_int route_desc \n",
- "0 16:55:00 1225 12 1317 Bus \n",
- "1 16:55:00 1225 12 1267 Bus \n",
- "2 16:55:00 1225 12 67 Bus \n",
- "3 16:55:00 1225 12 512 Bus \n",
- "4 16:55:00 1225 12 403 Bus "
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "stop_times_ordered = stop_times_curated\n",
- "stop_times_ordered.head(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We start by making sure the order is correct"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stop_times_ordered = stop_times_int.sort_values(by=[\"route_int\", \"trip_int\", \"stop_sequence\"])\n",
- "stop_times_ordered.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stop_times_ordered[[\"arrival_time\", \"departure_time\"]].head(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We add None to first arrival time and last departure time."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#adding a shift\n",
- "stop_times_ordered[\"sequence_shift_1\"] = stop_times_ordered[\"stop_sequence\"].shift(-1, fill_value=0)\n",
- "stop_times_ordered.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stop_times_ordered['departure_time'] = np.where((stop_times_ordered[\"stop_sequence\"] > stop_times_ordered[\"sequence_shift_1\"]), None, stop_times_ordered['departure_time'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stop_times_ordered[\"arrival_time\"] = np.where((stop_times_ordered[\"stop_sequence\"] == 1), None, stop_times_ordered['arrival_time'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stop_times_ordered[[\"arrival_time\",\"departure_time\", \"stop_sequence\", \"sequence_shift_1\"]].head(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Array structure preparation"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### StopTimes: \n",
- "[[departure_route0_trip0_stop0, arrival_route0_trip0_stop_0], [departure_route0_trip0_stop1, arrival_route0_trip0_stop_1], …], [[departure_route0_trip1_stop0, arrival_route0_trip1_stop_0], …], ….], [[[departure_route1_trip0_stop0, arrival_route1_trip0_stop_0], …], [[departure_route1_trip1_stop0, arrival_route0_trip1_stop_0], …], ….], …]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We transform it in datetime as required by the raptor algorithm"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stop_times_ordered['arrival_time'] = pd.to_datetime(stop_times_ordered['arrival_time'])\n",
- "stop_times_ordered['departure_time'] = pd.to_datetime(stop_times_ordered['departure_time'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stop_times_ordered[[\"arrival_time\", \"departure_time\"]].head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "with open('../data/stop_times_df.pkl','wb') as f: pickle.dump(stop_times_ordered, f)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stop_times_ordered = stop_times_ordered.sort_values(by=[\"route_int\", \"trip_int\", \"stop_sequence\"])\n",
- "stop_times_ordered.head(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "And we transform it to array, ready ti be used by raptor"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stop_times_array = stop_times_ordered[[\"arrival_time\", \"departure_time\"]].to_numpy()\n",
- "stop_times_array"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "np.size(stop_times_array,0)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "with open('../data/stop_times_array.pkl','wb') as f: pickle.dump(stop_times_array, f)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Routes: \n",
- "[[route0_nr.Trips, route0_nr. Stops, route0_pointerRoutes, route0_pointerStops_times],[route1_nr.Trips, route1_nr. Stops,, route1_pointerRoutes, route1_pointerStops_times],…]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We start by getting the number of trips and stops there are for each route"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "distinct_trips_stops = stop_times_ordered.groupby([\"route_int\"]).nunique()[[\"trip_int\",\"stop_int\"]].sort_index().rename(columns={\"trip_int\": \"n_Trips\", \"stop_int\": \"n_stops\"})\n",
- "distinct_trips_stops.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "distinct_trips_stops.shape"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We create the pointer for the route stops, by adding the unique stops for each route"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "distinct_trips_stops['pointer_routes_stops'] = distinct_trips_stops.n_stops.cumsum().shift(1, fill_value=0)\n",
- "distinct_trips_stops.head(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We create the pointer for stop_times by adding the number of stops in each route, counting duplicates (due to several trips)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "distinct_trips_stops[\"pointer_stop_times\"] = (stop_times_ordered.groupby([\"route_int\"]).count().stop_id).cumsum().shift(1, fill_value=0)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "distinct_trips_stops[\"pointer_routes_stops_shift\"] = distinct_trips_stops['pointer_routes_stops'].shift(-1, fill_value=0)\n",
- "distinct_trips_stops[\"pointer_stop_times_shift\"] = distinct_trips_stops['pointer_stop_times'].shift(-1, fill_value=0)\n",
- "distinct_trips_stops.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "distinct_trips_stops['pointer_routes_stops'] = np.where((distinct_trips_stops[\"pointer_routes_stops\"] == distinct_trips_stops[\"pointer_routes_stops_shift\"]), None, distinct_trips_stops['pointer_routes_stops'])\n",
- "distinct_trips_stops['pointer_stop_times'] = np.where((distinct_trips_stops[\"pointer_stop_times\"] == distinct_trips_stops[\"pointer_stop_times_shift\"]), None, distinct_trips_stops['pointer_stop_times'])\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "distinct_trips_stops.isna().any()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "with open('../data/routes_array_df.pkl','wb') as f: pickle.dump(distinct_trips_stops[['n_Trips', 'n_stops', 'pointer_routes_stops', 'pointer_stop_times']], f)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "distinct_trips_stops.info()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "routes_array = distinct_trips_stops[['n_Trips', 'n_stops', 'pointer_routes_stops', 'pointer_stop_times']].to_numpy()\n",
- "routes_array"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "np.size(routes_array, 0)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "with open('../data/routes_array.pkl','wb') as f: pickle.dump(routes_array, f)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "RouteStops: [route0_stop0, route0_stop1,…, route1_stop0, route1_stop1,…, …]\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "route_stops = stop_times_ordered.sort_values([\"route_int\", \"stop_sequence\"])\n",
- "route_stops = route_stops[['route_int', 'stop_int']].drop_duplicates().reset_index()\n",
- "route_stops.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "route_stops.info()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "route_stops.route_int.nunique()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "with open('../data/route_stops_df.pkl','wb') as f: pickle.dump(route_stops, f)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "route_stops_array = route_stops.stop_int.to_numpy()\n",
- "route_stops_array"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "np.size(np.unique(route_stops_array))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "np.size(route_stops_array, 0)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "route_stops_array.shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "with open('../data/route_stops_array.pkl','wb') as f: pickle.dump(route_stops_array, f)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Check if pointers are correct\n",
- "It is fundamental that the indexes that serve as pointers in Routes are correct"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We start by looking at where the indexes for stop_times and route_stops diverge. This will allow us to check them. We can see that Route stops should have a new route at 3 while stop_times should have it at 78, so we try with that"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "distinct_trips_stops.head(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We can check if the pointer indicates the routes index number. At the pointer_routes should indicate the first stop of a new route. We try with 3 to see if route_stops has a new route at this index. It does so it works"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "route_stops.head(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We go and see if stop_times has a new route at 78. It does, so it works"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stop_times_ordered.loc[75:80].head(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Stops: [[stop0_pointerRoutes, stop0_pointerTransfer], [stop1_pointerRoutes, stop1_pointerTransfer], …]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stops_join = route_stops.join(transfers.set_index(\"stop_int\"), how=\"left\", on=\"stop_int\").drop_duplicates()\n",
- "stops_join.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stops_join.stop_int.nunique()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "distinct_route_transfers = stops_join.sort_values(\"stop_int\").groupby([\"stop_int\"]).nunique().rename(columns={\"route_int\": \"n_Routes\", \"stop_int_2\": \"n_Transfers\"})\n",
- "distinct_route_transfers = distinct_route_transfers[[\"n_Routes\", \"n_Transfers\"]].sort_index()\n",
- "distinct_route_transfers.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "distinct_route_transfers['pointer_stop_routes'] = distinct_route_transfers.n_Routes.cumsum().shift(1, fill_value=0)\n",
- "distinct_route_transfers['pointer_transfers'] = distinct_route_transfers.n_Transfers.cumsum().shift(1, fill_value=0)\n",
- "distinct_route_transfers.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "distinct_route_transfers[\"pointer_stop_routes_shift\"] = distinct_route_transfers['pointer_stop_routes'].shift(-1, fill_value=0)\n",
- "distinct_route_transfers[\"pointer_transfers_shift\"] = distinct_route_transfers['pointer_transfers'].shift(-1, fill_value=0)\n",
- "distinct_route_transfers.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "distinct_route_transfers['pointer_stop_routes'] = np.where((distinct_route_transfers[\"pointer_stop_routes\"] == distinct_route_transfers[\"pointer_stop_routes_shift\"]), None, distinct_route_transfers['pointer_stop_routes'])\n",
- "distinct_route_transfers['pointer_transfers'] = np.where((distinct_route_transfers[\"pointer_transfers\"] == distinct_route_transfers[\"pointer_transfers_shift\"]), None, distinct_route_transfers['pointer_transfers'])\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "distinct_route_transfers.isna().any()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stops_df = distinct_route_transfers[['pointer_stop_routes', 'pointer_transfers']]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "with open('../data/stops_df.pkl','wb') as f: pickle.dump(stops_df, f)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stops_array = stops_df.to_numpy()\n",
- "stops_array"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "np.size(stops_array, 0)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stops_array.shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "with open('../data/stops_array.pkl','wb') as f: pickle.dump(stops_array, f)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "StopRoutes: [stop0_route1, stop0_route3, stop1_route1, stop2_route1, stop1_route4, …]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stop_routes = stop_times_ordered[[\"route_int\", \"stop_int\", \"stop_id\"]].drop_duplicates().sort_values([\"stop_int\", \"route_int\"])\n",
- "stop_routes = stop_routes.reset_index()\n",
- "stop_routes.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stop_routes.shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stop_times_curated.route_id.nunique()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stop_routes.route_int.nunique()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "with open('../data/stop_routes_df.pkl','wb') as f: pickle.dump(stop_routes, f)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stop_routes_array = stop_routes[\"route_int\"].to_numpy()\n",
- "stop_routes_array"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "np.size(stop_routes_array, 0)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stop_routes_array.shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "with open('../data/stop_routes_array.pkl','wb') as f: pickle.dump(stop_routes_array, f)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Transfer: [[[stop0_nameTargetStop1, transferTime1], [stop0_nameTargetStop2, transferTime2],….], [stop1_nameTargetStop1, transferTime1], [stop1_nameTargetStop2, transferTime2],….],…]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "transfers.stop_id.count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "transfer_pandas = transfers[[\"stop_int\",\"stop_int_2\", \"Transfer_time_sec\", \"stop_id_raw\"]].sort_values([\"stop_int\", \"stop_int_2\", \"stop_id_raw\"]).drop_duplicates([\"stop_int\", \"stop_int_2\"])\n",
- "transfer_pandas = transfer_pandas.reset_index(drop=True)\n",
- "transfer_pandas.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "transfer_pandas.stop_int_2.nunique()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "with open('../data/transfer_df.pkl','wb') as f: pickle.dump(transfers.sort_values(\"stop_id\"), f)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "transfer_array = transfer_pandas[[\"stop_int_2\", \"Transfer_time_sec\"]].to_numpy()\n",
- "transfer_array"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "with open('../data/transfer_array.pkl','wb') as f: pickle.dump(transfer_array, f)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "np.size(transfer_array, 0)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Check if indexes in stops is correct"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We see first the pointers"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stops_df.head(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We see that at index 8 there should be a new stop. We check and it is false"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "transfer_pandas.loc[5:10].head(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We see that at index 4 we should have a new stop. We check and it is true"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stop_routes.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stop_routes.loc[stop_routes['stop_int'] == 172]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "route_stops.loc[route_stops['stop_int'] == 172]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "read files as pickles"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "with open('../data/stop_times_array.pkl','rb') as f: arrayname1 = pickle.load(f)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "with open('../data/routes_array.pkl','rb') as f: arrayname2 = pickle.load(f)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "with open('../data/route_stops_array.pkl','rb') as f: arrayname3 = pickle.load(f)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "arrayname1"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "arrayname2"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "arrayname3"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.6"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/notebooks/old_notebooks/OLD_hdfs_data_processing_spark.ipynb b/notebooks/old_notebooks/OLD_hdfs_data_processing_spark.ipynb
deleted file mode 100644
index c9d6cdc..0000000
--- a/notebooks/old_notebooks/OLD_hdfs_data_processing_spark.ipynb
+++ /dev/null
@@ -1,2225 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# First preprocessing timetable data with Spark\n",
- "\n",
- "We do a first preprocessing in spark to prepare the stop_times and transfer dataset which will then be finetuned for the RAPTOR algorithm using pandas.\n",
- "\n",
- "\n",
- "Any application without a proper name would be promptly killed.
"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "Current session configs: {'conf': {'spark.app.name': 'lgptguys_final'}, 'kind': 'pyspark'}
"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "\n",
- "ID | YARN Application ID | Kind | State | Spark UI | Driver log | Current session? |
---|
7611 | application_1589299642358_2106 | pyspark | idle | Link | Link | |
7617 | application_1589299642358_2112 | pyspark | idle | Link | Link | |
7632 | application_1589299642358_2126 | pyspark | idle | Link | Link | |
7633 | application_1589299642358_2127 | pyspark | busy | Link | Link | |
7635 | application_1589299642358_2129 | pyspark | idle | Link | Link | |
7640 | application_1589299642358_2135 | pyspark | idle | Link | Link | |
7642 | application_1589299642358_2138 | pyspark | idle | Link | Link | |
7644 | application_1589299642358_2140 | pyspark | busy | Link | Link | |
7651 | application_1589299642358_2147 | pyspark | idle | Link | Link | |
7653 | application_1589299642358_2149 | pyspark | idle | Link | Link | |
7656 | application_1589299642358_2152 | pyspark | idle | Link | Link | |
7664 | application_1589299642358_2160 | pyspark | idle | Link | Link | |
7665 | application_1589299642358_2161 | pyspark | idle | Link | Link | |
7667 | application_1589299642358_2163 | pyspark | idle | Link | Link | |
7670 | application_1589299642358_2166 | pyspark | busy | Link | Link | |
7674 | application_1589299642358_2170 | pyspark | idle | Link | Link | |
7675 | application_1589299642358_2171 | pyspark | idle | Link | Link | |
7676 | application_1589299642358_2172 | pyspark | busy | Link | Link | |
7677 | application_1589299642358_2173 | pyspark | busy | Link | Link | |
7678 | application_1589299642358_2174 | pyspark | idle | Link | Link | |
7680 | application_1589299642358_2176 | pyspark | idle | Link | Link | |
"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "%%configure\n",
- "{\"conf\": {\n",
- " \"spark.app.name\": \"lgptguys_final\"\n",
- "}}"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Start Spark"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Starting Spark application\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "ID | YARN Application ID | Kind | State | Spark UI | Driver log | Current session? |
---|
7681 | application_1589299642358_2177 | pyspark | idle | Link | Link | ✔ |
"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "SparkSession available as 'spark'.\n"
- ]
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "An error was encountered:\n",
- "unknown magic command '%spark'\n",
- "UnknownMagic: unknown magic command '%spark'\n",
- "\n"
- ]
- }
- ],
- "source": [
- "# Initialization\n",
- "%%spark"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Preprocessing"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "To avoid the need of re-running the whole code, you can read the file generated at each step using the 'spark.read.csv' cell\n",
- "\n",
- "In this notebook we do the following:\n",
- "- Create list of stops within 15 km from zurich HB\n",
- "- Prepare transfer dataset\n",
- "- - Precompute walking times between stops at less than 500 m\n",
- "- Prepare stop_times dataset\n",
- "- - Keep only trips belonging to services which run each day of the business week (Monday to Friday).\n",
- "- - Keep only stops within the 15km of Zurich\n",
- "- - Make trip_id not specific for 1 day\n",
- "- - Add stop_name to stop_times\n",
- "- - Add transport information"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Import of relevant packages"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "import numpy as np\n",
- "import pandas as pd\n",
- "from pyspark.sql.functions import *\n",
- "from pyspark.sql.types import DoubleType\n",
- "from pyspark.sql.types import DateType\n",
- "from geopy.distance import great_circle\n",
- "from geopy.distance import great_circle\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Create list of stops within 15 km of Zürich Hauptbahnhof"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# Zurich HB coordinates\n",
- "zurich_geo = (47.378177, 8.540192)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We first load all stops in the dataset."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+-------+--------------------+----------------+----------------+-------------+--------------+\n",
- "|stop_id| stop_name| stop_lat| stop_lon|location_type|parent_station|\n",
- "+-------+--------------------+----------------+----------------+-------------+--------------+\n",
- "|1322000| Altoggio|46.1672513851495| 8.345807131427| null| null|\n",
- "|1322001| Antronapiana| 46.060121674738|8.11361957990831| null| null|\n",
- "|1322002| Anzola|45.9898698225697|8.34571729989858| null| null|\n",
- "|1322003| Baceno|46.2614983591677|8.31925293162473| null| null|\n",
- "|1322004|Beura Cardezza, c...|46.0790618438814|8.29927439970313| null| null|\n",
- "+-------+--------------------+----------------+----------------+-------------+--------------+\n",
- "only showing top 5 rows"
- ]
- }
- ],
- "source": [
- "stops = spark.read.csv(\"/data/sbb/timetables/csv/stops/2019/05/14/stops.txt\", header=True, sep = \",\")\n",
- "stops.show(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "How many stops are there in the dataset, before filtering for those in a 15km radius of Zürich HB ?"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "30631"
- ]
- }
- ],
- "source": [
- "stops.count()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "To filter stops within a 15km radius of Zürich HB, we use the function `distance.great_circle` from package `geopy`. It takes two places as an input, each defined by a pair of coordinates, and returns the surface distance between those two points by approximating the Earth as a sphere. To use it we have to define a udf function."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "#defining udf function\n",
- "@udf(\"float\")\n",
- "def great_circle_udf(x, y):\n",
- " return great_circle(x, y).kilometers"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+-----------+--------------------+----------------+----------------+-------------+--------------+\n",
- "| stop_id| stop_name| stop_lat| stop_lon|location_type|parent_station|\n",
- "+-----------+--------------------+----------------+----------------+-------------+--------------+\n",
- "| 8500926|Oetwil a.d.L., Sc...|47.4236270123012| 8.4031825286317| null| null|\n",
- "| 8502186|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| null| 8502186P|\n",
- "|8502186:0:1|Dietikon Stoffelbach|47.3934666445388|8.39894248049007| null| 8502186P|\n",
- "|8502186:0:2|Dietikon Stoffelbach|47.3935274568464|8.39894248049007| null| 8502186P|\n",
- "| 8502186P|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| 1| null|\n",
- "+-----------+--------------------+----------------+----------------+-------------+--------------+\n",
- "only showing top 5 rows"
- ]
- }
- ],
- "source": [
- "#transforming Zurich HB coordinates in a spark dataframe column object\n",
- "zurich_geo_col = struct(lit(zurich_geo[0]), lit(zurich_geo[1]))\n",
- "\n",
- "#applying filter function based on distance\n",
- "stops_15km = stops.filter(great_circle_udf(zurich_geo_col, struct(stops.stop_lat, stops.stop_lon)) < 15)\n",
- "stops_15km.show(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "It is pretty obvious from the stop names in `stops_15km` that the radius was reduced to the vicinity of Zürich.\n",
- "How many stops are left after restricting the transport data to the radius of 15km around Zürich HB ?"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "1883"
- ]
- }
- ],
- "source": [
- "stops_15km.count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "stops_15km.write.csv('data/lgpt_guys/stops_15km.csv', header = True, mode=\"overwrite\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+-----------+--------------------+----------------+----------------+-------------+--------------+\n",
- "| stop_id| stop_name| stop_lat| stop_lon|location_type|parent_station|\n",
- "+-----------+--------------------+----------------+----------------+-------------+--------------+\n",
- "| 8500926|Oetwil a.d.L., Sc...|47.4236270123012| 8.4031825286317| null| null|\n",
- "| 8502186|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| null| 8502186P|\n",
- "|8502186:0:1|Dietikon Stoffelbach|47.3934666445388|8.39894248049007| null| 8502186P|\n",
- "|8502186:0:2|Dietikon Stoffelbach|47.3935274568464|8.39894248049007| null| 8502186P|\n",
- "| 8502186P|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| 1| null|\n",
- "+-----------+--------------------+----------------+----------------+-------------+--------------+\n",
- "only showing top 5 rows"
- ]
- }
- ],
- "source": [
- "stops_15km = spark.read.csv('data/lgpt_guys/stops_15km.csv', header = True)\n",
- "stops_15km.show(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Precomputing walking times between stations under 500m distance\n",
- "\n",
- "When two stops are close to each other (less than 500m away), the user may decide to walk from one to the next instead of taking a transport. Therefore, the journey planning algorithm should take footpaths into account. For each footpaths, defined as a departure stop and an arrival stop, we pre-compute the walking time in seconds, by considering a constant walking speed of 50 meters per minute. Note that walking from A to B must be kept as a separate record as walking from B to A for our implementation of RAPTOR.\n",
- "\n",
- "We employ a brute force approach to solve this problem. Since the number of stops is pretty small, we simply compute the walking distance from each stop to every other stop. We prepare this comparison by doing a cross-join (each row of the first dataframe gets joined to every row of the other dataset)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "root\n",
- " |-- stop_id: string (nullable = true)\n",
- " |-- stop_name: string (nullable = true)\n",
- " |-- stop_lat: string (nullable = true)\n",
- " |-- stop_lon: string (nullable = true)\n",
- " |-- location_type: string (nullable = true)\n",
- " |-- parent_station: string (nullable = true)"
- ]
- }
- ],
- "source": [
- "stops_15km.printSchema()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "#create second dataframe for crossjoin\n",
- "stops_15km2 = stops_15km.select(stops_15km.stop_id.alias(\"stop_id2\"), stops_15km.stop_lat.alias(\"stop_lat2\"),\\\n",
- " stops_15km.stop_lon.alias(\"stop_lon2\"), stops_15km.stop_name.alias(\"stop_name2\") )"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+-------+--------------------+----------------+---------------+-------------+--------------+-----------+----------------+----------------+--------------------+\n",
- "|stop_id| stop_name| stop_lat| stop_lon|location_type|parent_station| stop_id2| stop_lat2| stop_lon2| stop_name2|\n",
- "+-------+--------------------+----------------+---------------+-------------+--------------+-----------+----------------+----------------+--------------------+\n",
- "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 8500926|47.4236270123012| 8.4031825286317|Oetwil a.d.L., Sc...|\n",
- "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 8502186|47.3934058321612|8.39894248049007|Dietikon Stoffelbach|\n",
- "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null|8502186:0:1|47.3934666445388|8.39894248049007|Dietikon Stoffelbach|\n",
- "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null|8502186:0:2|47.3935274568464|8.39894248049007|Dietikon Stoffelbach|\n",
- "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 8502186P|47.3934058321612|8.39894248049007|Dietikon Stoffelbach|\n",
- "+-------+--------------------+----------------+---------------+-------------+--------------+-----------+----------------+----------------+--------------------+\n",
- "only showing top 5 rows"
- ]
- }
- ],
- "source": [
- "#creating all combinations of transfers\n",
- "combination_stops = stops_15km.crossJoin(stops_15km2)\n",
- "combination_stops.show(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Notice how the first stop in `stops_15km` is now joined to all other stops (including itself) in `combination_stops`.\n",
- "\n",
- "Transfers to the same stop are not considered as footpaths (we simply define a fixed transfer time of 2 minutes). Therefore, lines in `combination_stops` where the two stops are the same can be filtered out."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+-------+--------------------+----------------+---------------+-------------+--------------+-----------+----------------+----------------+--------------------+\n",
- "|stop_id| stop_name| stop_lat| stop_lon|location_type|parent_station| stop_id2| stop_lat2| stop_lon2| stop_name2|\n",
- "+-------+--------------------+----------------+---------------+-------------+--------------+-----------+----------------+----------------+--------------------+\n",
- "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 8502186|47.3934058321612|8.39894248049007|Dietikon Stoffelbach|\n",
- "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null|8502186:0:1|47.3934666445388|8.39894248049007|Dietikon Stoffelbach|\n",
- "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null|8502186:0:2|47.3935274568464|8.39894248049007|Dietikon Stoffelbach|\n",
- "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 8502186P|47.3934058321612|8.39894248049007|Dietikon Stoffelbach|\n",
- "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 8502187|47.3646945560768|8.37709545277724|Rudolfstetten Hof...|\n",
- "+-------+--------------------+----------------+---------------+-------------+--------------+-----------+----------------+----------------+--------------------+\n",
- "only showing top 5 rows"
- ]
- }
- ],
- "source": [
- "#eliminating transfer to same station\n",
- "combination_stops = combination_stops.filter(combination_stops.stop_id!=combination_stops.stop_id2)\n",
- "combination_stops.show(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We use `distance.great_circle` between each pair of stops and keep only those for which the walking distance is less than 500m."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+-------+--------------------+----------------+----------------+-------------+--------------+-----------+----------------+----------------+--------------------+------------+\n",
- "|stop_id| stop_name| stop_lat| stop_lon|location_type|parent_station| stop_id2| stop_lat2| stop_lon2| stop_name2| distance|\n",
- "+-------+--------------------+----------------+----------------+-------------+--------------+-----------+----------------+----------------+--------------------+------------+\n",
- "|8500926|Oetwil a.d.L., Sc...|47.4236270123012| 8.4031825286317| null| null| 8590616|47.4228794433749|8.40437728795975|Geroldswil, Schwe...| 0.122429974|\n",
- "|8500926|Oetwil a.d.L., Sc...|47.4236270123012| 8.4031825286317| null| null| 8590737|47.4253712985116|8.40013723981811|Oetwil an der Lim...| 0.30017462|\n",
- "|8502186|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| null| 8502186P|8502186:0:1|47.3934666445388|8.39894248049007|Dietikon Stoffelbach|0.0067620375|\n",
- "|8502186|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| null| 8502186P|8502186:0:2|47.3935274568464|8.39894248049007|Dietikon Stoffelbach| 0.013524067|\n",
- "|8502186|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| null| 8502186P| 8502186P|47.3934058321612|8.39894248049007|Dietikon Stoffelbach| 0.0|\n",
- "+-------+--------------------+----------------+----------------+-------------+--------------+-----------+----------------+----------------+--------------------+------------+\n",
- "only showing top 5 rows"
- ]
- }
- ],
- "source": [
- "# adding distance\n",
- "combination_stops = combination_stops.withColumn(\"distance\", \\\n",
- " great_circle_udf(struct(combination_stops.stop_lat, combination_stops.stop_lon), \\\n",
- " struct(combination_stops.stop_lat2, combination_stops.stop_lon2)))\n",
- "combination_stops = combination_stops.filter(combination_stops.distance < 0.5)\n",
- "combination_stops.show(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Finally, the walking distance between each pair of stops is transformed to a walking time in seconds, considering a constant walking speed of 50 meters per minute.\n",
- "\n",
- "**Note that this step may take up to a few minutes to run**"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+-------+-----------+------------+-----------------+--------------------+--------------------+\n",
- "|stop_id| stop_id2| distance|Transfer_time_sec| stop_name| stop_name2|\n",
- "+-------+-----------+------------+-----------------+--------------------+--------------------+\n",
- "|8500926| 8590616| 0.122429974| 146|Oetwil a.d.L., Sc...|Geroldswil, Schwe...|\n",
- "|8500926| 8590737| 0.30017462| 360|Oetwil a.d.L., Sc...|Oetwil an der Lim...|\n",
- "|8502186|8502186:0:1|0.0067620375| 8|Dietikon Stoffelbach|Dietikon Stoffelbach|\n",
- "|8502186|8502186:0:2| 0.013524067| 16|Dietikon Stoffelbach|Dietikon Stoffelbach|\n",
- "|8502186| 8502186P| 0.0| 0|Dietikon Stoffelbach|Dietikon Stoffelbach|\n",
- "+-------+-----------+------------+-----------------+--------------------+--------------------+\n",
- "only showing top 5 rows"
- ]
- }
- ],
- "source": [
- "#adding transfer time in seconds\n",
- "transfer = combination_stops.select(combination_stops.stop_id, combination_stops.stop_id2,\\\n",
- " combination_stops.distance, ((combination_stops.distance/0.05*60).alias(\"Transfer_time_sec\")).cast(\"integer\"), combination_stops.stop_name, combination_stops.stop_name2)\n",
- "transfer.show(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Notice that some stops share the same exact coordinates or are very close to each other. Often, these stops are children stops of the same parent stop."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "root\n",
- " |-- stop_id: string (nullable = true)\n",
- " |-- stop_id2: string (nullable = true)\n",
- " |-- distance: float (nullable = true)\n",
- " |-- Transfer_time_sec: integer (nullable = true)\n",
- " |-- stop_name: string (nullable = true)\n",
- " |-- stop_name2: string (nullable = true)"
- ]
- }
- ],
- "source": [
- "transfer.printSchema()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "transfer.write.csv('data/lgpt_guys/transfers.csv', header = True, mode=\"overwrite\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "transfer = spark.read.csv('data/lgpt_guys/transfers.csv', header = True)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Clean Stop_times data\n",
- "The action we want to do on this dataset is to :\n",
- "- Keep only trips belonging to services which run each day of the business week (Monday to Friday).\n",
- "- Keep only stops within the 15km of Zurich\n",
- "- Make trip_id not specific for 1 day\n",
- "- Add stop_name to stop_times\n",
- "- Add transport information\n",
- "\n",
- "To do this we will serve ourselves of 4 additional datasets:\n",
- "- Calendar.txt which contains the information on the periodicity of services\n",
- "- Trip.txt which contains a service_id and trip_id. This dataset will be used as a connection between calendar and stop_times\n",
- "- Routes.txt which contains transport information we can add to stop_times\n",
- "- The previously filtered dataset of stops15km"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Keeping services running on business days\n",
- "#### Calendar.txt\n",
- "We suppose a service is the equivalent of a transport line\n",
- "- drop services that are not on all business days\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Reading the file from HDFS"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "calendar = spark.read.csv(\"/data/sbb/timetables/csv/calendar/2019/05/14/calendar.txt\", header=True, sep = \",\")\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+----------+------+-------+---------+--------+------+--------+------+----------+--------+\n",
- "|service_id|monday|tuesday|wednesday|thursday|friday|saturday|sunday|start_date|end_date|\n",
- "+----------+------+-------+---------+--------+------+--------+------+----------+--------+\n",
- "| TA+b0nx9| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n",
- "| TA+b03bf| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n",
- "| TA+b0008| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n",
- "| TA+b0nxg| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n",
- "| TA+b08k4| 1| 0| 0| 0| 0| 0| 0| 20181209|20191214|\n",
- "+----------+------+-------+---------+--------+------+--------+------+----------+--------+\n",
- "only showing top 5 rows"
- ]
- }
- ],
- "source": [
- "calendar.show(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We want to keep only services that run on all business days"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+----------+------+-------+---------+--------+------+--------+------+----------+--------+\n",
- "|service_id|monday|tuesday|wednesday|thursday|friday|saturday|sunday|start_date|end_date|\n",
- "+----------+------+-------+---------+--------+------+--------+------+----------+--------+\n",
- "| TA+b0nx9| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n",
- "| TA+b03bf| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n",
- "| TA+b0008| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n",
- "| TA+b0nxg| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n",
- "| TA+b0nxn| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n",
- "+----------+------+-------+---------+--------+------+--------+------+----------+--------+\n",
- "only showing top 5 rows"
- ]
- }
- ],
- "source": [
- "calendar_business_days = calendar.filter((calendar.monday==1) & \\\n",
- " (calendar.tuesday==1) & \\\n",
- " (calendar.wednesday==1) & \\\n",
- " (calendar.thursday==1) & \\\n",
- " (calendar.friday==1))\n",
- "calendar_business_days.show(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Create a dataframe with the filtered services"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+----------+\n",
- "|service_id|\n",
- "+----------+\n",
- "| TA+b0nx9|\n",
- "| TA+b03bf|\n",
- "| TA+b0008|\n",
- "| TA+b0nxg|\n",
- "| TA+b0nxn|\n",
- "+----------+\n",
- "only showing top 5 rows"
- ]
- }
- ],
- "source": [
- "service_id_business_days = calendar_business_days.select(calendar_business_days.service_id)\n",
- "service_id_business_days.show(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### trips.txt\n",
- "We remove trips that do not run on business days, thanks to the filtered service dataset created before.\n",
- "We can use the filtered trip_ids to filter trips not in the stop_times dataset"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Reading file from HDFS"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+-----------+----------+--------------------+------------------+---------------+------------+\n",
- "| route_id|service_id| trip_id| trip_headsign|trip_short_name|direction_id|\n",
- "+-----------+----------+--------------------+------------------+---------------+------------+\n",
- "|1-1-C-j19-1| TA+b0001|5.TA.1-1-C-j19-1.3.R|Zofingen, Altachen| 108| 1|\n",
- "|1-1-C-j19-1| TA+b0001|7.TA.1-1-C-j19-1.3.R|Zofingen, Altachen| 112| 1|\n",
- "|1-1-C-j19-1| TA+b0001|9.TA.1-1-C-j19-1.3.R|Zofingen, Altachen| 116| 1|\n",
- "|1-1-C-j19-1| TA+b0001|11.TA.1-1-C-j19-1...|Zofingen, Altachen| 120| 1|\n",
- "|1-1-C-j19-1| TA+b0001|13.TA.1-1-C-j19-1...|Zofingen, Altachen| 124| 1|\n",
- "+-----------+----------+--------------------+------------------+---------------+------------+\n",
- "only showing top 5 rows"
- ]
- }
- ],
- "source": [
- "trips = spark.read.csv(\"/data/sbb/timetables/csv/trips/2019/05/14/trips.txt\", header=True, sep = \",\")\n",
- "trips.show(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "1017413"
- ]
- }
- ],
- "source": [
- "trips.count()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We filter trips with an inner join with the filtered service dataset"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+----------+-----------+--------------------+--------------------+---------------+------------+\n",
- "|service_id| route_id| trip_id| trip_headsign|trip_short_name|direction_id|\n",
- "+----------+-----------+--------------------+--------------------+---------------+------------+\n",
- "| TA+b0001|1-1-C-j19-1|46.TA.1-1-C-j19-1...|Aarburg-Oftringen...| 113| 0|\n",
- "| TA+b0001|1-1-C-j19-1|59.TA.1-1-C-j19-1...|Aarburg-Oftringen...| 139| 0|\n",
- "| TA+b0001|1-340-j19-1|2.TA.1-340-j19-1.1.H| Wohlen AG, Bahnhof| 105| 0|\n",
- "| TA+b0001|1-354-j19-1|36.TA.1-354-j19-1...|Kaiserstuhl AG, B...| 35435| 0|\n",
- "| TA+b0001|1-354-j19-1|47.TA.1-354-j19-1...|Kaiserstuhl AG, B...| 35467| 0|\n",
- "+----------+-----------+--------------------+--------------------+---------------+------------+\n",
- "only showing top 5 rows"
- ]
- }
- ],
- "source": [
- "trips_businessdays = service_id_business_days.join(trips, how=\"inner\", on=\"service_id\").dropDuplicates()\n",
- "trips_businessdays.show(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "528368"
- ]
- }
- ],
- "source": [
- "trips_businessdays.count()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Close to half of the trips are filtered out due to them not belonging to services running on each business day of the week."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "trips_businessdays.write.csv('data/lgpt_guys/trips_businessdays.csv', header = True, mode=\"overwrite\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "trips_businessdays = spark.read.csv('data/lgpt_guys/trips_businessdays.csv', header = True)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Stop_times.txt\n",
- "Now we can act directly on stop_times to drop trips not active on all business days"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Reading file from HDFS"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+--------------------+------------+--------------+-----------+-------------+-----------+-------------+\n",
- "| trip_id|arrival_time|departure_time| stop_id|stop_sequence|pickup_type|drop_off_type|\n",
- "+--------------------+------------+--------------+-----------+-------------+-----------+-------------+\n",
- "|1.TA.1-1-B-j19-1.1.R| 04:20:00| 04:20:00|8500010:0:3| 1| 0| 0|\n",
- "|1.TA.1-1-B-j19-1.1.R| 04:24:00| 04:24:00|8500020:0:3| 2| 0| 0|\n",
- "|1.TA.1-1-B-j19-1.1.R| 04:28:00| 04:28:00|8500021:0:5| 3| 0| 0|\n",
- "|1.TA.1-1-B-j19-1.1.R| 04:30:00| 04:30:00|8517131:0:2| 4| 0| 0|\n",
- "|1.TA.1-1-B-j19-1.1.R| 04:32:00| 04:32:00|8500300:0:5| 5| 0| 0|\n",
- "+--------------------+------------+--------------+-----------+-------------+-----------+-------------+\n",
- "only showing top 5 rows"
- ]
- }
- ],
- "source": [
- "#read file\n",
- "stop_times = spark.read.csv(\"/data/sbb/timetables/csv/stop_times/2019/05/14/stop_times.txt\", header=True, sep = \",\")\n",
- "stop_times.show(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 32,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "11128930"
- ]
- }
- ],
- "source": [
- "stop_times.count()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We take the relevant columns from the filtered trips dataframe. We also add the route_id information, which will be useful later to add transport type info from the routes.txt dataset, and the direction_id, which will be useful to know the direction of the trip"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 33,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+--------------------+-----------+------------+\n",
- "| trip_id| route_id|direction_id|\n",
- "+--------------------+-----------+------------+\n",
- "|65.TA.1-1-C-j19-1...|1-1-C-j19-1| 0|\n",
- "|34.TA.1-135-j19-1...|1-135-j19-1| 0|\n",
- "|50.TA.1-136-j19-1...|1-136-j19-1| 1|\n",
- "|42.TA.1-139-j19-1...|1-139-j19-1| 0|\n",
- "|39.TA.1-141-j19-1...|1-141-j19-1| 1|\n",
- "+--------------------+-----------+------------+\n",
- "only showing top 5 rows"
- ]
- }
- ],
- "source": [
- "#dataframe for inner join\n",
- "trip_id_trips_businessdays = trips_businessdays.select(trips_businessdays.trip_id, trips_businessdays.route_id, trips_businessdays.direction_id)\n",
- "trip_id_trips_businessdays.show(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We filter the trips not active on all business days using an inner join"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 34,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+--------------------+------------+--------------+-----------+-------------+-----------+-------------+-----------+------------+\n",
- "| trip_id|arrival_time|departure_time| stop_id|stop_sequence|pickup_type|drop_off_type| route_id|direction_id|\n",
- "+--------------------+------------+--------------+-----------+-------------+-----------+-------------+-----------+------------+\n",
- "|123.TA.1-1-B-j19-...| 22:32:00| 22:32:00|8500300:0:5| 5| 0| 0|1-1-B-j19-1| 1|\n",
- "|152.TA.1-1-B-j19-...| 08:18:00| 08:19:00|8500302:0:2| 5| 0| 0|1-1-B-j19-1| 0|\n",
- "|238.TA.1-1-B-j19-...| 22:14:00| 22:14:00|8500303:0:3| 4| 0| 0|1-1-B-j19-1| 0|\n",
- "|230.TA.1-1-B-j19-...| 22:18:00| 22:19:00|8500302:0:2| 5| 0| 0|1-1-B-j19-1| 0|\n",
- "|251.TA.1-1-B-j19-...| 24:10:00| 24:11:00|8500320:0:4| 3| 0| 0|1-1-B-j19-1| 0|\n",
- "+--------------------+------------+--------------+-----------+-------------+-----------+-------------+-----------+------------+\n",
- "only showing top 5 rows"
- ]
- }
- ],
- "source": [
- "#drop trips not in business days\n",
- "stop_times_business_days = stop_times.join(trip_id_trips_businessdays, how=\"inner\", on = \"trip_id\").dropDuplicates()\n",
- "stop_times_business_days.show(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "3578105"
- ]
- }
- ],
- "source": [
- "stop_times_business_days.count()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Three quarters of the stop times get filtered out because they belong to trips not in services that run each day of the business week."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Filter stops\n",
- "We filter out stop times not belonging to stops in the 15km radius of Zürich HB (we have already computed this list of stops above)."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We select the columns we are interested in"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 36,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+-----------+--------------------+\n",
- "| stop_id| stop_name|\n",
- "+-----------+--------------------+\n",
- "| 8500926|Oetwil a.d.L., Sc...|\n",
- "| 8502186|Dietikon Stoffelbach|\n",
- "|8502186:0:1|Dietikon Stoffelbach|\n",
- "|8502186:0:2|Dietikon Stoffelbach|\n",
- "| 8502186P|Dietikon Stoffelbach|\n",
- "+-----------+--------------------+\n",
- "only showing top 5 rows"
- ]
- }
- ],
- "source": [
- "stop_id_stops_15km = stops_15km.select(stops_15km.stop_id, stops_15km.stop_name)\n",
- "stop_id_stops_15km.show(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Drop trips not in 15 km radius and add stop_names"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 37,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+-----------+--------------------+------------+--------------+-------------+-----------+-------------+------------+------------+--------------------+\n",
- "| stop_id| trip_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type| route_id|direction_id| stop_name|\n",
- "+-----------+--------------------+------------+--------------+-------------+-----------+-------------+------------+------------+--------------------+\n",
- "|8503000:0:9|68.TA.25-75-j19-1...| 16:25:00| 16:25:00| 7| 0| 0| 25-75-j19-1| 1| Zürich HB|\n",
- "|8503202:0:4|264.TA.25-75-j19-...| 16:44:00| 16:45:00| 3| 0| 0| 25-75-j19-1| 0| Thalwil|\n",
- "| 8588055|5.TA.26-811-j19-1...| 17:09:00| 17:09:00| 10| 0| 0|26-811-j19-1| 0|Uster, Oberlandst...|\n",
- "| 8573504|7.TA.26-811-j19-1...| 23:56:00| 23:56:00| 11| 0| 0|26-811-j19-1| 0| Uster, Bahnhof|\n",
- "| 8587860|11.TA.26-811-j19-...| 22:47:00| 22:47:00| 3| 0| 0|26-811-j19-1| 0| Uster, Strick|\n",
- "+-----------+--------------------+------------+--------------+-------------+-----------+-------------+------------+------------+--------------------+\n",
- "only showing top 5 rows"
- ]
- }
- ],
- "source": [
- "stop_times_zurich = stop_times_business_days.join(stop_id_stops_15km, how=\"inner\", on = \"stop_id\").dropDuplicates()\n",
- "stop_times_zurich.show(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "398630"
- ]
- }
- ],
- "source": [
- "stop_times_zurich.count()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "This reduces the number of stop times by an order of magnitude."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "stop_times_zurich.write.csv('data/lgpt_guys/stop_times_zurich.csv', header = True, mode=\"overwrite\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 40,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "stop_times_zurich = spark.read.csv('data/lgpt_guys/stop_times_zurich.csv', header = True)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Adding route information\n",
- "We need to get the route information from the routes.txt dataset\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Reading file from HDFS"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 41,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+-----------+---------+----------------+---------------+----------+----------+\n",
- "| route_id|agency_id|route_short_name|route_long_name|route_desc|route_type|\n",
- "+-----------+---------+----------------+---------------+----------+----------+\n",
- "|11-40-j19-1| 801| 040| null| Bus| 700|\n",
- "|11-61-j19-1| 7031| 061| null| Bus| 700|\n",
- "|11-62-j19-1| 7031| 062| null| Bus| 700|\n",
- "|24-64-j19-1| 801| 064| null| Bus| 700|\n",
- "|11-83-j19-1| 801| 083| null| Bus| 700|\n",
- "+-----------+---------+----------------+---------------+----------+----------+\n",
- "only showing top 5 rows"
- ]
- }
- ],
- "source": [
- "routes = spark.read.csv(\"/data/sbb/timetables/csv/routes/2019/05/14/routes.txt\", header=True, sep = \",\")\n",
- "routes.show(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 42,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+-------------+-----------+--------------------+------------+--------------+-------------+-----------+-------------+------------+--------------------+-------------+\n",
- "| route_id| stop_id| trip_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|direction_id| stop_name| route_desc|\n",
- "+-------------+-----------+--------------------+------------+--------------+-------------+-----------+-------------+------------+--------------------+-------------+\n",
- "| 6-17-j19-1|8503001:0:7|491.TA.6-17-j19-1...| 08:58:00| 08:58:00| 2| 0| 0| 0| Zürich Altstetten| InterRegio|\n",
- "| 26-10-j19-1| 8591123|1413.TA.26-10-j19...| 13:27:00| 13:27:00| 5| 0| 0| 1|Zürich, ETH/Unive...| Tram|\n",
- "|26-23-A-j19-1| 8591362|432.TA.26-23-A-j1...| 10:00:00| 10:00:00| 1| 0| 0| 0|Zürich, Seilbahn ...|Standseilbahn|\n",
- "| 26-31-j19-1| 8591233|2629.TA.26-31-j19...| 21:02:00| 21:02:00| 24| 0| 0| 0| Zürich, Klusplatz| Bus|\n",
- "| 26-311-j19-1|8587020:0:D|128.TA.26-311-j19...| 11:30:00| 11:30:00| 14| 0| 0| 1| Dietikon, Bahnhof| Bus|\n",
- "+-------------+-----------+--------------------+------------+--------------+-------------+-----------+-------------+------------+--------------------+-------------+\n",
- "only showing top 5 rows"
- ]
- }
- ],
- "source": [
- "stop_times_zurich_route = stop_times_zurich.join(routes.select(\"route_id\", \"route_desc\"), how=\"inner\", on = \"route_id\").dropDuplicates()\n",
- "stop_times_zurich_route.show(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Keeping 1 trip per week\n",
- "The trip_ids are day specific. The definition of trip for the RAPTOR algorithm is a sequence of stops at a given time, indipendent of the date. Due to this we keep only 1 trip_id for each stop sequence at a specific time\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 43,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "398630"
- ]
- }
- ],
- "source": [
- "stop_times_zurich_route.count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 44,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "#drop day specific trips\n",
- "stop_times_route = stop_times_zurich_route.orderBy(\"trip_id\").dropDuplicates([\"stop_id\", \"arrival_time\", \"departure_time\", \\\n",
- " \"stop_sequence\", \"route_id\", \"direction_id\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 45,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "331751"
- ]
- }
- ],
- "source": [
- "stop_times_route.count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 46,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+------------+---------------+--------------------+------------+--------------+-------------+------------+--------------------+----------+\n",
- "| route_id| stop_id| trip_id|arrival_time|departure_time|stop_sequence|direction_id| stop_name|route_desc|\n",
- "+------------+---------------+--------------------+------------+--------------+-------------+------------+--------------------+----------+\n",
- "|26-733-j19-1| 8580433|136.TA.26-733-j19...| 05:15:00| 05:15:00| 1| 0| Kloten, Graswinkel| Bus|\n",
- "|26-734-j19-1| 8573205:0:E|141.TA.26-734-j19...| 05:28:00| 05:28:00| 1| 1|Zürich Flughafen,...| Bus|\n",
- "| 26-10-j19-1| 8588553|396.TA.26-10-j19-...| 05:38:00| 05:38:00| 1| 0|Zürich Flughafen,...| Tram|\n",
- "|26-5-A-j19-1|8503000:0:41/42|19.TA.26-5-A-j19-...| 05:39:00| 05:39:00| 1| 0| Zürich HB| S-Bahn|\n",
- "|26-725-j19-1| 8573504|323.TA.26-725-j19...| 06:09:00| 06:09:00| 1| 0| Uster, Bahnhof| Bus|\n",
- "+------------+---------------+--------------------+------------+--------------+-------------+------------+--------------------+----------+\n",
- "only showing top 5 rows"
- ]
- }
- ],
- "source": [
- "stop_times_curated = stop_times_route.drop(\"pickup_type\", \"drop_off_type\")\n",
- "stop_times_curated.show(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 47,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "stop_times_curated.write.csv('data/lgpt_guys/stop_times_curated.csv', header = True, mode=\"overwrite\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 48,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "stop_times_curated = spark.read.csv('data/lgpt_guys/stop_times_curated.csv', header = True)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Read preprocessed files\n",
- "Here we can have an overview of the file after a first Spark preprocessing"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Transfer time walking distance between zurich stops"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 49,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+-------+-----------+------------+-----------------+--------------------+--------------------+\n",
- "|stop_id| stop_id2| distance|Transfer_time_sec| stop_name| stop_name2|\n",
- "+-------+-----------+------------+-----------------+--------------------+--------------------+\n",
- "|8500926| 8590616| 0.122429974| 146|Oetwil a.d.L., Sc...|Geroldswil, Schwe...|\n",
- "|8500926| 8590737| 0.30017462| 360|Oetwil a.d.L., Sc...|Oetwil an der Lim...|\n",
- "|8502186|8502186:0:1|0.0067620375| 8|Dietikon Stoffelbach|Dietikon Stoffelbach|\n",
- "|8502186|8502186:0:2| 0.013524067| 16|Dietikon Stoffelbach|Dietikon Stoffelbach|\n",
- "|8502186| 8502186P| 0.0| 0|Dietikon Stoffelbach|Dietikon Stoffelbach|\n",
- "+-------+-----------+------------+-----------------+--------------------+--------------------+\n",
- "only showing top 5 rows"
- ]
- }
- ],
- "source": [
- "transfer = spark.read.csv('data/lgpt_guys/transfers.csv', header=True)\n",
- "transfer.show(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Stops within a 15 km radius of Zurich"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 50,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+-----------+--------------------+----------------+----------------+-------------+--------------+\n",
- "| stop_id| stop_name| stop_lat| stop_lon|location_type|parent_station|\n",
- "+-----------+--------------------+----------------+----------------+-------------+--------------+\n",
- "| 8500926|Oetwil a.d.L., Sc...|47.4236270123012| 8.4031825286317| null| null|\n",
- "| 8502186|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| null| 8502186P|\n",
- "|8502186:0:1|Dietikon Stoffelbach|47.3934666445388|8.39894248049007| null| 8502186P|\n",
- "|8502186:0:2|Dietikon Stoffelbach|47.3935274568464|8.39894248049007| null| 8502186P|\n",
- "| 8502186P|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| 1| null|\n",
- "+-----------+--------------------+----------------+----------------+-------------+--------------+\n",
- "only showing top 5 rows"
- ]
- }
- ],
- "source": [
- "stops_15km = spark.read.csv('data/lgpt_guys/stops_15km.csv', header = True)\n",
- "stops_15km.show(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 51,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "1883"
- ]
- }
- ],
- "source": [
- "stops_15km.count()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Stop_times of stops inside 15 km and with trips on business days"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 52,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+-------------+-----------+--------------------+------------+--------------+-------------+------------+--------------------+----------+\n",
- "| route_id| stop_id| trip_id|arrival_time|departure_time|stop_sequence|direction_id| stop_name|route_desc|\n",
- "+-------------+-----------+--------------------+------------+--------------+-------------+------------+--------------------+----------+\n",
- "| 26-759-j19-1|8573205:0:K|1330.TA.26-759-j1...| 05:28:00| 05:28:00| 1| 1|Zürich Flughafen,...| Bus|\n",
- "| 26-67-j19-1| 8591341|46.TA.26-67-j19-1...| 05:33:00| 05:33:00| 1| 1|Zürich, Schmiede ...| Bus|\n",
- "| 26-325-j19-1|8587020:0:D|265.TA.26-325-j19...| 05:34:00| 05:34:00| 1| 0| Dietikon, Bahnhof| Bus|\n",
- "|26-11-A-j19-1| 8591382|1266.TA.26-11-A-j...| 05:37:00| 05:37:00| 1| 0|Zürich, Sternen O...| Tram|\n",
- "| 26-302-j19-1| 8590844|162.TA.26-302-j19...| 05:49:00| 05:49:00| 1| 1| Urdorf, Oberurdorf| Bus|\n",
- "+-------------+-----------+--------------------+------------+--------------+-------------+------------+--------------------+----------+\n",
- "only showing top 5 rows"
- ]
- }
- ],
- "source": [
- "stop_times_curated = spark.read.csv('data/lgpt_guys/stop_times_curated.csv', header = True)\n",
- "stop_times_curated.show(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 53,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "331751"
- ]
- }
- ],
- "source": [
- "stop_times_curated.count()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "PySpark",
- "language": "",
- "name": "pysparkkernel"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "python",
- "version": 3
- },
- "mimetype": "text/x-python",
- "name": "pyspark",
- "pygments_lexer": "python3"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/notebooks/old_notebooks/debugging.ipynb b/notebooks/old_notebooks/debugging.ipynb
deleted file mode 100644
index 9a66390..0000000
--- a/notebooks/old_notebooks/debugging.ipynb
+++ /dev/null
@@ -1,1469 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "import pickle\n",
- "import numpy as np\n",
- "import pandas as pd"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "def pkload(path):\n",
- " with open(path, 'rb') as f:\n",
- " obj = pickle.load(f)\n",
- " return obj"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Searching for journeys from Uster, Gschwader (stop 8588052) to Buchs ZH, Furttalstrasse (stop 8595356) with arrival at 17:30 leads to a footpath of over 3.2km + in reality, while the algorithm prints this:\n",
- "\n",
- "\" Walk 2.1 minutes from Birmensdorf ZH (stop 8502221)\n",
- " to Dällikon, Industrie (stop 8576276)\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " index | \n",
- " route_id | \n",
- " stop_id_general | \n",
- " trip_id | \n",
- " stop_id | \n",
- " arrival_time | \n",
- " departure_time | \n",
- " stop_sequence | \n",
- " stop_name | \n",
- " stop_lat | \n",
- " stop_lon | \n",
- " trip_headsign | \n",
- " trip_short_name | \n",
- " direction_id | \n",
- " departure_first_stop | \n",
- " route_int | \n",
- " stop_count | \n",
- " stop_int | \n",
- " route_desc | \n",
- " monotonically_increasing_id | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 27708 | \n",
- " 197013 | \n",
- " 26-5-A-j19-1 | \n",
- " 8502221 | \n",
- " 114.TA.26-5-A-j19-1.37.R | \n",
- " 8502221:0:2 | \n",
- " 2020-05-24 19:35:00 | \n",
- " 2020-05-24 19:35:00 | \n",
- " 9 | \n",
- " Birmensdorf ZH | \n",
- " 47.357496 | \n",
- " 8.437543 | \n",
- " Pfäffikon SZ | \n",
- " 18575 | \n",
- " 1 | \n",
- " 19:22:00 | \n",
- " 149 | \n",
- " 10 | \n",
- " 276 | \n",
- " S-Bahn | \n",
- " 180388626589 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " index route_id stop_id_general trip_id \\\n",
- "27708 197013 26-5-A-j19-1 8502221 114.TA.26-5-A-j19-1.37.R \n",
- "\n",
- " stop_id arrival_time departure_time stop_sequence \\\n",
- "27708 8502221:0:2 2020-05-24 19:35:00 2020-05-24 19:35:00 9 \n",
- "\n",
- " stop_name stop_lat stop_lon trip_headsign trip_short_name \\\n",
- "27708 Birmensdorf ZH 47.357496 8.437543 Pfäffikon SZ 18575 \n",
- "\n",
- " direction_id departure_first_stop route_int stop_count stop_int \\\n",
- "27708 1 19:22:00 149 10 276 \n",
- "\n",
- " route_desc monotonically_increasing_id \n",
- "27708 S-Bahn 180388626589 "
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Birmensdorf ZH\n",
- "stop_times = pkload(\"../data/stop_times_df_cyril.pkl\")\n",
- "stop_times[stop_times['stop_id_general']==8502221].head(1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 55,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " index | \n",
- " route_id | \n",
- " stop_id_general | \n",
- " trip_id | \n",
- " stop_id | \n",
- " arrival_time | \n",
- " departure_time | \n",
- " stop_sequence | \n",
- " stop_name | \n",
- " stop_lat | \n",
- " stop_lon | \n",
- " trip_headsign | \n",
- " trip_short_name | \n",
- " direction_id | \n",
- " departure_first_stop | \n",
- " route_int | \n",
- " stop_count | \n",
- " stop_int | \n",
- " route_desc | \n",
- " monotonically_increasing_id | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 1935 | \n",
- " 222352 | \n",
- " 26-449-j19-1 | \n",
- " 8576276 | \n",
- " 17.TA.26-449-j19-1.1.H | \n",
- " 8576276 | \n",
- " NaT | \n",
- " 2020-05-24 07:00:00 | \n",
- " 2 | \n",
- " Dällikon, Industrie | \n",
- " 47.444737 | \n",
- " 8.438783 | \n",
- " Buchs-Dällikon, Bahnhof | \n",
- " 4747 | \n",
- " 0 | \n",
- " 07:00:00 | \n",
- " 19 | \n",
- " 3 | \n",
- " 474 | \n",
- " Bus | \n",
- " 8589935205 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " index route_id stop_id_general trip_id stop_id \\\n",
- "1935 222352 26-449-j19-1 8576276 17.TA.26-449-j19-1.1.H 8576276 \n",
- "\n",
- " arrival_time departure_time stop_sequence stop_name \\\n",
- "1935 NaT 2020-05-24 07:00:00 2 Dällikon, Industrie \n",
- "\n",
- " stop_lat stop_lon trip_headsign trip_short_name \\\n",
- "1935 47.444737 8.438783 Buchs-Dällikon, Bahnhof 4747 \n",
- "\n",
- " direction_id departure_first_stop route_int stop_count stop_int \\\n",
- "1935 0 07:00:00 19 3 474 \n",
- "\n",
- " route_desc monotonically_increasing_id \n",
- "1935 Bus 8589935205 "
- ]
- },
- "execution_count": 55,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Dallikon Industrie\n",
- "stop_times[stop_times['stop_id_general']==8576276].head(1)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "A google maps footpaths using GPS coordinates yields a walk of 14.1 km. Is there a footpath defined between those two stops in transfers ?"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [],
- "source": [
- "transfers = pkload(\"../data/transfer_df_cyril.pkl\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 58,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " index | \n",
- " stop_id_general | \n",
- " stop_int | \n",
- " stop_lat_first | \n",
- " stop_lon_first | \n",
- " stop_name_first | \n",
- " stop_id_general_2 | \n",
- " stop_int_2 | \n",
- " stop_lat_first_2 | \n",
- " stop_lon_first_2 | \n",
- " stop_name_first_2 | \n",
- " distance | \n",
- " walking_time | \n",
- " monotonically_increasing_id | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 1300 | \n",
- " 3887 | \n",
- " 8502221 | \n",
- " 276 | \n",
- " 47.357557 | \n",
- " 8.437543 | \n",
- " Birmensdorf ZH | \n",
- " 8573718 | \n",
- " 473 | \n",
- " 47.357125 | \n",
- " 8.438801 | \n",
- " Birmensdorf ZH, Bahnhof | \n",
- " 0.106219 | \n",
- " 127 | \n",
- " 352187318287 | \n",
- "
\n",
- " \n",
- " 1301 | \n",
- " 3888 | \n",
- " 8502221 | \n",
- " 276 | \n",
- " 47.357557 | \n",
- " 8.437543 | \n",
- " Birmensdorf ZH | \n",
- " 8502950 | \n",
- " 877 | \n",
- " 47.353936 | \n",
- " 8.437175 | \n",
- " Birmensdorf ZH, Zentrum | \n",
- " 0.403584 | \n",
- " 484 | \n",
- " 352187318288 | \n",
- "
\n",
- " \n",
- " 1302 | \n",
- " 3889 | \n",
- " 8502221 | \n",
- " 276 | \n",
- " 47.357557 | \n",
- " 8.437543 | \n",
- " Birmensdorf ZH | \n",
- " 8583870 | \n",
- " 1154 | \n",
- " 47.357234 | \n",
- " 8.437013 | \n",
- " Birmensdorf ZH, Bahnhof Süd | \n",
- " 0.053666 | \n",
- " 64 | \n",
- " 352187318289 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " index stop_id_general stop_int stop_lat_first stop_lon_first \\\n",
- "1300 3887 8502221 276 47.357557 8.437543 \n",
- "1301 3888 8502221 276 47.357557 8.437543 \n",
- "1302 3889 8502221 276 47.357557 8.437543 \n",
- "\n",
- " stop_name_first stop_id_general_2 stop_int_2 stop_lat_first_2 \\\n",
- "1300 Birmensdorf ZH 8573718 473 47.357125 \n",
- "1301 Birmensdorf ZH 8502950 877 47.353936 \n",
- "1302 Birmensdorf ZH 8583870 1154 47.357234 \n",
- "\n",
- " stop_lon_first_2 stop_name_first_2 distance walking_time \\\n",
- "1300 8.438801 Birmensdorf ZH, Bahnhof 0.106219 127 \n",
- "1301 8.437175 Birmensdorf ZH, Zentrum 0.403584 484 \n",
- "1302 8.437013 Birmensdorf ZH, Bahnhof Süd 0.053666 64 \n",
- "\n",
- " monotonically_increasing_id \n",
- "1300 352187318287 \n",
- "1301 352187318288 \n",
- "1302 352187318289 "
- ]
- },
- "execution_count": 58,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# transfers from Birmensdorf ZH\n",
- "transfers[transfers['stop_id_general']==8502221]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 60,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Transfers from Dallikon Industrie"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 59,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " index | \n",
- " stop_id_general | \n",
- " stop_int | \n",
- " stop_lat_first | \n",
- " stop_lon_first | \n",
- " stop_name_first | \n",
- " stop_id_general_2 | \n",
- " stop_int_2 | \n",
- " stop_lat_first_2 | \n",
- " stop_lon_first_2 | \n",
- " stop_name_first_2 | \n",
- " distance | \n",
- " walking_time | \n",
- " monotonically_increasing_id | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- "Empty DataFrame\n",
- "Columns: [index, stop_id_general, stop_int, stop_lat_first, stop_lon_first, stop_name_first, stop_id_general_2, stop_int_2, stop_lat_first_2, stop_lon_first_2, stop_name_first_2, distance, walking_time, monotonically_increasing_id]\n",
- "Index: []"
- ]
- },
- "execution_count": 59,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "transfers[transfers['stop_id_general']==8576276]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "There are no footpaths between Birmensdorf ZH and Dallikon Industrie, and interestingly, Dallikon Industrie has no footpaths at all. That points to a bug with the gestion of 'None' pointers in the array stops for footpaths"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [],
- "source": [
- "transfer_array = pkload(\"../data/transfer_array_cyril.pkl\")\n",
- "stops = pkload(\"../data/stops_array_cyril.pkl\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 64,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[3075, 1297.0],\n",
- " [3119, 1300.0],\n",
- " [3131, 1303.0]], dtype=object)"
- ]
- },
- "execution_count": 64,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# transfers from Birmensdorf ZH\n",
- "stop_int = 276\n",
- "stops[stop_int-1:stop_int+2]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 65,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[ 473, 127],\n",
- " [ 877, 484],\n",
- " [1154, 64]])"
- ]
- },
- "execution_count": 65,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "transfer_array[int(stops[stop_int][1]):int(stops[stop_int+1][1])]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "**The first entry of transfer_array[276] corresponding to Birmensdorf ZH is 473, which is one less than 474, the stop_id of Dallikon Industrie**. That may be a clue to what's going wrong with the algorithm."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 66,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[5408, 2157.0],\n",
- " [5417, nan],\n",
- " [5420, 2160.0]], dtype=object)"
- ]
- },
- "execution_count": 66,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# transfers from Dallikon Industrie\n",
- "stop_int = 474\n",
- "stops[stop_int-1:stop_int+2]\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 69,
- "metadata": {},
- "outputs": [
- {
- "ename": "ValueError",
- "evalue": "cannot convert float NaN to integer",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
- "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Trying to access the transfer for the nan pointer (SHOULD FAIL)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mtransfer_array\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstops\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstop_int\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstops\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstop_int\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
- "\u001b[0;31mValueError\u001b[0m: cannot convert float NaN to integer"
- ]
- }
- ],
- "source": [
- "# Trying to access the transfer for the nan pointer (SHOULD FAIL)\n",
- "transfer_array[int(stops[stop_int][1]):int(stops[stop_int+1][1])]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 70,
- "metadata": {},
- "outputs": [],
- "source": [
- "# checking out what's around the nan pointer:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 71,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[ 276, 127],\n",
- " [ 877, 450],\n",
- " [1154, 162]])"
- ]
- },
- "execution_count": 71,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "transfer_array[int(stops[stop_int-1][1]):int(stops[stop_int+1][1])]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The first entry of transfer_array[stop_int-1] is 276, the stop_int of Birmensdorf ZH."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## recapitulating the bug with another journey\n",
- "\n",
- "Searching for journeys from Dübendorf, Branzenäsch (stop 8590551) to Kloten, Weinbergstrasse (stop 8579967) with arrival at 17:30 gives an impossible first walk from Dübendorf, Branzenäsch to Uetliberg (stop 8503057)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 72,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " index | \n",
- " route_id | \n",
- " stop_id_general | \n",
- " trip_id | \n",
- " stop_id | \n",
- " arrival_time | \n",
- " departure_time | \n",
- " stop_sequence | \n",
- " stop_name | \n",
- " stop_lat | \n",
- " stop_lon | \n",
- " trip_headsign | \n",
- " trip_short_name | \n",
- " direction_id | \n",
- " departure_first_stop | \n",
- " route_int | \n",
- " stop_count | \n",
- " stop_int | \n",
- " route_desc | \n",
- " monotonically_increasing_id | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 95873 | \n",
- " 110350 | \n",
- " 26-752-j19-1 | \n",
- " 8590551 | \n",
- " 190.TA.26-752-j19-1.4.R | \n",
- " 8590551 | \n",
- " 2020-05-24 07:16:00 | \n",
- " 2020-05-24 07:16:00 | \n",
- " 11 | \n",
- " Dübendorf, Branzenäsch | \n",
- " 47.394665 | \n",
- " 8.631157 | \n",
- " Dübendorf, Kunsteisbahn | \n",
- " 1420 | \n",
- " 1 | \n",
- " 07:06:00 | \n",
- " 563 | \n",
- " 15 | \n",
- " 503 | \n",
- " Bus | \n",
- " 627065226053 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " index route_id stop_id_general trip_id \\\n",
- "95873 110350 26-752-j19-1 8590551 190.TA.26-752-j19-1.4.R \n",
- "\n",
- " stop_id arrival_time departure_time stop_sequence \\\n",
- "95873 8590551 2020-05-24 07:16:00 2020-05-24 07:16:00 11 \n",
- "\n",
- " stop_name stop_lat stop_lon trip_headsign \\\n",
- "95873 Dübendorf, Branzenäsch 47.394665 8.631157 Dübendorf, Kunsteisbahn \n",
- "\n",
- " trip_short_name direction_id departure_first_stop route_int \\\n",
- "95873 1420 1 07:06:00 563 \n",
- "\n",
- " stop_count stop_int route_desc monotonically_increasing_id \n",
- "95873 15 503 Bus 627065226053 "
- ]
- },
- "execution_count": 72,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Dübendorf, Branzenäsch\n",
- "stop_times[stop_times['stop_id_general']== 8590551].head(1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 73,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " index | \n",
- " route_id | \n",
- " stop_id_general | \n",
- " trip_id | \n",
- " stop_id | \n",
- " arrival_time | \n",
- " departure_time | \n",
- " stop_sequence | \n",
- " stop_name | \n",
- " stop_lat | \n",
- " stop_lon | \n",
- " trip_headsign | \n",
- " trip_short_name | \n",
- " direction_id | \n",
- " departure_first_stop | \n",
- " route_int | \n",
- " stop_count | \n",
- " stop_int | \n",
- " route_desc | \n",
- " monotonically_increasing_id | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 5304 | \n",
- " 51749 | \n",
- " 26-10-B-j19-1 | \n",
- " 8503057 | \n",
- " 181.TA.26-10-B-j19-1.9.H | \n",
- " 8503057 | \n",
- " 2020-05-24 07:14:00 | \n",
- " NaT | \n",
- " 8 | \n",
- " Uetliberg | \n",
- " 47.352366 | \n",
- " 8.487651 | \n",
- " Uetliberg | \n",
- " 23511 | \n",
- " 0 | \n",
- " 07:03:00 | \n",
- " 62 | \n",
- " 4 | \n",
- " 415 | \n",
- " S-Bahn | \n",
- " 25769805095 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " index route_id stop_id_general trip_id \\\n",
- "5304 51749 26-10-B-j19-1 8503057 181.TA.26-10-B-j19-1.9.H \n",
- "\n",
- " stop_id arrival_time departure_time stop_sequence stop_name \\\n",
- "5304 8503057 2020-05-24 07:14:00 NaT 8 Uetliberg \n",
- "\n",
- " stop_lat stop_lon trip_headsign trip_short_name direction_id \\\n",
- "5304 47.352366 8.487651 Uetliberg 23511 0 \n",
- "\n",
- " departure_first_stop route_int stop_count stop_int route_desc \\\n",
- "5304 07:03:00 62 4 415 S-Bahn \n",
- "\n",
- " monotonically_increasing_id \n",
- "5304 25769805095 "
- ]
- },
- "execution_count": 73,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Uetliberg (stop 8503057)\n",
- "stop_times[stop_times['stop_id_general']== 8503057].head(1)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "A google maps search for footpaths between the coordinates yields a 14.5 km walk."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 74,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " index | \n",
- " stop_id_general | \n",
- " stop_int | \n",
- " stop_lat_first | \n",
- " stop_lon_first | \n",
- " stop_name_first | \n",
- " stop_id_general_2 | \n",
- " stop_int_2 | \n",
- " stop_lat_first_2 | \n",
- " stop_lon_first_2 | \n",
- " stop_name_first_2 | \n",
- " distance | \n",
- " walking_time | \n",
- " monotonically_increasing_id | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 2256 | \n",
- " 2938 | \n",
- " 8590551 | \n",
- " 503 | \n",
- " 47.394665 | \n",
- " 8.631157 | \n",
- " Dübendorf, Branzenäsch | \n",
- " 8590587 | \n",
- " 414 | \n",
- " 47.393053 | \n",
- " 8.633367 | \n",
- " Dübendorf, Sonnenberg | \n",
- " 0.244501 | \n",
- " 293 | \n",
- " 618475290624 | \n",
- "
\n",
- " \n",
- " 2257 | \n",
- " 2939 | \n",
- " 8590551 | \n",
- " 503 | \n",
- " 47.394665 | \n",
- " 8.631157 | \n",
- " Dübendorf, Branzenäsch | \n",
- " 8590555 | \n",
- " 599 | \n",
- " 47.394877 | \n",
- " 8.628714 | \n",
- " Dübendorf, Claridenstrasse | \n",
- " 0.185439 | \n",
- " 222 | \n",
- " 618475290625 | \n",
- "
\n",
- " \n",
- " 2258 | \n",
- " 2940 | \n",
- " 8590551 | \n",
- " 503 | \n",
- " 47.394665 | \n",
- " 8.631157 | \n",
- " Dübendorf, Branzenäsch | \n",
- " 8590581 | \n",
- " 886 | \n",
- " 47.391557 | \n",
- " 8.634876 | \n",
- " Dübendorf, Raubbühl | \n",
- " 0.444715 | \n",
- " 533 | \n",
- " 618475290626 | \n",
- "
\n",
- " \n",
- " 2259 | \n",
- " 2941 | \n",
- " 8590551 | \n",
- " 503 | \n",
- " 47.394665 | \n",
- " 8.631157 | \n",
- " Dübendorf, Branzenäsch | \n",
- " 8590550 | \n",
- " 931 | \n",
- " 47.397048 | \n",
- " 8.625614 | \n",
- " Dübendorf, Bettlistrasse | \n",
- " 0.494279 | \n",
- " 593 | \n",
- " 618475290627 | \n",
- "
\n",
- " \n",
- " 2260 | \n",
- " 2942 | \n",
- " 8590551 | \n",
- " 503 | \n",
- " 47.394665 | \n",
- " 8.631157 | \n",
- " Dübendorf, Branzenäsch | \n",
- " 8590590 | \n",
- " 1226 | \n",
- " 47.390535 | \n",
- " 8.632469 | \n",
- " Dübendorf, Sunnhalde | \n",
- " 0.469644 | \n",
- " 563 | \n",
- " 618475290628 | \n",
- "
\n",
- " \n",
- " 2261 | \n",
- " 2943 | \n",
- " 8590551 | \n",
- " 503 | \n",
- " 47.394665 | \n",
- " 8.631157 | \n",
- " Dübendorf, Branzenäsch | \n",
- " 8590562 | \n",
- " 1244 | \n",
- " 47.396240 | \n",
- " 8.629935 | \n",
- " Dübendorf, Glärnischstrasse | \n",
- " 0.197806 | \n",
- " 237 | \n",
- " 618475290629 | \n",
- "
\n",
- " \n",
- " 2262 | \n",
- " 2944 | \n",
- " 8590551 | \n",
- " 503 | \n",
- " 47.394665 | \n",
- " 8.631157 | \n",
- " Dübendorf, Branzenäsch | \n",
- " 8590549 | \n",
- " 1392 | \n",
- " 47.394817 | \n",
- " 8.634580 | \n",
- " Dübendorf, Bauhof | \n",
- " 0.258181 | \n",
- " 309 | \n",
- " 618475290630 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " index stop_id_general stop_int stop_lat_first stop_lon_first \\\n",
- "2256 2938 8590551 503 47.394665 8.631157 \n",
- "2257 2939 8590551 503 47.394665 8.631157 \n",
- "2258 2940 8590551 503 47.394665 8.631157 \n",
- "2259 2941 8590551 503 47.394665 8.631157 \n",
- "2260 2942 8590551 503 47.394665 8.631157 \n",
- "2261 2943 8590551 503 47.394665 8.631157 \n",
- "2262 2944 8590551 503 47.394665 8.631157 \n",
- "\n",
- " stop_name_first stop_id_general_2 stop_int_2 stop_lat_first_2 \\\n",
- "2256 Dübendorf, Branzenäsch 8590587 414 47.393053 \n",
- "2257 Dübendorf, Branzenäsch 8590555 599 47.394877 \n",
- "2258 Dübendorf, Branzenäsch 8590581 886 47.391557 \n",
- "2259 Dübendorf, Branzenäsch 8590550 931 47.397048 \n",
- "2260 Dübendorf, Branzenäsch 8590590 1226 47.390535 \n",
- "2261 Dübendorf, Branzenäsch 8590562 1244 47.396240 \n",
- "2262 Dübendorf, Branzenäsch 8590549 1392 47.394817 \n",
- "\n",
- " stop_lon_first_2 stop_name_first_2 distance walking_time \\\n",
- "2256 8.633367 Dübendorf, Sonnenberg 0.244501 293 \n",
- "2257 8.628714 Dübendorf, Claridenstrasse 0.185439 222 \n",
- "2258 8.634876 Dübendorf, Raubbühl 0.444715 533 \n",
- "2259 8.625614 Dübendorf, Bettlistrasse 0.494279 593 \n",
- "2260 8.632469 Dübendorf, Sunnhalde 0.469644 563 \n",
- "2261 8.629935 Dübendorf, Glärnischstrasse 0.197806 237 \n",
- "2262 8.634580 Dübendorf, Bauhof 0.258181 309 \n",
- "\n",
- " monotonically_increasing_id \n",
- "2256 618475290624 \n",
- "2257 618475290625 \n",
- "2258 618475290626 \n",
- "2259 618475290627 \n",
- "2260 618475290628 \n",
- "2261 618475290629 \n",
- "2262 618475290630 "
- ]
- },
- "execution_count": 74,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# footpaths from Dübendorf, Branzenäsch\n",
- "transfers[transfers['stop_id_general']== 8590551]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "No sign of Uetliberg here."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 75,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " index | \n",
- " stop_id_general | \n",
- " stop_int | \n",
- " stop_lat_first | \n",
- " stop_lon_first | \n",
- " stop_name_first | \n",
- " stop_id_general_2 | \n",
- " stop_int_2 | \n",
- " stop_lat_first_2 | \n",
- " stop_lon_first_2 | \n",
- " stop_name_first_2 | \n",
- " distance | \n",
- " walking_time | \n",
- " monotonically_increasing_id | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- "Empty DataFrame\n",
- "Columns: [index, stop_id_general, stop_int, stop_lat_first, stop_lon_first, stop_name_first, stop_id_general_2, stop_int_2, stop_lat_first_2, stop_lon_first_2, stop_name_first_2, distance, walking_time, monotonically_increasing_id]\n",
- "Index: []"
- ]
- },
- "execution_count": 75,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# transfers from Uetliberg (stop 8503057)\n",
- "transfers[transfers['stop_id_general']== 8503057]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Again, utliberg has no footpaths, like Dallikon Industrie in the previous bug."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 79,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[5599, 2253.0],\n",
- " [5603, 2256.0],\n",
- " [5606, 2263.0]], dtype=object)"
- ]
- },
- "execution_count": 79,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# footpaths from Dübendorf, Branzenäsch\n",
- "stop_int = 503\n",
- "stops[stop_int-1:stop_int+2]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 80,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[ 414, 293],\n",
- " [ 599, 222],\n",
- " [ 886, 533],\n",
- " [ 931, 593],\n",
- " [1226, 563],\n",
- " [1244, 237],\n",
- " [1392, 309]])"
- ]
- },
- "execution_count": 80,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "transfer_array[int(stops[stop_int][1]):int(stops[stop_int+1][1])]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Notice here again that the first transfer from Dübendorf, Branzenäsch goes to stop_int= 414, which is one less than the stop_int of Utliberg (415)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 81,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[4850, 1901.0],\n",
- " [4853, nan],\n",
- " [4860, 1907.0]], dtype=object)"
- ]
- },
- "execution_count": 81,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# footpahts from Utliberg:\n",
- "stop_int = 415\n",
- "stops[stop_int-1:stop_int+2]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 83,
- "metadata": {},
- "outputs": [
- {
- "ename": "ValueError",
- "evalue": "cannot convert float NaN to integer",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
- "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# expected to fail: accessing a none pointer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mtransfer_array\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstops\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstop_int\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstops\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstop_int\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
- "\u001b[0;31mValueError\u001b[0m: cannot convert float NaN to integer"
- ]
- }
- ],
- "source": [
- "# expected to fail: accessing a none pointer\n",
- "transfer_array[int(stops[stop_int][1]):int(stops[stop_int+1][1])]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 84,
- "metadata": {},
- "outputs": [
- {
- "ename": "ValueError",
- "evalue": "cannot convert float NaN to integer",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
- "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtransfer_array\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstops\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstop_int\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstops\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstop_int\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
- "\u001b[0;31mValueError\u001b[0m: cannot convert float NaN to integer"
- ]
- }
- ],
- "source": [
- "transfer_array[int(stops[stop_int][1]):int(stops[stop_int+1][1])]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 85,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[ 503, 293],\n",
- " [ 599, 485],\n",
- " [ 886, 241],\n",
- " [1226, 345],\n",
- " [1244, 526],\n",
- " [1392, 259]])"
- ]
- },
- "execution_count": 85,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "transfer_array[int(stops[stop_int-1][1]):int(stops[stop_int+1][1])]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Again, the first entry of transfer_array[stop_int-1] is 503, the stop_int of Dübendorf, Branzenäsch !"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Testing Felix's stop conversion:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([False])"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "np.isnan(np.arange(1))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[ 0 75]\n",
- "[0 0]\n",
- "(1407, 2)\n",
- "[ 0 0 0 75]\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "array([[ 0, 11, 0, 2],\n",
- " [ 11, 20, 2, 7],\n",
- " [ 20, 38, 7, 22],\n",
- " ...,\n",
- " [15303, 15334, 6242, 6250],\n",
- " [15334, 15339, 6250, 6257],\n",
- " [15339, 15344, 6257, 6264]], dtype=uint32)"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "stops = pkload(\"../data/stops_array_cyril.pkl\")\n",
- "stopRoutes = pkload(\"../data/stop_routes_array_cyril.pkl\")\n",
- "print(np.isnan(stops.astype(np.float64)).sum(axis=0))\n",
- "print(np.equal(stops, None).sum(axis=0))\n",
- "print(stops.shape)\n",
- "stops = stops[:,[0,0,1,1]]\n",
- "# Make column 1 contain the start_index of the next stop in stopRoutes\n",
- "stops[:-1,1] = stops[1:,0]\n",
- "stops[-1, 1] = stopRoutes.shape[0]\n",
- "# Make column 3 contain the start_index of the next stop in stopRoutes\n",
- "if np.isnan(stops[-1,2]).item():\n",
- " stops[-1,2] = transfers.shape[0]\n",
- "for i in np.isnan(stops[:-1,2].astype(np.float64)).nonzero()[0][::-1]:\n",
- " stops[i,2] = stops[i+1,2]\n",
- "print(np.isnan(stops.astype(np.float64)).sum(axis=0))\n",
- "stops[:-1,3] = stops[1:,2]\n",
- "stops[-1, 3] = transfers.shape[0]\n",
- "# Convert to int\n",
- "stops = stops.astype(np.uint32)\n",
- "stops"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[5599, 5603, 2253, 2256],\n",
- " [5603, 5606, 2256, 2263],\n",
- " [5606, 5615, 2263, 2266]], dtype=uint32)"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# footpaths from Dübendorf, Branzenäsch\n",
- "stop_int = 503\n",
- "stops[stop_int-1:stop_int+2]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[ 414, 293],\n",
- " [ 599, 222],\n",
- " [ 886, 533],\n",
- " [ 931, 593],\n",
- " [1226, 563],\n",
- " [1244, 237],\n",
- " [1392, 309]])"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "transfer_array[stops[stop_int][2]:stops[stop_int][3]]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[4850, 4853, 1901, 1907],\n",
- " [4853, 4860, 1907, 1907],\n",
- " [4860, 4885, 1907, 1914]], dtype=uint32)"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# footpaths from utliberg\n",
- "# footpahts from Utliberg:\n",
- "stop_int = 415\n",
- "stops[stop_int-1:stop_int+2]\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "There's the bug ! The pointers for transfer_array in the first entry shows 1901, 1901 when it should in fact be the second entry that shows 1901, 1901. The first entry should show (1901, 1907) (see cell nr 81)."
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.6"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/notebooks/old_notebooks/raptor_toy_example_2020_05_21.ipynb b/notebooks/old_notebooks/raptor_toy_example_2020_05_21.ipynb
deleted file mode 100644
index 0be9aca..0000000
--- a/notebooks/old_notebooks/raptor_toy_example_2020_05_21.ipynb
+++ /dev/null
@@ -1,1916 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Coding a RAPTOR toy example\n",
- "\n",
- "## Goal\n",
- "\n",
- "Learn the RAPTOR algorithm by coding it for a toy example with the data structures advised in the paper. We code RAPTOR for a super simple toy example with only two routes and two trips each.\n",
- "\n",
- "## Toy example\n",
- "- TODO updates:\n",
- " - additional route r2 that goes from A to E slowly\n",
- " - walking paths\n",
- "![toy_example](img/RAPTOR_example.png) \n",
- "\n",
- "## Encoding the data structures\n",
- "### General considerations\n",
- "We adhere to the data structures proposed by Delling et al. These structures aim to minimize read times in memory by making use of consecutive in-memory adresses. Thus, structures with varying dimensions (e.g dataframes, python lists) are excluded. We illustrate the difficulty with an example. \n",
- "\n",
- "Each route has a potentially unique number of stops. Therefore, we cannot store stops in a 2D array of routes by stops, as the number of stops is not the same for each route. We adress this problem by storing stops consecutively by route, and keeping track of the index of the first stop for each route.\n",
- "\n",
- "This general strategy is applied to all the required data structures.\n",
- "\n",
- "### routes\n",
- "The `routes` array will contain arrays `[n_trips, n_stops, pt_1st_stop, pt_1st_trip]` where all four values are `int`. To avoid overcomplicating things and try to mimic pointers in python, `pt_1st_stop` and `pt_1st_trip` contain integer indices."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "metadata": {
- "lines_to_next_cell": 0
- },
- "outputs": [],
- "source": [
- "import numpy as np\n",
- "routes = np.array([[2, 3, 0, 0], #r0\n",
- " [2, 3, 3, 6], #r1\n",
- " [2, 2, 6, 12]]) # r2"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### routeStops\n",
- "`routeStops` is an array that contains the ordered lists of stops for each route. `pt_1st_stop` in `routes` is required to get to the first stop of the route. is itself an array that contains the sequence of stops for route $r_i$."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 36,
- "metadata": {
- "lines_to_next_cell": 0
- },
- "outputs": [],
- "source": [
- "routeStops = np.array([0, 1, 2, # A, B, C\n",
- " 3, 2, 4, # D, C, E\n",
- " 0, 4]) # A, E"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### stopTimes\n",
- "\n",
- "The i-th entry in the `stopTimes` array is itself an array which contains the arrival and departure time at a particular stop for a particular trip. `stopTimes` is sorted by routes, and then by trips. We retrieve the index of the first (earliest) trip of the route with the pointer `pt_1st_trip` stored in `routes`. We may use the built-in `numpy` [date and time data structures](https://blog.finxter.com/how-to-work-with-dates-and-times-in-python/). In short, declaring dates and times is done like this: `np.datetime64('YYYY-MM-DDThh:mm')`. Entries with a `NaT` arrival or departure times correspond to beginning and end of trips respectively.\n",
- "\n",
- "Note that trips are indexed implicitely in stopTimes, but we decided to change a little bit from the paper and index them according to their parent route instead of giving them an absolute index. It makes things a bit easier when coding the algorithm."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 37,
- "metadata": {},
- "outputs": [],
- "source": [
- "stopTimes = np.array([\n",
- " # r0, t0\n",
- " [None, '2020-05-11T08:00'],\n",
- " ['2020-05-11T08:25', '2020-05-11T08:30'],\n",
- " ['2020-05-11T08:55', None],\n",
- "\n",
- " # ro, t1\n",
- " [None, '2020-05-11T08:10'],\n",
- " ['2020-05-11T08:35', '2020-05-11T08:40'],\n",
- " ['2020-05-11T09:05', None],\n",
- " \n",
- " # r1, t0 \n",
- " [None, '2020-05-11T08:00'],\n",
- " ['2020-05-11T08:05', '2020-05-11T08:10'],\n",
- " ['2020-05-11T08:15', None],\n",
- "\n",
- " # r1, t1\n",
- " [None, '2020-05-11T09:00'],\n",
- " ['2020-05-11T09:05', '2020-05-11T09:10'],\n",
- " ['2020-05-11T09:15', None],\n",
- " \n",
- " #r2, t0\n",
- " [None, '2020-05-11T08:20'],\n",
- " ['2020-05-11T09:20', None],\n",
- " \n",
- " #r2, t1\n",
- " [None, '2020-05-11T08:30'],\n",
- " ['2020-05-11T09:30', None]],\n",
- " dtype='datetime64')"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "`NaT` is the `None` equivalent for `numpy datetime64`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([ True, False])"
- ]
- },
- "execution_count": 38,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "np.isnat(stopTimes[0])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### stopRoutes\n",
- "\n",
- "`stopRoutes` contains the routes associated with each stop. We need the pointer in `stops` to index `stopRoutes` correctly."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "metadata": {},
- "outputs": [],
- "source": [
- "stopRoutes = np.array([0, 2, # A\n",
- " 0, # B\n",
- " 0,1, # C\n",
- " 1, # D\n",
- " 1, 2]) # E"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We should also build an array for transfer times (including walking times), but for now let's ignore this additional complexity. Finally, the i-th entry in the `stops` array points to the first entry in `stopRoutes` (and `transfers` when that will be tried) associated with stop $p_i$"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 40,
- "metadata": {},
- "outputs": [],
- "source": [
- "stops = np.array([[0, None],# A\n",
- " [2, None], # B\n",
- " [3, None],# C\n",
- " [5, None], # D\n",
- " [6, None], # E\n",
- " [len(stopRoutes), None]]) # fictive stop to account for length of E"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Coding the standard RAPTOR\n",
- "\n",
- "Below, we code RAPTOR as it is described in the paper, with all optimizations. That corresponds to the pseudocode block in the article. It solves the earliest arrival time problem: we enter an start stop, a target stop and a departure time and it finds the earliest arrival time in k rounds (i.e taking at most k transports). Note that walking between stops is not considered a transport."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 41,
- "metadata": {},
- "outputs": [],
- "source": [
- "p_s = 0 # start stop = A\n",
- "p_t = 4 # target stop = E\n",
- "tau_0 = np.datetime64('2020-05-11T08:05') # departure time 08:05\n",
- "k_max = 10 # we set a maximum number of transports to pre-allocate memory for the numpy array tau_i"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 42,
- "metadata": {},
- "outputs": [],
- "source": [
- "def raptor_standard(p_s, p_t, tau_0, routes, routeStops, stopTimes, stopRoutes, stops,\n",
- " k_max=10):\n",
- " \n",
- " #******************************************initialization******************************************\n",
- " n_stops = len(stops)-1 # to remove the fictive stop to account for all the routes belonging to the last stop\n",
- "\n",
- " # earliest arrival time at each stop for each round.\n",
- " tau = np.full(shape=(k_max, n_stops), fill_value = np.datetime64('2100-01-01T00:00')) # 2100 instead of infinity # number of stops * max number of transports\n",
- "\n",
- " # earliest arrival time at each stop, indep. of round\n",
- " tau_star = np.full(shape=n_stops, fill_value = np.datetime64('2100-01-01T00:00'))\n",
- "\n",
- " # to backtrack the journey of TRANSPORTS once it is finished\n",
- " #[route, trip, boarding stop, exit stop]\n",
- " # we will keep [r, t, p_b, p_e, p_f1, pf2, t_w] i.e \n",
- " # [route, trip (offset by route, not absolute), boarding stop, exit stop, beginning stop of the walk, target stop of the walk, time walked]\n",
- " journey = np.full(shape=(k_max, n_stops, 7), fill_value = -1, dtype=int)\n",
- " \n",
- " marked = [p_s]\n",
- " q = []\n",
- " tau[0, p_s] = tau_0\n",
- " \n",
- " #Maybe TODO (but not in original raptor): footpaths from the departure stop\n",
- "\n",
- " #****************************************** main loop******************************************\n",
- " for k in np.arange(1, k_max+1):\n",
- " print('\\n******************************STARTING round k={}******************************'.format(k))\n",
- " # accumulate routes serving marked stops from previous rounds\n",
- " q = []\n",
- " marked = list(set(marked)) # removing potential duplicate stops in marked due to walking paths\n",
- " print('Marked stops at the start of the round: {}'.format(marked))\n",
- " for p in marked:\n",
- " for r in stopRoutes[stops[p][0]:stops[p+1][0]]: # foreach route r serving p\n",
- " print('Route considered for the queue: ({0}, {1})'.format(r, p))\n",
- " inQueue = False\n",
- " for idx, (rPrime, pPrime) in enumerate(q): \n",
- " # is there already another stop from the same route in q ?\n",
- " if (rPrime == r): \n",
- " # is there already a later stop from the same route in q ?\n",
- " if(np.where(routeStops[routes[r][2]:routes[r][2]+routes[r][1]] == pPrime)[0][0] >\\\n",
- " np.where(routeStops[routes[r][2]:routes[r][2]+routes[r][1]] == p)[0][0]):\n",
- " #in that case, replace the later stop pPrime by stop p in q\n",
- " q[idx] = (r, p)\n",
- " inQueue = True\n",
- " # is there already an earlier stop from the same route in q ?\n",
- " else:\n",
- " # in that case, do not add p to the q.\n",
- " inQueue=True\n",
- " if not inQueue:\n",
- " q.append((r, p))\n",
- "\n",
- " marked = [] # unmarking all stops\n",
- "\n",
- " print('Queue before traversing each route: {}'.format(q))\n",
- " # traverse each route\n",
- " for (r, p) in q:\n",
- " print('\\n****TRAVERSING ROUTE r={0} from stop p={1}****'.format(r, p))\n",
- " # t is the t-th trip in route r, not the t-th trip in all trips. This makes things easier\n",
- " t = None\n",
- " # we will keep [r, t, p_b, p_e, p_f, t_w] i.e \n",
- " # [route, trip (offset by route, not absolute), boarding stop, exit stop, target stop of the walk, time walked]\n",
- " t_journey = np.empty(4, dtype=int)# contains tripID, board and exit stops to backtrack the journey\n",
- "\n",
- "\n",
- " # we only traverse the route starting at p, not from the beginning of the route\n",
- " for p_i in routeStops[routes[r][2]+np.where(routeStops[routes[r][2]:routes[r][2]+routes[r][1]] == p)[0][0]:\\\n",
- " routes[r][2]+routes[r][1]]:\n",
- " print(\"p_i: {}\".format(p_i))\n",
- "\n",
- " if (t is not None):\n",
- " # 1st trip of route + \n",
- " # offset for the right trip + \n",
- " # offset for the right stop\n",
- " arr_t_p_i = stopTimes[routes[r][3] + \\\n",
- " t * routes[r][1] + \\\n",
- " np.where(routeStops[routes[r][2]:routes[r][2]+routes[r][1]] == p_i)[0][0]][0]\n",
- " print(\"arr_t_p_i: {}\".format(arr_t_p_i))\n",
- "\n",
- " if arr_t_p_i < min(tau_star[p_i], tau_star[p_t]):\n",
- " tau[k][p_i] = arr_t_p_i\n",
- " tau_star[p_i] = arr_t_p_i\n",
- " marked.append(p_i)\n",
- " # keep a trace that we went down the trip taken before at this stop\n",
- " t_journey[3] = p_i\n",
- " journey[k][p_i][0:4] = t_journey\n",
- " # Can we catch an earlier trip at p_i ?\n",
- " print('\\n----scanning departure times for route r={0} at stop p_i={1}----'.format(r, p_i))\n",
- " t_r = 0\n",
- " while True:\n",
- " t_r_dep = stopTimes[routes[r][3]+\\\n",
- " # offset corresponding to stop p_i in route r\n",
- " np.where(routeStops[routes[r][2]:routes[r][2]+routes[r][1]] == p_i)[0][0] + \\\n",
- " routes[r][1]*t_r][1]\n",
- "\n",
- " print(\"Earliest arrival time at previous step: tau[k-1][p_i]: {}\".format(tau[k-1][p_i]))\n",
- " print(\"Departure time considered: t_r_dep: {}\".format(t_r_dep))\n",
- " # We hop on the first trip that departs later than our arrival time at p_i in k-1 transports\n",
- " if t_r_dep > tau[k-1][p_i]:\n",
- " t = t_r\n",
- " print('\\n!!!!Hopped on route r={0}, trip t={1} at stop p_i={2}!!!!'.format(r, t, p_i))\n",
- "\n",
- " # here we probably need to save the trip and boarding stop (boarding time will not be useful)\n",
- " t_journey[0] = r\n",
- " t_journey[1] = t\n",
- " t_journey[2] = p_i\n",
- " break\n",
- " t_r += 1\n",
- "\n",
- " # we could not hop on any trip at this stop\n",
- " if t_r == routes[r][0]:\n",
- " break\n",
- " \n",
- " print('\\n****FOOTPATHS****')\n",
- " \n",
- " marked_footpaths = [] # storing marked stops for footpaths in a separate list to avoid inifinite loops\n",
- " for p in marked:\n",
- " if stops[p][1] is not None:\n",
- " print('checking walking paths from stop {}'.format(p))\n",
- " # making sure there are footpaths for that stop\n",
- " # finding the next stop where there are footpaths to find the next index\n",
- " next_stop = p\n",
- " next_stop_found = False\n",
- " while next_stop < len(stops)-1: #carefully check that's the correct version\n",
- " next_stop = next_stop+1\n",
- " if stops[next_stop][1] is not None:\n",
- " next_stop_found = True\n",
- " break\n",
- " \n",
- " # reinitializing next_stop to p in case no next stop with not 'None' stops[p][1] is found\n",
- " if not next_stop_found:\n",
- " next_stop = p+1 # this works because transfers[p:None] is equivalent to transfers[p:]\n",
- " \n",
- " \n",
- " for f in transfers[stops[p][1]:stops[next_stop][1]]:\n",
- " print(\"Considering footpaths from {} to {}\".format(p, f[0]))\n",
- " \n",
- " # we only consider footpaths if they strictly ameliorate the arrival time at the arrival stop of the path.\n",
- " if(tau[k][p]+np.timedelta64(f[1], 's') < min(tau_star[f[0]], tau_star[p_t])): \n",
- " print(\"Walking to {} is faster !\".format(f[0]))\n",
- " tau[k][f[0]] = tau[k][p]+np.timedelta64(f[1], 's')\n",
- " tau_star[f[0]] = tau[k][p]+np.timedelta64(f[1], 's')\n",
- " marked_footpaths.append(f[0])\n",
- " \n",
- " # keeping tracks of footpaths to backtrack the journey:\n",
- " # [departure stop, arrival stop, walking time]\n",
- " journey[k][f[0]][4:7] = [p, f[0], f[1]]\n",
- " \n",
- " marked.extend(marked_footpaths) # to avoid infinite loops if marked gets appended dynamically\n",
- " # stopping criterion: no stops were marked\n",
- " if not marked:\n",
- " break\n",
- " return(tau, tau_star, k, journey)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 43,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "******************************STARTING round k=1******************************\n",
- "Marked stops at the start of the round: [0]\n",
- "Route considered for the queue: (0, 0)\n",
- "Route considered for the queue: (2, 0)\n",
- "Queue before traversing each route: [(0, 0), (2, 0)]\n",
- "\n",
- "****TRAVERSING ROUTE r=0 from stop p=0****\n",
- "p_i: 0\n",
- "\n",
- "----scanning departure times for route r=0 at stop p_i=0----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T08:05\n",
- "Departure time considered: t_r_dep: 2020-05-11T08:00\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T08:05\n",
- "Departure time considered: t_r_dep: 2020-05-11T08:10\n",
- "\n",
- "!!!!Hopped on route r=0, trip t=1 at stop p_i=0!!!!\n",
- "p_i: 1\n",
- "arr_t_p_i: 2020-05-11T08:35\n",
- "\n",
- "----scanning departure times for route r=0 at stop p_i=1----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n",
- "Departure time considered: t_r_dep: 2020-05-11T08:30\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n",
- "Departure time considered: t_r_dep: 2020-05-11T08:40\n",
- "p_i: 2\n",
- "arr_t_p_i: 2020-05-11T09:05\n",
- "\n",
- "----scanning departure times for route r=0 at stop p_i=2----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n",
- "Departure time considered: t_r_dep: NaT\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n",
- "Departure time considered: t_r_dep: NaT\n",
- "\n",
- "****TRAVERSING ROUTE r=2 from stop p=0****\n",
- "p_i: 0\n",
- "\n",
- "----scanning departure times for route r=2 at stop p_i=0----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T08:05\n",
- "Departure time considered: t_r_dep: 2020-05-11T08:20\n",
- "\n",
- "!!!!Hopped on route r=2, trip t=0 at stop p_i=0!!!!\n",
- "p_i: 4\n",
- "arr_t_p_i: 2020-05-11T09:20\n",
- "\n",
- "----scanning departure times for route r=2 at stop p_i=4----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n",
- "Departure time considered: t_r_dep: NaT\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n",
- "Departure time considered: t_r_dep: NaT\n",
- "\n",
- "****FOOTPATHS****\n",
- "\n",
- "******************************STARTING round k=2******************************\n",
- "Marked stops at the start of the round: [1, 2, 4]\n",
- "Route considered for the queue: (0, 1)\n",
- "Route considered for the queue: (0, 2)\n",
- "Route considered for the queue: (1, 2)\n",
- "Route considered for the queue: (1, 4)\n",
- "Route considered for the queue: (2, 4)\n",
- "Queue before traversing each route: [(0, 1), (1, 2), (2, 4)]\n",
- "\n",
- "****TRAVERSING ROUTE r=0 from stop p=1****\n",
- "p_i: 1\n",
- "\n",
- "----scanning departure times for route r=0 at stop p_i=1----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T08:35\n",
- "Departure time considered: t_r_dep: 2020-05-11T08:30\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T08:35\n",
- "Departure time considered: t_r_dep: 2020-05-11T08:40\n",
- "\n",
- "!!!!Hopped on route r=0, trip t=1 at stop p_i=1!!!!\n",
- "p_i: 2\n",
- "arr_t_p_i: 2020-05-11T09:05\n",
- "\n",
- "----scanning departure times for route r=0 at stop p_i=2----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:05\n",
- "Departure time considered: t_r_dep: NaT\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:05\n",
- "Departure time considered: t_r_dep: NaT\n",
- "\n",
- "****TRAVERSING ROUTE r=1 from stop p=2****\n",
- "p_i: 2\n",
- "\n",
- "----scanning departure times for route r=1 at stop p_i=2----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:05\n",
- "Departure time considered: t_r_dep: 2020-05-11T08:10\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:05\n",
- "Departure time considered: t_r_dep: 2020-05-11T09:10\n",
- "\n",
- "!!!!Hopped on route r=1, trip t=1 at stop p_i=2!!!!\n",
- "p_i: 4\n",
- "arr_t_p_i: 2020-05-11T09:15\n",
- "\n",
- "----scanning departure times for route r=1 at stop p_i=4----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:20\n",
- "Departure time considered: t_r_dep: NaT\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:20\n",
- "Departure time considered: t_r_dep: NaT\n",
- "\n",
- "****TRAVERSING ROUTE r=2 from stop p=4****\n",
- "p_i: 4\n",
- "\n",
- "----scanning departure times for route r=2 at stop p_i=4----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:20\n",
- "Departure time considered: t_r_dep: NaT\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:20\n",
- "Departure time considered: t_r_dep: NaT\n",
- "\n",
- "****FOOTPATHS****\n",
- "\n",
- "******************************STARTING round k=3******************************\n",
- "Marked stops at the start of the round: [4]\n",
- "Route considered for the queue: (1, 4)\n",
- "Route considered for the queue: (2, 4)\n",
- "Queue before traversing each route: [(1, 4), (2, 4)]\n",
- "\n",
- "****TRAVERSING ROUTE r=1 from stop p=4****\n",
- "p_i: 4\n",
- "\n",
- "----scanning departure times for route r=1 at stop p_i=4----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:15\n",
- "Departure time considered: t_r_dep: NaT\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:15\n",
- "Departure time considered: t_r_dep: NaT\n",
- "\n",
- "****TRAVERSING ROUTE r=2 from stop p=4****\n",
- "p_i: 4\n",
- "\n",
- "----scanning departure times for route r=2 at stop p_i=4----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:15\n",
- "Departure time considered: t_r_dep: NaT\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:15\n",
- "Departure time considered: t_r_dep: NaT\n",
- "\n",
- "****FOOTPATHS****\n"
- ]
- }
- ],
- "source": [
- "tau, tau_star, k, journey = raptor_standard(p_s, p_t, tau_0, \n",
- " routes = routes, routeStops = routeStops, stopTimes = stopTimes, stopRoutes = stopRoutes, stops = stops)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 44,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array(['2100-01-01T00:00', '2020-05-11T08:35', '2020-05-11T09:05',\n",
- " '2100-01-01T00:00', '2020-05-11T09:15'], dtype='datetime64[m]')"
- ]
- },
- "execution_count": 44,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "tau_star"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 45,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([['2020-05-11T08:05', '2100-01-01T00:00', '2100-01-01T00:00',\n",
- " '2100-01-01T00:00', '2100-01-01T00:00'],\n",
- " ['2100-01-01T00:00', '2020-05-11T08:35', '2020-05-11T09:05',\n",
- " '2100-01-01T00:00', '2020-05-11T09:20'],\n",
- " ['2100-01-01T00:00', '2100-01-01T00:00', '2100-01-01T00:00',\n",
- " '2100-01-01T00:00', '2020-05-11T09:15'],\n",
- " ['2100-01-01T00:00', '2100-01-01T00:00', '2100-01-01T00:00',\n",
- " '2100-01-01T00:00', '2100-01-01T00:00']], dtype='datetime64[m]')"
- ]
- },
- "execution_count": 45,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "k_last = k\n",
- "\n",
- "tau[0:k_last+1]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "`journey` contains all the necessary information to backtrack from the solution to the actual journey in terms of sequence of transports.\n",
- "\n",
- "`journey` has dimensions `k` by `n_stops` by 4+3.\n",
- "- The 4 first values store the route and trip taken, the departure and arrival stops.\n",
- "- The 3 last values are used by footpaths. They contain the departure stop for the walk, the arrival stop for the walk and the walking time in seconds.\n",
- "\n",
- "When we hop on a trip, we store the trip (with the route) and the boarding and exit stops as the array `t_journey`: `(r, t, p_boarding, p_exit)`. At each stop `p_i` where we ameliorate the arrival time in round `k`, we store `t_journey` in the first 4 cells of `journey[k][p_i]`. `p_i` corresponds to the exit stop when backtracking.\n",
- "\n",
- "When walking to stop `p_i` is shorter, we store the departure, arrival stops and walking time in the 3 last cells of `journey[k][p_i]`.\n",
- "\n",
- "The end result is a `journey` array which contains -1 values in all seven cells in `journey[k][p_i]` if the arrival time at `p_i` was not ameliorated at step `k`. `journey[k][p_i]` where there are values other than -1 indicate that the arrival time was ameliorated either by walking or by taking a transport. "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 46,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[[-1, -1, -1, -1, -1, -1, -1],\n",
- " [-1, -1, -1, -1, -1, -1, -1],\n",
- " [-1, -1, -1, -1, -1, -1, -1],\n",
- " [-1, -1, -1, -1, -1, -1, -1],\n",
- " [-1, -1, -1, -1, -1, -1, -1]],\n",
- "\n",
- " [[-1, -1, -1, -1, -1, -1, -1],\n",
- " [ 0, 1, 0, 1, -1, -1, -1],\n",
- " [ 0, 1, 0, 2, -1, -1, -1],\n",
- " [-1, -1, -1, -1, -1, -1, -1],\n",
- " [ 2, 0, 0, 4, -1, -1, -1]],\n",
- "\n",
- " [[-1, -1, -1, -1, -1, -1, -1],\n",
- " [-1, -1, -1, -1, -1, -1, -1],\n",
- " [-1, -1, -1, -1, -1, -1, -1],\n",
- " [-1, -1, -1, -1, -1, -1, -1],\n",
- " [ 1, 1, 2, 4, -1, -1, -1]]])"
- ]
- },
- "execution_count": 46,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "journey[0:k_last]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Backtracking\n",
- "\n",
- "We reconstruct the actual journey from the `journey` array by backtracking from the arrival stop to the departure stop. At each round k where we notice that the arrival time for the target stop was ameliorated, we start a new leg corresponding to a journey reaching the target stop in k transports.\n",
- "\n",
- "When backtracking without footpaths, it is sufficient at each round k to check at which stop the trip at round k-1 began. \n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 47,
- "metadata": {},
- "outputs": [],
- "source": [
- "def backtrack_journey(k_last, p_t, journey):\n",
- " # journey_act = actual journey, will contain the sequence of transports in the correct order\n",
- " journey_act = [[] for k in range(0, k_last)] # there's maximum k routes to get to the final stop\n",
- " p_board = p_t\n",
- " n_legs = 1 # each leg is a journey to from the departure stop to the target stop in exactly k transports\n",
- " journey_found = False\n",
- "\n",
- " # iterating backwards in rounds from k_last -1 to 1\n",
- " for k in range(k_last-1, 0, -1): # second argument in range is not included in the boundaries\n",
- " # Was the tarrival time at the target stop ameliorated at round k ? \n",
- " if np.any(journey[k][p_t]!=np.array([-1, -1, -1, -1, -1, -1, -1])):\n",
- "\n",
- " # starting a new leg in the list of actual journeys\n",
- " journey_found = True\n",
- " # iterating from k to 0 to reconstruct the actual journey in k transports\n",
- " p_board = p_t\n",
- " for k_prime in range(k, 0, -1):\n",
- "\n",
- " # did we get to that stop by walking ?\n",
- " if journey[k_prime][p_board][5] !=-1:\n",
- "\n",
- " # we keep track of the stop to which we walked to as well as the departure stop of the walk\n",
- " stop_walk_dep = journey[k_prime][p_board][4]\n",
- " journey_act[k].append([journey[k_prime][stop_walk_dep], journey[k_prime][p_board]])\n",
- " p_board = journey[k_prime][stop_walk_dep][2]\n",
- "\n",
- " # we did not get to that stop by walking\n",
- " else:\n",
- "\n",
- " journey_act[k].append(journey[k_prime][p_board])\n",
- " p_board = journey[k_prime][p_board][2]\n",
- "\n",
- " # reversing the order of journey_act to get journeys from the start stop to the target stop\n",
- " journey_act = [j[::-1] for j in journey_act]\n",
- "\n",
- " # building a human readable output for the trip:\n",
- " for k, j in enumerate(journey_act):\n",
- "\n",
- " if j: # going only through non-empty journeys\n",
- " print('******************JOURNEY IN {} TRIPS******************'.format(k))\n",
- " print('raw representation of the journey in {} trips: {}'.format(k, j))\n",
- "\n",
- " for k_prime, t in enumerate(j):\n",
- " # We did not walk at step k\n",
- " if len(t) !=2:\n",
- " p_boarding = t[2]\n",
- " p_exit = t[3]\n",
- " r_k = t[0]\n",
- " time_boarding = stopTimes[routes[r_k][3] + \\\n",
- " np.where(routeStops[routes[r_k][2]:routes[r_k][2]+routes[r_k][1]] == p_boarding)[0][0] + \\\n",
- " t[1]*routes[r_k][1]][1]\n",
- " time_exit = stopTimes[routes[r_k][3] + \\\n",
- " np.where(routeStops[routes[r_k][2]:routes[r_k][2]+routes[r_k][1]] == p_exit)[0][0] + \\\n",
- " t[1]*routes[r_k][1]][0]\n",
- " print(\"At stop {}, take route {} leaving at time {} \\n...\".format(p_boarding, r_k, time_boarding))\n",
- "\n",
- " print(\" and exit at stop {} at time {}\".format(p_exit, time_exit))\n",
- "\n",
- " # We walked at step k\n",
- " elif len(t)==2:\n",
- " print(t)\n",
- " p_boarding = t[0][2]\n",
- " p_exit = t[0][3]\n",
- " r_k = t[0][0]\n",
- " time_boarding = stopTimes[routes[r_k][3] + \\\n",
- " np.where(routeStops[routes[r_k][2]:routes[r_k][2]+routes[r_k][1]] == p_boarding)[0][0] + \\\n",
- " t[0][1]*routes[r_k][1]][1]\n",
- " time_exit = stopTimes[routes[r_k][3] + \\\n",
- " np.where(routeStops[routes[r_k][2]:routes[r_k][2]+routes[r_k][1]] == p_exit)[0][0] + \\\n",
- " t[0][1]*routes[r_k][1]][0]\n",
- " p_start_walk = t[1][4]\n",
- " p_end_walk = t[1][5]\n",
- " walk_duration = t[1][6]/60\n",
- "\n",
- " print(\"At stop {}, take route {} leaving at time {} \\n...\".format(p_boarding, r_k, time_boarding))\n",
- "\n",
- " print(\"... exit at stop {} at time {}... \".format(p_exit, time_exit))\n",
- "\n",
- " print(\"and walk for {} minutes from stop {} to stop {}.\".format(walk_duration, p_start_walk, p_end_walk))\n",
- " \n",
- " if not journey_found:\n",
- " print('No journey was found for this query')\n",
- " return journey_found "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 48,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "******************JOURNEY IN 1 TRIPS******************\n",
- "raw representation of the journey in 1 trips: [array([ 2, 0, 0, 4, -1, -1, -1])]\n",
- "At stop 0, take route 2 leaving at time 2020-05-11T08:20 \n",
- "...\n",
- " and exit at stop 4 at time 2020-05-11T09:20\n",
- "******************JOURNEY IN 2 TRIPS******************\n",
- "raw representation of the journey in 2 trips: [array([ 0, 1, 0, 2, -1, -1, -1]), array([ 1, 1, 2, 4, -1, -1, -1])]\n",
- "At stop 0, take route 0 leaving at time 2020-05-11T08:10 \n",
- "...\n",
- " and exit at stop 2 at time 2020-05-11T09:05\n",
- "At stop 2, take route 1 leaving at time 2020-05-11T09:10 \n",
- "...\n",
- " and exit at stop 4 at time 2020-05-11T09:15\n"
- ]
- }
- ],
- "source": [
- "backtrack_journey(k_last, p_t, journey);"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Let's add footpaths\n",
- "\n",
- "For now, we have not tried including footpaths in our dataset. Below, we modify the timetable by adding a new route r3, which links a new stop F to E in a single travel. F may be reached in a very long time from A, but in a short time from B, meaning that it should become shorter to:\n",
- "\n",
- "- Take a trip from A to B\n",
- "- Walk from B to F\n",
- "- Take a trip from F to E\n",
- "\n",
- "rather than the current best trip:\n",
- "- Take a trip from A to C\n",
- "- Take a trip from C to E\n",
- "\n",
- "\n",
- "Note that the single transport solution:\n",
- "- Take a trip from A to E\n",
- "\n",
- "should still appear as the optimal solution for k = 1, i.e one transport is taken."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 49,
- "metadata": {},
- "outputs": [],
- "source": [
- "routes = np.array([[2, 3, 0, 0], #r0\n",
- " [2, 3, 3, 6], #r1\n",
- " [2, 2, 6, 12], #r2\n",
- " [2, 2, 8, 16]]) # r3"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 50,
- "metadata": {},
- "outputs": [],
- "source": [
- "routeStops = np.array([0, 1, 2, # A, B, C\n",
- " 3, 2, 4, # D, C, E\n",
- " 0, 4, # A, E\n",
- " 5, 4]) #F, E"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 51,
- "metadata": {},
- "outputs": [],
- "source": [
- "stopTimes = np.array([\n",
- " # r0, t0\n",
- " [None, '2020-05-11T08:00'],\n",
- " ['2020-05-11T08:25', '2020-05-11T08:30'],\n",
- " ['2020-05-11T08:55', None],\n",
- "\n",
- " # ro, t1\n",
- " [None, '2020-05-11T08:10'],\n",
- " ['2020-05-11T08:35', '2020-05-11T08:40'],\n",
- " ['2020-05-11T09:05', None],\n",
- " \n",
- " # r1, t0 \n",
- " [None, '2020-05-11T08:00'],\n",
- " ['2020-05-11T08:05', '2020-05-11T08:10'],\n",
- " ['2020-05-11T08:15', None],\n",
- "\n",
- " # r1, t1\n",
- " [None, '2020-05-11T09:00'],\n",
- " ['2020-05-11T09:05', '2020-05-11T09:10'],\n",
- " ['2020-05-11T09:15', None],\n",
- " \n",
- " #r2, t0\n",
- " [None, '2020-05-11T08:20'],\n",
- " ['2020-05-11T09:20', None],\n",
- " \n",
- " #r2, t1\n",
- " [None, '2020-05-11T08:30'],\n",
- " ['2020-05-11T09:30', None],\n",
- " \n",
- " #r3, t0\n",
- " [None, '2020-05-11T08:05'],\n",
- " ['2020-05-11T08:25', None],\n",
- "\n",
- " #r3, t1\n",
- " [None, '2020-05-11T08:45'],\n",
- " ['2020-05-11T09:05', None]],\n",
- " dtype='datetime64')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 52,
- "metadata": {},
- "outputs": [],
- "source": [
- "stopRoutes = np.array([0, 2, # A\n",
- " 0, # B\n",
- " 0,1, # C\n",
- " 1, # D\n",
- " 1, 2, 3, # E\n",
- " 3]) # F"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## transfers\n",
- "The `transfers` is a 2D `np.ndarray` where each entry `[p_j, time]` represents the time it takes to reach p_j from stop p_i. The correspondence between the indexing of `transfers` and p_i is done via `stops[p_i][1]`, i.e the first entry in `transfers` containing a connection from stop p_i.\n",
- "\n",
- "As we cannot store different data types in numpy arrays, `time` will have to be converted to `np.timedelta64`, the format used to make differences between `np.datetime64` variables. We will consider all `time` values as **positive values in seconds**."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 53,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "numpy.timedelta64(-30,'m')"
- ]
- },
- "execution_count": 53,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "stopTimes[0][1] - stopTimes[1][1]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 54,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "numpy.timedelta64(30,'s')"
- ]
- },
- "execution_count": 54,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "np.timedelta64(30, 's')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 55,
- "metadata": {},
- "outputs": [],
- "source": [
- "transfers = np.array([[5, 3600], # A -> F\n",
- " [5, 300], # B -> F\n",
- " [0, 3600], # F -> A\n",
- " [1, 300] # F -> A\n",
- " ])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 56,
- "metadata": {},
- "outputs": [],
- "source": [
- "stops = np.array([[0, 0],# A\n",
- " [2, 1], # B\n",
- " [3, None],# C\n",
- " [5, None], # D\n",
- " [6, None], # E\n",
- " [9, 2], # F\n",
- " [len(stopRoutes), None]]) # fictive stop to account for length of E"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 57,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "******************************STARTING round k=1******************************\n",
- "Marked stops at the start of the round: [0]\n",
- "Route considered for the queue: (0, 0)\n",
- "Route considered for the queue: (2, 0)\n",
- "Queue before traversing each route: [(0, 0), (2, 0)]\n",
- "\n",
- "****TRAVERSING ROUTE r=0 from stop p=0****\n",
- "p_i: 0\n",
- "\n",
- "----scanning departure times for route r=0 at stop p_i=0----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T08:05\n",
- "Departure time considered: t_r_dep: 2020-05-11T08:00\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T08:05\n",
- "Departure time considered: t_r_dep: 2020-05-11T08:10\n",
- "\n",
- "!!!!Hopped on route r=0, trip t=1 at stop p_i=0!!!!\n",
- "p_i: 1\n",
- "arr_t_p_i: 2020-05-11T08:35\n",
- "\n",
- "----scanning departure times for route r=0 at stop p_i=1----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n",
- "Departure time considered: t_r_dep: 2020-05-11T08:30\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n",
- "Departure time considered: t_r_dep: 2020-05-11T08:40\n",
- "p_i: 2\n",
- "arr_t_p_i: 2020-05-11T09:05\n",
- "\n",
- "----scanning departure times for route r=0 at stop p_i=2----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n",
- "Departure time considered: t_r_dep: NaT\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n",
- "Departure time considered: t_r_dep: NaT\n",
- "\n",
- "****TRAVERSING ROUTE r=2 from stop p=0****\n",
- "p_i: 0\n",
- "\n",
- "----scanning departure times for route r=2 at stop p_i=0----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T08:05\n",
- "Departure time considered: t_r_dep: 2020-05-11T08:20\n",
- "\n",
- "!!!!Hopped on route r=2, trip t=0 at stop p_i=0!!!!\n",
- "p_i: 4\n",
- "arr_t_p_i: 2020-05-11T09:20\n",
- "\n",
- "----scanning departure times for route r=2 at stop p_i=4----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n",
- "Departure time considered: t_r_dep: NaT\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n",
- "Departure time considered: t_r_dep: NaT\n",
- "\n",
- "****FOOTPATHS****\n",
- "checking walking paths from stop 1\n",
- "Considering footpaths from 1 to 5\n",
- "Walking to 5 is faster !\n",
- "\n",
- "******************************STARTING round k=2******************************\n",
- "Marked stops at the start of the round: [1, 2, 4, 5]\n",
- "Route considered for the queue: (0, 1)\n",
- "Route considered for the queue: (0, 2)\n",
- "Route considered for the queue: (1, 2)\n",
- "Route considered for the queue: (1, 4)\n",
- "Route considered for the queue: (2, 4)\n",
- "Route considered for the queue: (3, 4)\n",
- "Route considered for the queue: (3, 5)\n",
- "Queue before traversing each route: [(0, 1), (1, 2), (2, 4), (3, 5)]\n",
- "\n",
- "****TRAVERSING ROUTE r=0 from stop p=1****\n",
- "p_i: 1\n",
- "\n",
- "----scanning departure times for route r=0 at stop p_i=1----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T08:35\n",
- "Departure time considered: t_r_dep: 2020-05-11T08:30\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T08:35\n",
- "Departure time considered: t_r_dep: 2020-05-11T08:40\n",
- "\n",
- "!!!!Hopped on route r=0, trip t=1 at stop p_i=1!!!!\n",
- "p_i: 2\n",
- "arr_t_p_i: 2020-05-11T09:05\n",
- "\n",
- "----scanning departure times for route r=0 at stop p_i=2----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:05\n",
- "Departure time considered: t_r_dep: NaT\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:05\n",
- "Departure time considered: t_r_dep: NaT\n",
- "\n",
- "****TRAVERSING ROUTE r=1 from stop p=2****\n",
- "p_i: 2\n",
- "\n",
- "----scanning departure times for route r=1 at stop p_i=2----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:05\n",
- "Departure time considered: t_r_dep: 2020-05-11T08:10\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:05\n",
- "Departure time considered: t_r_dep: 2020-05-11T09:10\n",
- "\n",
- "!!!!Hopped on route r=1, trip t=1 at stop p_i=2!!!!\n",
- "p_i: 4\n",
- "arr_t_p_i: 2020-05-11T09:15\n",
- "\n",
- "----scanning departure times for route r=1 at stop p_i=4----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:20\n",
- "Departure time considered: t_r_dep: NaT\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:20\n",
- "Departure time considered: t_r_dep: NaT\n",
- "\n",
- "****TRAVERSING ROUTE r=2 from stop p=4****\n",
- "p_i: 4\n",
- "\n",
- "----scanning departure times for route r=2 at stop p_i=4----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:20\n",
- "Departure time considered: t_r_dep: NaT\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:20\n",
- "Departure time considered: t_r_dep: NaT\n",
- "\n",
- "****TRAVERSING ROUTE r=3 from stop p=5****\n",
- "p_i: 5\n",
- "\n",
- "----scanning departure times for route r=3 at stop p_i=5----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T08:40\n",
- "Departure time considered: t_r_dep: 2020-05-11T08:05\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T08:40\n",
- "Departure time considered: t_r_dep: 2020-05-11T08:45\n",
- "\n",
- "!!!!Hopped on route r=3, trip t=1 at stop p_i=5!!!!\n",
- "p_i: 4\n",
- "arr_t_p_i: 2020-05-11T09:05\n",
- "\n",
- "----scanning departure times for route r=3 at stop p_i=4----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:20\n",
- "Departure time considered: t_r_dep: NaT\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:20\n",
- "Departure time considered: t_r_dep: NaT\n",
- "\n",
- "****FOOTPATHS****\n",
- "\n",
- "******************************STARTING round k=3******************************\n",
- "Marked stops at the start of the round: [4]\n",
- "Route considered for the queue: (1, 4)\n",
- "Route considered for the queue: (2, 4)\n",
- "Route considered for the queue: (3, 4)\n",
- "Queue before traversing each route: [(1, 4), (2, 4), (3, 4)]\n",
- "\n",
- "****TRAVERSING ROUTE r=1 from stop p=4****\n",
- "p_i: 4\n",
- "\n",
- "----scanning departure times for route r=1 at stop p_i=4----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:05\n",
- "Departure time considered: t_r_dep: NaT\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:05\n",
- "Departure time considered: t_r_dep: NaT\n",
- "\n",
- "****TRAVERSING ROUTE r=2 from stop p=4****\n",
- "p_i: 4\n",
- "\n",
- "----scanning departure times for route r=2 at stop p_i=4----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:05\n",
- "Departure time considered: t_r_dep: NaT\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:05\n",
- "Departure time considered: t_r_dep: NaT\n",
- "\n",
- "****TRAVERSING ROUTE r=3 from stop p=4****\n",
- "p_i: 4\n",
- "\n",
- "----scanning departure times for route r=3 at stop p_i=4----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:05\n",
- "Departure time considered: t_r_dep: NaT\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:05\n",
- "Departure time considered: t_r_dep: NaT\n",
- "\n",
- "****FOOTPATHS****\n"
- ]
- }
- ],
- "source": [
- "tau, tau_star, k, journey = raptor_standard(p_s, p_t, tau_0, \n",
- " routes = routes, routeStops = routeStops, stopTimes = stopTimes, stopRoutes = stopRoutes, stops = stops)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 58,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([['2020-05-11T08:05', '2100-01-01T00:00', '2100-01-01T00:00',\n",
- " '2100-01-01T00:00', '2100-01-01T00:00', '2100-01-01T00:00'],\n",
- " ['2100-01-01T00:00', '2020-05-11T08:35', '2020-05-11T09:05',\n",
- " '2100-01-01T00:00', '2020-05-11T09:20', '2020-05-11T08:40'],\n",
- " ['2100-01-01T00:00', '2100-01-01T00:00', '2100-01-01T00:00',\n",
- " '2100-01-01T00:00', '2020-05-11T09:05', '2100-01-01T00:00'],\n",
- " ['2100-01-01T00:00', '2100-01-01T00:00', '2100-01-01T00:00',\n",
- " '2100-01-01T00:00', '2100-01-01T00:00', '2100-01-01T00:00']],\n",
- " dtype='datetime64[m]')"
- ]
- },
- "execution_count": 58,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "k_last = k\n",
- "tau[0:k_last+1]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 59,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array(['2100-01-01T00:00', '2020-05-11T08:35', '2020-05-11T09:05',\n",
- " '2100-01-01T00:00', '2020-05-11T09:05', '2020-05-11T08:40'],\n",
- " dtype='datetime64[m]')"
- ]
- },
- "execution_count": 59,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "tau_star"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 60,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[[ -1, -1, -1, -1, -1, -1, -1],\n",
- " [ -1, -1, -1, -1, -1, -1, -1],\n",
- " [ -1, -1, -1, -1, -1, -1, -1],\n",
- " [ -1, -1, -1, -1, -1, -1, -1],\n",
- " [ -1, -1, -1, -1, -1, -1, -1],\n",
- " [ -1, -1, -1, -1, -1, -1, -1]],\n",
- "\n",
- " [[ -1, -1, -1, -1, -1, -1, -1],\n",
- " [ 0, 1, 0, 1, -1, -1, -1],\n",
- " [ 0, 1, 0, 2, -1, -1, -1],\n",
- " [ -1, -1, -1, -1, -1, -1, -1],\n",
- " [ 2, 0, 0, 4, -1, -1, -1],\n",
- " [ -1, -1, -1, -1, 1, 5, 300]],\n",
- "\n",
- " [[ -1, -1, -1, -1, -1, -1, -1],\n",
- " [ -1, -1, -1, -1, -1, -1, -1],\n",
- " [ -1, -1, -1, -1, -1, -1, -1],\n",
- " [ -1, -1, -1, -1, -1, -1, -1],\n",
- " [ 3, 1, 5, 4, -1, -1, -1],\n",
- " [ -1, -1, -1, -1, -1, -1, -1]]])"
- ]
- },
- "execution_count": 60,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "journey[0:k_last]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Backtracking with footpaths\n",
- "\n",
- "When backtracking with footpaths, we first look through the footpaths to backtrack to the departure stop for the walk, and then use the departure stop of the walk as an arrival stop for a transport.\n",
- "\n",
- "But with footpaths added, it is possible to reach a stop C from stop A by:\n",
- "- first taking a transport to a stop B\n",
- "- walking from stop B to stop C.\n",
- "\n",
- "Therefore, we need to keep track of all the footpaths taken at step i that improved arrival times at the target stop.\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 61,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "******************JOURNEY IN 1 TRIPS******************\n",
- "raw representation of the journey in 1 trips: [array([ 2, 0, 0, 4, -1, -1, -1])]\n",
- "At stop 0, take route 2 leaving at time 2020-05-11T08:20 \n",
- "...\n",
- " and exit at stop 4 at time 2020-05-11T09:20\n",
- "******************JOURNEY IN 2 TRIPS******************\n",
- "raw representation of the journey in 2 trips: [[array([ 0, 1, 0, 1, -1, -1, -1]), array([ -1, -1, -1, -1, 1, 5, 300])], array([ 3, 1, 5, 4, -1, -1, -1])]\n",
- "[array([ 0, 1, 0, 1, -1, -1, -1]), array([ -1, -1, -1, -1, 1, 5, 300])]\n",
- "At stop 0, take route 0 leaving at time 2020-05-11T08:10 \n",
- "...\n",
- "... exit at stop 1 at time 2020-05-11T08:35... \n",
- "and walk for 5.0 minutes from stop 1 to stop 5.\n",
- "At stop 5, take route 3 leaving at time 2020-05-11T08:45 \n",
- "...\n",
- " and exit at stop 4 at time 2020-05-11T09:05\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "True"
- ]
- },
- "execution_count": 61,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "backtrack_journey(k_last, p_t, journey)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Trying to run the standard RAPTOR on real size data\n",
- "### Loading real sized data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 62,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[[1 11 0 0]\n",
- " [1 11 11 11]\n",
- " [1 11 22 22]\n",
- " ...\n",
- " [1 6 237432 245713]\n",
- " [1 13 237438 245719]\n",
- " [3 2 237451 245732]]\n",
- "We find 16210 routes in the data\n"
- ]
- }
- ],
- "source": [
- "import pickle\n",
- "# step 1 convert the data from string to numpy series\n",
- "routes_real = pickle.load( open( \"../data/routes_array2.pkl\", \"rb\" ) )\n",
- "print(routes_real)\n",
- "print('We find {} routes in the data'.format(len(routes_real)))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 65,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[[0 None]\n",
- " [4 None]\n",
- " [7 None]\n",
- " ...\n",
- " [7841 None]\n",
- " [7844 None]\n",
- " [7847 None]]\n",
- "We find 1407 stops in the data\n"
- ]
- }
- ],
- "source": [
- "stops_real = pickle.load(open( \"../data/stops_array.pkl\", \"rb\" ) )\n",
- "print(stops_real)\n",
- "print('We find {} stops in the data'.format(len(stops_real)))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 66,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[[ 'NaT' '2020-05-21T16:53:00.000000000']\n",
- " ['2020-05-21T16:55:00.000000000' '2020-05-21T16:55:00.000000000']\n",
- " ['2020-05-21T16:57:00.000000000' '2020-05-21T16:57:00.000000000']\n",
- " ...\n",
- " ['2020-05-21T15:10:00.000000000' 'NaT']\n",
- " [ 'NaT' '2020-05-21T16:45:00.000000000']\n",
- " ['2020-05-21T17:05:00.000000000' 'NaT']]\n",
- "We find 245738 arrival/departure times for stops in the data\n"
- ]
- }
- ],
- "source": [
- "stopTimes_real = pickle.load(open( \"../data/stop_times_array1.pkl\", \"rb\" ) )\n",
- "print(stopTimes_real)\n",
- "print('We find {} arrival/departure times for stops in the data'.format(len(stopTimes_real)))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 67,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[[1166 146]\n",
- " [1270 360]\n",
- " [ 2 8]\n",
- " ...\n",
- " [ 108 371]\n",
- " [ 102 439]\n",
- " [1739 519]]\n",
- "We find 12564 footpaths (bidirectional) in the data\n"
- ]
- }
- ],
- "source": [
- "transfer_real = pickle.load(open( \"../data/transfer_array.pkl\", \"rb\" ) )\n",
- "print(transfer_real)\n",
- "print('We find {} footpaths (bidirectional) in the data'.format(len(transfer_real)))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 68,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[ 0 1 88 ... 736 735 736]\n",
- "We find 8000 (r, p) route stops combinations in the data\n"
- ]
- }
- ],
- "source": [
- "stopRoutes_real = pickle.load(open( \"../data/stop_routes_array.pkl\", \"rb\" ) )\n",
- "# The route index alone was not selected:\n",
- "#stopRoutes_real = stopRoutes_real[:, 1]\n",
- "print(stopRoutes_real)\n",
- "print('We find {} (r, p) route stops combinations in the data'.format(len(stopRoutes_real)))\n",
- "#print('We find {} unique routes desserving stops'.format(len(np.unique(stopRoutes_real))))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 72,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[ 0 1 2 ... 759 554 493]\n",
- "We find 7849 route, stops combinations\n",
- "We find 1407 unique stops desserving routes\n"
- ]
- }
- ],
- "source": [
- "routeStops_real = pickle.load(open( \"../data/route_stops_array.pkl\", \"rb\" ) )\n",
- "print(routeStops_real)\n",
- "print('We find {} route, stops combinations'.format(len(routeStops_real)))\n",
- "print('We find {} unique stops desserving routes'.format(len(np.unique(routeStops_real))))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 71,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "******************************STARTING round k=1******************************\n",
- "Marked stops at the start of the round: [0]\n",
- "Route considered for the queue: (0, 0)\n",
- "Route considered for the queue: (1, 0)\n",
- "Route considered for the queue: (88, 0)\n",
- "Route considered for the queue: (89, 0)\n",
- "Queue before traversing each route: [(0, 0), (1, 0), (88, 0), (89, 0)]\n",
- "\n",
- "****TRAVERSING ROUTE r=0 from stop p=0****\n",
- "p_i: 0\n",
- "\n",
- "----scanning departure times for route r=0 at stop p_i=0----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-21T12:00\n",
- "Departure time considered: t_r_dep: 2020-05-21T16:53:00.000000000\n",
- "\n",
- "!!!!Hopped on route r=0, trip t=0 at stop p_i=0!!!!\n",
- "p_i: 1\n",
- "arr_t_p_i: 2020-05-21T16:55:00.000000000\n",
- "\n",
- "----scanning departure times for route r=0 at stop p_i=1----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n",
- "Departure time considered: t_r_dep: 2020-05-21T16:55:00.000000000\n",
- "p_i: 2\n",
- "arr_t_p_i: 2020-05-21T16:57:00.000000000\n",
- "\n",
- "----scanning departure times for route r=0 at stop p_i=2----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n",
- "Departure time considered: t_r_dep: 2020-05-21T16:57:00.000000000\n",
- "p_i: 0\n",
- "arr_t_p_i: NaT\n",
- "\n",
- "----scanning departure times for route r=0 at stop p_i=0----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-21T12:00\n",
- "Departure time considered: t_r_dep: 2020-05-21T16:53:00.000000000\n",
- "\n",
- "!!!!Hopped on route r=0, trip t=0 at stop p_i=0!!!!\n",
- "p_i: 1\n",
- "arr_t_p_i: 2020-05-21T16:55:00.000000000\n",
- "\n",
- "----scanning departure times for route r=0 at stop p_i=1----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n",
- "Departure time considered: t_r_dep: 2020-05-21T16:55:00.000000000\n",
- "p_i: 2\n",
- "arr_t_p_i: 2020-05-21T16:57:00.000000000\n",
- "\n",
- "----scanning departure times for route r=0 at stop p_i=2----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n",
- "Departure time considered: t_r_dep: 2020-05-21T16:57:00.000000000\n",
- "p_i: 3\n",
- "arr_t_p_i: 2020-05-21T17:01:00.000000000\n",
- "\n",
- "----scanning departure times for route r=0 at stop p_i=3----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n",
- "Departure time considered: t_r_dep: 2020-05-21T17:01:00.000000000\n",
- "p_i: 4\n",
- "arr_t_p_i: 2020-05-21T17:01:00.000000000\n",
- "\n",
- "----scanning departure times for route r=0 at stop p_i=4----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n",
- "Departure time considered: t_r_dep: 2020-05-21T17:01:00.000000000\n",
- "p_i: 5\n",
- "arr_t_p_i: 2020-05-21T17:03:00.000000000\n",
- "\n",
- "----scanning departure times for route r=0 at stop p_i=5----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n",
- "Departure time considered: t_r_dep: 2020-05-21T17:03:00.000000000\n",
- "p_i: 6\n",
- "arr_t_p_i: 2020-05-21T17:03:00.000000000\n",
- "\n",
- "----scanning departure times for route r=0 at stop p_i=6----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n",
- "Departure time considered: t_r_dep: 2020-05-21T17:03:00.000000000\n",
- "p_i: 7\n",
- "arr_t_p_i: 2020-05-21T17:04:00.000000000\n",
- "\n",
- "----scanning departure times for route r=0 at stop p_i=7----\n",
- "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n",
- "Departure time considered: t_r_dep: NaT\n",
- "\n",
- "****TRAVERSING ROUTE r=1 from stop p=0****\n"
- ]
- },
- {
- "ename": "IndexError",
- "evalue": "index 0 is out of bounds for axis 0 with size 0",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)",
- "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m tau, tau_star, k, journey = raptor_standard(p_s, p_t, tau_0, \n\u001b[0;32m----> 7\u001b[0;31m routes = routes_real, routeStops = routeStops_real, stopTimes = stopTimes_real, stopRoutes = stopRoutes_real, stops = stops_real)\n\u001b[0m",
- "\u001b[0;32m\u001b[0m in \u001b[0;36mraptor_standard\u001b[0;34m(p_s, p_t, tau_0, routes, routeStops, stopTimes, stopRoutes, stops, k_max)\u001b[0m\n\u001b[1;32m 64\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0;31m# we only traverse the route starting at p, not from the beginning of the route\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 66\u001b[0;31m for p_i in routeStops[routes[r][2]+np.where(routeStops[routes[r][2]:routes[r][2]+routes[r][1]] == p)[0][0]:\\\n\u001b[0m\u001b[1;32m 67\u001b[0m routes[r][2]+routes[r][1]]:\n\u001b[1;32m 68\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"p_i: {}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp_i\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;31mIndexError\u001b[0m: index 0 is out of bounds for axis 0 with size 0"
- ]
- }
- ],
- "source": [
- "p_s_real = 10 # start stop = A\n",
- "p_t = 4 # target stop = E\n",
- "tau_0 = np.datetime64('2020-05-21T12:00:00') # departure time 12:00\n",
- "k_max = 10 # we set a maximum number of transports to pre-allocate memory for the numpy array tau_i\n",
- "\n",
- "tau, tau_star, k, journey = raptor_standard(p_s, p_t, tau_0, \n",
- " routes = routes_real, routeStops = routeStops_real, stopTimes = stopTimes_real, stopRoutes = stopRoutes_real, stops = stops_real)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Code for prototyping and debugging:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "0"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "p_s = 0 # start stop = A\n",
- "p_t = 4 # target stop = E\n",
- "tau_0 = np.datetime64('2020-05-11T08:05') # departure time 08:05\n",
- "k_max = 10 # we set a maximum number of transports to pre-allocate memory for the numpy array tau_i"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# initialization\n",
- "n_stops = len(stops)\n",
- "\n",
- "# earliest arrival time at each stop for each round.\n",
- "tau = np.full(shape=(k_max, n_stops), fill_value = np.datetime64('2100-01-01T00:00')) # 2100 instead of infinity # number of stops * max number of transports\n",
- "\n",
- "# earliest arrival time at each stop, indep. of round\n",
- "tau_star = np.full(shape=n_stops, fill_value = np.datetime64('2100-01-01T00:00'))\n",
- "\n",
- "marked = [p_s]\n",
- "q = []\n",
- "tau[0, p_s] = tau_0"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "np.where(routeStops[routes[r][2]:routes[r][2]+routes[r][1]] == p_i)[0][0]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "routeStops[routes[r][2]:routes[r][2]+routes[r][1]] == p_i"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "p_i"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "t_r_dep = stopTimes[routes[r][3]+\\\n",
- " # offset corresponding to stop p_i in route r\n",
- " np.where(routeStops[routes[r][2]:routes[r][2]+routes[r][1]] == p_i)[0][0] + \\\n",
- " routes[r][1]*t_r][1]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "if np.where(routeStops[routes[1][2]:routes[1][2]+routes[1][1]] == 2) <\\\n",
- "np.where(routeStops[routes[1][2]:routes[1][2]+routes[1][1]] == 3):\n",
- " print(\"hello\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "routeStops[routes[1][2] + np.where(routeStops[routes[1][2]:routes[1][2]+routes[1][1]] == 2)[0][0]:routes[1][2]+routes[1][1]]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "routeStops[routes[1][2] + np.where(routeStops[routes[1][2]:routes[1][2]+routes[1][1]] == 2)[0][0]:6]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "routeStops[routes[1][2]]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "routeStops[np.where(routeStops[routes[1][2]:routes[1][2]+routes[1][1]] == 2)[0][0]]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "if True and \\\n",
- " True:\n",
- " print(\"hello\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "tau[0][0]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "stopTimes[3][1]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "a = np.arange(1, 10)\n",
- "a"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "a[1:10:2]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stopTimes[routes[0][3]+\\\n",
- " # offset corresponding to stop p_i in route r\n",
- " np.where(routeStops[routes[0][2]:routes[0][2]+routes[0][1]] == 0)[0][0]:\\\n",
- " # end of the trips of r\n",
- " routes[0][3]+routes[0][0]*routes[0][1]:\\\n",
- " # we can jump from the number of stops in r to find the next departure of route r at p_i\n",
- " routes[0][1]\n",
- " ]\n",
- "# we may more simply loop through all trips, and stop as soon as the departure time is after the arrival time\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stopTimes[routes[0][3]+\\\n",
- " # offset corresponding to stop p_i in route r\n",
- " np.where(routeStops[routes[0][2]:routes[0][2]+routes[0][1]] == 0)[0][0]][1]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "stopTimes[routes[1][3]+\\\n",
- " # offset corresponding to stop p_i in route r\n",
- " np.where(routeStops[routes[1][2]:routes[1][2]+routes[1][1]] == 3)[0][0] + \\\n",
- " routes[1][1]*1][1]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# t_r is a trip that belongs to route r. t_r can take value 0 to routes[r][0]-1\n",
- "t = None\n",
- "r = 1\n",
- "tau_k_1 = tau[0][0]\n",
- "p_i = 3\n",
- "\n",
- "t_r = 0\n",
- "while True:\n",
- " \n",
- " t_r_dep = stopTimes[routes[r][3]+\\\n",
- " # offset corresponding to stop p_i in route r\n",
- " np.where(routeStops[routes[r][2]:routes[r][2]+routes[r][1]] == p_i)[0][0] + \\\n",
- " routes[r][1]*t_r][1]\n",
- " \n",
- " if t_r_dep > tau_k_1:\n",
- " # retrieving the index of the departure time of the trip in stopTimes\n",
- " #t = routes[r][3] + t_r * routes[r][1]\n",
- " t = t_r\n",
- " break\n",
- " t_r += 1\n",
- " # we could not hop on any trip at this stop\n",
- " if t_r == routes[r][0]:\n",
- " break\n",
- " \n",
- "print(\"done\")\n",
- "print(t)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "r = 1\n",
- "t = 1\n",
- "p_i = 2\n",
- "# 1st trip of route + offset for the right trip + offset for the right stop\n",
- "stopTimes[routes[r][3] + t * routes[r][1] + np.where(routeStops[routes[r][2]:routes[r][2]+routes[r][1]] == p_i)]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "d = []\n",
- "not d"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "r = 1\n",
- "t = 0\n",
- "p_i = 4\n",
- "arr_t_p_i = stopTimes[routes[r][3] + \\\n",
- " t * routes[r][1] + \\\n",
- " np.where(routeStops[routes[r][2]:routes[r][2]+routes[r][1]] == p_i)[0][0]][0]\n",
- "arr_t_p_i"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "np.datetime64('NaT') > np.datetime64('2100-01-01')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "np.datetime64('NaT') < np.datetime64('2100-01-01')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "jupytext": {
- "formats": "ipynb,md,py:percent"
- },
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.6"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}