diff --git a/notebooks/old_notebooks/OLD_Arrays_to_pickle.ipynb b/notebooks/old_notebooks/OLD_Arrays_to_pickle.ipynb new file mode 100644 index 0000000..490e7a9 --- /dev/null +++ b/notebooks/old_notebooks/OLD_Arrays_to_pickle.ipynb @@ -0,0 +1,3924 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preprocessing part 2: preparing the arrays\n", + "In this notebook we take 2 datasets prepared in spark: stop_times and transfers, and prepare them into the array format needed to run RAPTOR" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Outline\n", + "In this notebook the following actions are performed:\n", + "- Transform stop_ids with platform information into the parent station stop_id\n", + "- Keep only trips with a departure after 7 am and before 7 pm\n", + "- Delete trips which only have 1 stop\n", + "- Create integer IDs for routes, trips and stops, following the definition of the RAPTOR algorithm in Stop_times\n", + "- Add integer IDs to transfers and keep only stops that are inside the stop_times dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import packages" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import pickle\n", + "import itertools" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read files\n", + "Before running make sure the .csv files are in /data . If not run notebook \"transfer_to_local\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0route_idstop_id_generaltrip_idstop_idarrival_timedeparture_timestop_sequencestop_namestop_latstop_lontrip_headsigntrip_short_namedirection_iddeparture_first_stoproute_intstop_countstop_introute_desc
0026-66-j19-1859120517.TA.26-66-j19-1.1.H859120517:00:0017:00:003Zürich, Hürlimannplatz47.3650668.526539Zürich, Neubühl3870016:55:001225121317Bus
1126-66-j19-1859141517.TA.26-66-j19-1.1.H859141517:02:0017:02:004Zürich, Waffenplatzstrasse47.3614828.525749Zürich, Neubühl3870016:55:001225121267Bus
2226-66-j19-1859120417.TA.26-66-j19-1.1.H859120417:03:0017:03:005Zürich, Hügelstrasse47.3585438.526997Zürich, Neubühl3870016:55:0012251267Bus
3326-66-j19-1859109817.TA.26-66-j19-1.1.H859109817:04:0017:04:006Zürich, Brunau/Mutschellenstr.47.3551478.527141Zürich, Neubühl3870016:55:00122512512Bus
4426-66-j19-1859139217.TA.26-66-j19-1.1.H859139217:05:0017:05:007Zürich, Thujastrasse47.3501878.527806Zürich, Neubühl3870016:55:00122512403Bus
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 route_id stop_id_general trip_id stop_id \\\n", + "0 0 26-66-j19-1 8591205 17.TA.26-66-j19-1.1.H 8591205 \n", + "1 1 26-66-j19-1 8591415 17.TA.26-66-j19-1.1.H 8591415 \n", + "2 2 26-66-j19-1 8591204 17.TA.26-66-j19-1.1.H 8591204 \n", + "3 3 26-66-j19-1 8591098 17.TA.26-66-j19-1.1.H 8591098 \n", + "4 4 26-66-j19-1 8591392 17.TA.26-66-j19-1.1.H 8591392 \n", + "\n", + " arrival_time departure_time stop_sequence stop_name \\\n", + "0 17:00:00 17:00:00 3 Zürich, Hürlimannplatz \n", + "1 17:02:00 17:02:00 4 Zürich, Waffenplatzstrasse \n", + "2 17:03:00 17:03:00 5 Zürich, Hügelstrasse \n", + "3 17:04:00 17:04:00 6 Zürich, Brunau/Mutschellenstr. \n", + "4 17:05:00 17:05:00 7 Zürich, Thujastrasse \n", + "\n", + " stop_lat stop_lon trip_headsign trip_short_name direction_id \\\n", + "0 47.365066 8.526539 Zürich, Neubühl 3870 0 \n", + "1 47.361482 8.525749 Zürich, Neubühl 3870 0 \n", + "2 47.358543 8.526997 Zürich, Neubühl 3870 0 \n", + "3 47.355147 8.527141 Zürich, Neubühl 3870 0 \n", + "4 47.350187 8.527806 Zürich, Neubühl 3870 0 \n", + "\n", + " departure_first_stop route_int stop_count stop_int route_desc \n", + "0 16:55:00 1225 12 1317 Bus \n", + "1 16:55:00 1225 12 1267 Bus \n", + "2 16:55:00 1225 12 67 Bus \n", + "3 16:55:00 1225 12 512 Bus \n", + "4 16:55:00 1225 12 403 Bus " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#stop_times\n", + "stop_times_curated = pd.read_csv(\"../data/stop_times_final_cyril.csv\")\n", + "stop_times_curated.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0route_idstop_idtrip_idarrival_timedeparture_timestop_sequencedirection_idstop_nameroute_desc
0026-759-j19-18573205:0:K1330.TA.26-759-j19-1.7.R05:28:0005:28:0011Zürich Flughafen, BahnhofBus
1126-67-j19-1859134146.TA.26-67-j19-1.1.R05:33:0005:33:0011Zürich, Schmiede WiedikonBus
2226-325-j19-18587020:0:D265.TA.26-325-j19-1.2.H05:34:0005:34:0010Dietikon, BahnhofBus
3326-11-A-j19-185913821266.TA.26-11-A-j19-1.21.H05:37:0005:37:0010Zürich, Sternen OerlikonTram
4426-302-j19-18590844162.TA.26-302-j19-1.4.R05:49:0005:49:0011Urdorf, OberurdorfBus
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 route_id stop_id trip_id \\\n", + "0 0 26-759-j19-1 8573205:0:K 1330.TA.26-759-j19-1.7.R \n", + "1 1 26-67-j19-1 8591341 46.TA.26-67-j19-1.1.R \n", + "2 2 26-325-j19-1 8587020:0:D 265.TA.26-325-j19-1.2.H \n", + "3 3 26-11-A-j19-1 8591382 1266.TA.26-11-A-j19-1.21.H \n", + "4 4 26-302-j19-1 8590844 162.TA.26-302-j19-1.4.R \n", + "\n", + " arrival_time departure_time stop_sequence direction_id \\\n", + "0 05:28:00 05:28:00 1 1 \n", + "1 05:33:00 05:33:00 1 1 \n", + "2 05:34:00 05:34:00 1 0 \n", + "3 05:37:00 05:37:00 1 0 \n", + "4 05:49:00 05:49:00 1 1 \n", + "\n", + " stop_name route_desc \n", + "0 Zürich Flughafen, Bahnhof Bus \n", + "1 Zürich, Schmiede Wiedikon Bus \n", + "2 Dietikon, Bahnhof Bus \n", + "3 Zürich, Sternen Oerlikon Tram \n", + "4 Urdorf, Oberurdorf Bus " + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#stop_times\n", + "stop_times_curated = pd.read_csv(\"../data/stop_times_curated.csv\")\n", + "stop_times_curated.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We drop columns not useful to us" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "stop_times_curated = stop_times_curated.drop(columns=[\"Unnamed: 0\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0stop_idstop_id2distanceTransfer_time_secstop_namestop_name2
00850092685906160.122430146Oetwil a.d.L., SchweizäckerGeroldswil, Schweizäcker
11850092685907370.300175360Oetwil a.d.L., SchweizäckerOetwil an der Limmat, Halde
2285021868502186:0:10.0067628Dietikon StoffelbachDietikon Stoffelbach
3385021868502186:0:20.01352416Dietikon StoffelbachDietikon Stoffelbach
4485021868502186P0.0000000Dietikon StoffelbachDietikon Stoffelbach
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 stop_id stop_id2 distance Transfer_time_sec \\\n", + "0 0 8500926 8590616 0.122430 146 \n", + "1 1 8500926 8590737 0.300175 360 \n", + "2 2 8502186 8502186:0:1 0.006762 8 \n", + "3 3 8502186 8502186:0:2 0.013524 16 \n", + "4 4 8502186 8502186P 0.000000 0 \n", + "\n", + " stop_name stop_name2 \n", + "0 Oetwil a.d.L., Schweizäcker Geroldswil, Schweizäcker \n", + "1 Oetwil a.d.L., Schweizäcker Oetwil an der Limmat, Halde \n", + "2 Dietikon Stoffelbach Dietikon Stoffelbach \n", + "3 Dietikon Stoffelbach Dietikon Stoffelbach \n", + "4 Dietikon Stoffelbach Dietikon Stoffelbach " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#transfers\n", + "transfers = pd.read_csv(\"../data/transfers.csv\")\n", + "transfers.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create stop_id same for all platforms\n", + "In the algorithm we make the simplifying assumptions that each time there is a change is the same station there is a 2 min change time. Due to this assumptions we can keep only the parent station name\n", + "The parent id is contained in the first 7 characters, so we can take the substring to create the parent stop_id" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [], + "source": [ + "#copy information stop_id with platform in stop_id_raw\n", + "stop_times_curated[\"stop_id_raw\"] = stop_times_curated[\"stop_id\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
route_idstop_idtrip_idarrival_timedeparture_timestop_sequencedirection_idstop_nameroute_descstop_id_raw
026-759-j19-185732051330.TA.26-759-j19-1.7.R05:28:0005:28:0011Zürich Flughafen, BahnhofBus8573205:0:K
126-67-j19-1859134146.TA.26-67-j19-1.1.R05:33:0005:33:0011Zürich, Schmiede WiedikonBus8591341
226-325-j19-18587020265.TA.26-325-j19-1.2.H05:34:0005:34:0010Dietikon, BahnhofBus8587020:0:D
326-11-A-j19-185913821266.TA.26-11-A-j19-1.21.H05:37:0005:37:0010Zürich, Sternen OerlikonTram8591382
426-302-j19-18590844162.TA.26-302-j19-1.4.R05:49:0005:49:0011Urdorf, OberurdorfBus8590844
\n", + "
" + ], + "text/plain": [ + " route_id stop_id trip_id arrival_time \\\n", + "0 26-759-j19-1 8573205 1330.TA.26-759-j19-1.7.R 05:28:00 \n", + "1 26-67-j19-1 8591341 46.TA.26-67-j19-1.1.R 05:33:00 \n", + "2 26-325-j19-1 8587020 265.TA.26-325-j19-1.2.H 05:34:00 \n", + "3 26-11-A-j19-1 8591382 1266.TA.26-11-A-j19-1.21.H 05:37:00 \n", + "4 26-302-j19-1 8590844 162.TA.26-302-j19-1.4.R 05:49:00 \n", + "\n", + " departure_time stop_sequence direction_id stop_name \\\n", + "0 05:28:00 1 1 Zürich Flughafen, Bahnhof \n", + "1 05:33:00 1 1 Zürich, Schmiede Wiedikon \n", + "2 05:34:00 1 0 Dietikon, Bahnhof \n", + "3 05:37:00 1 0 Zürich, Sternen Oerlikon \n", + "4 05:49:00 1 1 Urdorf, Oberurdorf \n", + "\n", + " route_desc stop_id_raw \n", + "0 Bus 8573205:0:K \n", + "1 Bus 8591341 \n", + "2 Bus 8587020:0:D \n", + "3 Tram 8591382 \n", + "4 Bus 8590844 " + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Use only first 7 characters for stop_id\n", + "stop_times_curated[\"stop_id\"] = stop_times_curated[\"stop_id_raw\"].str.slice(0, 7)\n", + "stop_times_curated[\"stop_id\"] = pd.to_numeric(stop_times_curated[\"stop_id\"])\n", + "stop_times_curated.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [], + "source": [ + "#copy information stop_id with platform in stop_id_raw\n", + "transfers[\"stop_id_raw\"] = transfers[\"stop_id\"]\n", + "transfers[\"stop_id2_raw\"] = transfers[\"stop_id2\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We do the operation also on the transfers dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0stop_idstop_id2distanceTransfer_time_secstop_namestop_name2stop_id_rawstop_id2_raw
00850092685906160.122430146Oetwil a.d.L., SchweizäckerGeroldswil, Schweizäcker85009268590616
11850092685907370.300175360Oetwil a.d.L., SchweizäckerOetwil an der Limmat, Halde85009268590737
22850218685021860.0067628Dietikon StoffelbachDietikon Stoffelbach85021868502186:0:1
33850218685021860.01352416Dietikon StoffelbachDietikon Stoffelbach85021868502186:0:2
44850218685021860.0000000Dietikon StoffelbachDietikon Stoffelbach85021868502186P
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 stop_id stop_id2 distance Transfer_time_sec \\\n", + "0 0 8500926 8590616 0.122430 146 \n", + "1 1 8500926 8590737 0.300175 360 \n", + "2 2 8502186 8502186 0.006762 8 \n", + "3 3 8502186 8502186 0.013524 16 \n", + "4 4 8502186 8502186 0.000000 0 \n", + "\n", + " stop_name stop_name2 stop_id_raw \\\n", + "0 Oetwil a.d.L., Schweizäcker Geroldswil, Schweizäcker 8500926 \n", + "1 Oetwil a.d.L., Schweizäcker Oetwil an der Limmat, Halde 8500926 \n", + "2 Dietikon Stoffelbach Dietikon Stoffelbach 8502186 \n", + "3 Dietikon Stoffelbach Dietikon Stoffelbach 8502186 \n", + "4 Dietikon Stoffelbach Dietikon Stoffelbach 8502186 \n", + "\n", + " stop_id2_raw \n", + "0 8590616 \n", + "1 8590737 \n", + "2 8502186:0:1 \n", + "3 8502186:0:2 \n", + "4 8502186P " + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Use only first 7 characters for stop_id\n", + "transfers[\"stop_id\"] = transfers[\"stop_id_raw\"].str.slice(0, 7)\n", + "transfers[\"stop_id2\"] = transfers[\"stop_id2_raw\"].str.slice(0, 7)\n", + "transfers[\"stop_id\"] = pd.to_numeric(transfers[\"stop_id\"])\n", + "transfers[\"stop_id2\"] = pd.to_numeric(transfers[\"stop_id2\"])\n", + "transfers.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Keep only trips during the day\n", + "Our model will only consider trips during business days and normal hours, so we can delete all departures before 7 am and after 7 pm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can get the hour of departure using str.slice , and explore the hours we have in the dataset. Then we convert these hours in integers in order to filter." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15',\n", + " '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '00',\n", + " '04', '01'], dtype=object)" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stop_times_curated.departure_time.str.slice(0,2).unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [], + "source": [ + "stop_times_curated[\"hour_departure\"] = pd.to_numeric(stop_times_curated.departure_time.str.slice(0,2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check if well converted to int" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,\n", + " 22, 23, 24, 25, 0, 4, 1])" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stop_times_curated[\"hour_departure\"].unique()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We drop the trips before 7 am and after 7 pm using np.where function" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "331751" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stop_times_curated.trip_id.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [], + "source": [ + "trip_id_drop = np.where(((stop_times_curated.hour_departure > 19) |\\\n", + " (stop_times_curated.hour_departure < 7)),\\\n", + " stop_times_curated[\"trip_id\"] , None)" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": 
{}, + "outputs": [], + "source": [ + "stop_times_curated = stop_times_curated[~stop_times_curated[\"trip_id\"].isin(trip_id_drop)]" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "246576" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stop_times_curated.trip_id.count()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With this operation we have decreased the size of stop_times by about 90k lines" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Delete trips with 1 stop\n", + "Trips with only 1 stop are useless in our dataset and will only pollute the algorithm. For this reason we dete these" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We start by counting the stops of each trip" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
route_idstop_idtrip_idarrival_timedeparture_timestop_sequencedirection_idstop_nameroute_descstop_id_rawhour_departure
trip_id
1.TA.1-231-j19-1.1.H11511717181151152
1.TA.1-44-j19-1.1.R13133313131
1.TA.1-444-j19-1.1.H19199919191
1.TA.12-E03-j19-1.1.H12122212122
1.TA.18-46-j19-1.1.H11111111111
\n", + "
" + ], + "text/plain": [ + " route_id stop_id trip_id arrival_time \\\n", + "trip_id \n", + "1.TA.1-231-j19-1.1.H 1 15 1 17 \n", + "1.TA.1-44-j19-1.1.R 1 3 1 3 \n", + "1.TA.1-444-j19-1.1.H 1 9 1 9 \n", + "1.TA.12-E03-j19-1.1.H 1 2 1 2 \n", + "1.TA.18-46-j19-1.1.H 1 1 1 1 \n", + "\n", + " departure_time stop_sequence direction_id stop_name \\\n", + "trip_id \n", + "1.TA.1-231-j19-1.1.H 17 18 1 15 \n", + "1.TA.1-44-j19-1.1.R 3 3 1 3 \n", + "1.TA.1-444-j19-1.1.H 9 9 1 9 \n", + "1.TA.12-E03-j19-1.1.H 2 2 1 2 \n", + "1.TA.18-46-j19-1.1.H 1 1 1 1 \n", + "\n", + " route_desc stop_id_raw hour_departure \n", + "trip_id \n", + "1.TA.1-231-j19-1.1.H 1 15 2 \n", + "1.TA.1-44-j19-1.1.R 1 3 1 \n", + "1.TA.1-444-j19-1.1.H 1 9 1 \n", + "1.TA.12-E03-j19-1.1.H 1 2 2 \n", + "1.TA.18-46-j19-1.1.H 1 1 1 " + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "number_stop = stop_times_curated.groupby('trip_id').nunique()\n", + "number_stop.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [], + "source": [ + "#get trips with 1 stop\n", + "trip_with_1_stop = np.where((number_stop.stop_id == 1), number_stop.index, None)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check number of trips before cleaning" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "20261" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stop_times_curated.trip_id.nunique()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We drop the rows with a unique stop per trip" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [], + "source": [ + "#drop trips with only 1 stop\n", + "stop_times_curated = stop_times_curated[~stop_times_curated[\"trip_id\"].isin(trip_with_1_stop)]" + ] + 
}, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And we check how many trips there still. About 900 trips with only 1 stop have been deleted" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "19390" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stop_times_curated.trip_id.nunique()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create route_int, trip_int and stop_int as consecutive integer IDs\n", + "This operation is needed for sorting the routes, trips and stops in the right order. Additionally integers are lighter than strings so the algorithm will need fewer memory to work with the arrays." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Route_int\n", + "The route_int Id is given in an abitrary order" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We start creating a tuple with all the stops in a trip" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [], + "source": [ + "stop_times_curated = stop_times_curated.sort_values([\"trip_id\", \"stop_sequence\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stop_id
trip_id
1.TA.1-231-j19-1.1.H(8572747, 8582462, 8572600, 8572601, 8502553, ...
1.TA.1-44-j19-1.1.R(8590275, 8591891, 8590279)
1.TA.1-444-j19-1.1.H(8572747, 8580847, 8581346, 8502894, 8502979, ...
1.TA.12-E03-j19-1.1.H(8573205, 8596126)
1.TA.21-23-j19-1.1.R(8503000, 8503003)
\n", + "
" + ], + "text/plain": [ + " stop_id\n", + "trip_id \n", + "1.TA.1-231-j19-1.1.H (8572747, 8582462, 8572600, 8572601, 8502553, ...\n", + "1.TA.1-44-j19-1.1.R (8590275, 8591891, 8590279)\n", + "1.TA.1-444-j19-1.1.H (8572747, 8580847, 8581346, 8502894, 8502979, ...\n", + "1.TA.12-E03-j19-1.1.H (8573205, 8596126)\n", + "1.TA.21-23-j19-1.1.R (8503000, 8503003)" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#group stops into a sequence\n", + "tuple_stops = stop_times_curated.groupby('trip_id')['stop_id'].apply(tuple).to_frame()\n", + "tuple_stops.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "19390" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tuple_stops.index.nunique()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And we can group all these sequences in unique groups" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stop_id
(8502208, 8502209, 8503201, 8503010, 8503011, 8503000, 8503006, 8503016)
(8502208, 8502209, 8503201, 8503200, 8503010, 8503011, 8503016)
(8502208, 8502209, 8503202)
(8502208, 8502209, 8503202, 8503009, 8503010, 8503011, 8503000, 8503006, 8503016, 8503307)
(8502208, 8502209, 8503202, 8503200, 8503009, 8503000, 8503015, 8503016, 8503307, 8503305)
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: []\n", + "Index: [(8502208, 8502209, 8503201, 8503010, 8503011, 8503000, 8503006, 8503016), (8502208, 8502209, 8503201, 8503200, 8503010, 8503011, 8503016), (8502208, 8502209, 8503202), (8502208, 8502209, 8503202, 8503009, 8503010, 8503011, 8503000, 8503006, 8503016, 8503307), (8502208, 8502209, 8503202, 8503200, 8503009, 8503000, 8503015, 8503016, 8503307, 8503305)]" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#group to get unique stop sequences\n", + "unique_stop_sequence = tuple_stops.groupby(\"stop_id\").count()\n", + "unique_stop_sequence.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2555" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unique_stop_sequence.index.nunique()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "These unique sequences of stops are our routes. We can create a unique ID, an integer, for each route" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stop_idroute_int
0(8502208, 8502209, 8503201, 8503010, 8503011, ...0
1(8502208, 8502209, 8503201, 8503200, 8503010, ...1
2(8502208, 8502209, 8503202)2
3(8502208, 8502209, 8503202, 8503009, 8503010, ...3
4(8502208, 8502209, 8503202, 8503200, 8503009, ...4
\n", + "
" + ], + "text/plain": [ + " stop_id route_int\n", + "0 (8502208, 8502209, 8503201, 8503010, 8503011, ... 0\n", + "1 (8502208, 8502209, 8503201, 8503200, 8503010, ... 1\n", + "2 (8502208, 8502209, 8503202) 2\n", + "3 (8502208, 8502209, 8503202, 8503009, 8503010, ... 3\n", + "4 (8502208, 8502209, 8503202, 8503200, 8503009, ... 4" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#create dataframe and route_int\n", + "df_unique_stop_sequence = unique_stop_sequence.reset_index()\n", + "df_unique_stop_sequence[\"route_int\"] = df_unique_stop_sequence.index\n", + "df_unique_stop_sequence.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We add the route information to the trip" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stop_idroute_int
trip_id
403.TA.26-24-j19-1.220.R(8502208, 8502209, 8503201, 8503010, 8503011, ...0
425.TA.26-24-j19-1.220.R(8502208, 8502209, 8503201, 8503200, 8503010, ...1
22.TA.30-57-Y-j19-1.1.H(8502208, 8502209, 8503202)2
11.TA.30-57-Y-j19-1.1.H(8502208, 8502209, 8503202)2
14.TA.30-57-Y-j19-1.1.H(8502208, 8502209, 8503202)2
\n", + "
" + ], + "text/plain": [ + " stop_id \\\n", + "trip_id \n", + "403.TA.26-24-j19-1.220.R (8502208, 8502209, 8503201, 8503010, 8503011, ... \n", + "425.TA.26-24-j19-1.220.R (8502208, 8502209, 8503201, 8503200, 8503010, ... \n", + "22.TA.30-57-Y-j19-1.1.H (8502208, 8502209, 8503202) \n", + "11.TA.30-57-Y-j19-1.1.H (8502208, 8502209, 8503202) \n", + "14.TA.30-57-Y-j19-1.1.H (8502208, 8502209, 8503202) \n", + "\n", + " route_int \n", + "trip_id \n", + "403.TA.26-24-j19-1.220.R 0 \n", + "425.TA.26-24-j19-1.220.R 1 \n", + "22.TA.30-57-Y-j19-1.1.H 2 \n", + "11.TA.30-57-Y-j19-1.1.H 2 \n", + "14.TA.30-57-Y-j19-1.1.H 2 " + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#join with trip information\n", + "trip_with_routes = tuple_stops.join(df_unique_stop_sequence.set_index(\"stop_id\"), on=\"stop_id\", how=\"left\").sort_values(\"route_int\")\n", + "trip_with_routes.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [], + "source": [ + "trip_with_routes = trip_with_routes.rename(columns={\"stop_id\" : \"all_stops\"})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check if wrong manipulations cause to have the same, or higher, number of routes than trips. 
It is not the case" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "19390" + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#check if routes and trips do not have the same number\n", + "trip_with_routes.index.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2555" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trip_with_routes.route_int.nunique()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We add the route_int column to stop_times dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "245705" + ] + }, + "execution_count": 88, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stop_times_curated.trip_id.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [], + "source": [ + "#join to get route_int in stop_times\n", + "stop_times_routes = stop_times_curated.join(trip_with_routes, how=\"left\", on=\"trip_id\" , lsuffix='_left', rsuffix='_right').drop_duplicates()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "245705" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stop_times_routes.trip_id.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
route_idstop_idtrip_idarrival_timedeparture_timestop_sequencedirection_idstop_nameroute_descstop_id_rawhour_departureall_stopsroute_int
819141-231-j19-185727471.TA.1-231-j19-1.1.H09:37:0009:37:0010Bremgarten AG, BahnhofBus85727479(8572747, 8582462, 8572600, 8572601, 8502553, ...618
1812811-231-j19-185824621.TA.1-231-j19-1.1.H09:38:0009:38:0030Bremgarten AG, ZelgliBus85824629(8572747, 8582462, 8572600, 8572601, 8502553, ...618
424601-231-j19-185726001.TA.1-231-j19-1.1.H09:39:0009:39:0040Zufikon, EmausBus85726009(8572747, 8582462, 8572600, 8572601, 8502553, ...618
2244541-231-j19-185726011.TA.1-231-j19-1.1.H09:39:0009:39:0050Zufikon, AlgierBus85726019(8572747, 8582462, 8572600, 8572601, 8502553, ...618
118361-231-j19-185025531.TA.1-231-j19-1.1.H09:43:0009:43:0060Unterlunkhofen, BreitenäckerBus85025539(8572747, 8582462, 8572600, 8572601, 8502553, ...618
\n", + "
" + ], + "text/plain": [ + " route_id stop_id trip_id arrival_time \\\n", + "81914 1-231-j19-1 8572747 1.TA.1-231-j19-1.1.H 09:37:00 \n", + "181281 1-231-j19-1 8582462 1.TA.1-231-j19-1.1.H 09:38:00 \n", + "42460 1-231-j19-1 8572600 1.TA.1-231-j19-1.1.H 09:39:00 \n", + "224454 1-231-j19-1 8572601 1.TA.1-231-j19-1.1.H 09:39:00 \n", + "11836 1-231-j19-1 8502553 1.TA.1-231-j19-1.1.H 09:43:00 \n", + "\n", + " departure_time stop_sequence direction_id \\\n", + "81914 09:37:00 1 0 \n", + "181281 09:38:00 3 0 \n", + "42460 09:39:00 4 0 \n", + "224454 09:39:00 5 0 \n", + "11836 09:43:00 6 0 \n", + "\n", + " stop_name route_desc stop_id_raw hour_departure \\\n", + "81914 Bremgarten AG, Bahnhof Bus 8572747 9 \n", + "181281 Bremgarten AG, Zelgli Bus 8582462 9 \n", + "42460 Zufikon, Emaus Bus 8572600 9 \n", + "224454 Zufikon, Algier Bus 8572601 9 \n", + "11836 Unterlunkhofen, Breitenäcker Bus 8502553 9 \n", + "\n", + " all_stops route_int \n", + "81914 (8572747, 8582462, 8572600, 8572601, 8502553, ... 618 \n", + "181281 (8572747, 8582462, 8572600, 8572601, 8502553, ... 618 \n", + "42460 (8572747, 8582462, 8572600, 8572601, 8502553, ... 618 \n", + "224454 (8572747, 8582462, 8572600, 8572601, 8502553, ... 618 \n", + "11836 (8572747, 8582462, 8572600, 8572601, 8502553, ... 
618 " + ] + }, + "execution_count": 91, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stop_times_routes.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2554" + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#check if route_int is correct\n", + "stop_times_routes.route_int.max()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Trip_int\n", + "The trip_int number needs to be ordered by route_int and time" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "19390" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#check number trips in stop_times\n", + "stop_times_routes.trip_id.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
route_idstop_idtrip_idarrival_timedeparture_timestop_sequencedirection_idstop_nameroute_descstop_id_rawhour_departureall_stopsroute_int
18129026-24-j19-18502208403.TA.26-24-j19-1.220.R10:44:0010:45:0031Horgen OberdorfS-Bahn8502208:0:410(8502208, 8502209, 8503201, 8503010, 8503011, ...0
26197426-24-j19-18502209403.TA.26-24-j19-1.220.R10:47:0010:47:0041Oberrieden DorfS-Bahn8502209:0:110(8502208, 8502209, 8503201, 8503010, 8503011, ...0
13016226-24-j19-18503201403.TA.26-24-j19-1.220.R10:53:0010:53:0061RüschlikonS-Bahn8503201:0:210(8502208, 8502209, 8503201, 8503010, 8503011, ...0
17367026-24-j19-18503010403.TA.26-24-j19-1.220.R11:02:0011:03:0091Zürich EngeS-Bahn8503010:0:211(8502208, 8502209, 8503201, 8503010, 8503011, ...0
23812926-24-j19-18503011403.TA.26-24-j19-1.220.R11:04:0011:04:00101Zürich WiedikonS-Bahn8503011:0:211(8502208, 8502209, 8503201, 8503010, 8503011, ...0
\n", + "
" + ], + "text/plain": [ + " route_id stop_id trip_id arrival_time \\\n", + "181290 26-24-j19-1 8502208 403.TA.26-24-j19-1.220.R 10:44:00 \n", + "261974 26-24-j19-1 8502209 403.TA.26-24-j19-1.220.R 10:47:00 \n", + "130162 26-24-j19-1 8503201 403.TA.26-24-j19-1.220.R 10:53:00 \n", + "173670 26-24-j19-1 8503010 403.TA.26-24-j19-1.220.R 11:02:00 \n", + "238129 26-24-j19-1 8503011 403.TA.26-24-j19-1.220.R 11:04:00 \n", + "\n", + " departure_time stop_sequence direction_id stop_name \\\n", + "181290 10:45:00 3 1 Horgen Oberdorf \n", + "261974 10:47:00 4 1 Oberrieden Dorf \n", + "130162 10:53:00 6 1 Rüschlikon \n", + "173670 11:03:00 9 1 Zürich Enge \n", + "238129 11:04:00 10 1 Zürich Wiedikon \n", + "\n", + " route_desc stop_id_raw hour_departure \\\n", + "181290 S-Bahn 8502208:0:4 10 \n", + "261974 S-Bahn 8502209:0:1 10 \n", + "130162 S-Bahn 8503201:0:2 10 \n", + "173670 S-Bahn 8503010:0:2 11 \n", + "238129 S-Bahn 8503011:0:2 11 \n", + "\n", + " all_stops route_int \n", + "181290 (8502208, 8502209, 8503201, 8503010, 8503011, ... 0 \n", + "261974 (8502208, 8502209, 8503201, 8503010, 8503011, ... 0 \n", + "130162 (8502208, 8502209, 8503201, 8503010, 8503011, ... 0 \n", + "173670 (8502208, 8502209, 8503201, 8503010, 8503011, ... 0 \n", + "238129 (8502208, 8502209, 8503201, 8503010, 8503011, ... 0 " + ] + }, + "execution_count": 94, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stop_times_routes.sort_values([\"route_int\", \"arrival_time\"]).head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Generate sequential trip_int, ordered by route and by time" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0trip_inttrip_id
0403.TA.26-24-j19-1.220.R0403.TA.26-24-j19-1.220.R
1425.TA.26-24-j19-1.220.R1425.TA.26-24-j19-1.220.R
24.TA.30-57-Y-j19-1.1.H24.TA.30-57-Y-j19-1.1.H
35.TA.30-57-Y-j19-1.1.H35.TA.30-57-Y-j19-1.1.H
46.TA.30-57-Y-j19-1.1.H46.TA.30-57-Y-j19-1.1.H
\n", + "
" + ], + "text/plain": [ + " 0 trip_int trip_id\n", + "0 403.TA.26-24-j19-1.220.R 0 403.TA.26-24-j19-1.220.R\n", + "1 425.TA.26-24-j19-1.220.R 1 425.TA.26-24-j19-1.220.R\n", + "2 4.TA.30-57-Y-j19-1.1.H 2 4.TA.30-57-Y-j19-1.1.H\n", + "3 5.TA.30-57-Y-j19-1.1.H 3 5.TA.30-57-Y-j19-1.1.H\n", + "4 6.TA.30-57-Y-j19-1.1.H 4 6.TA.30-57-Y-j19-1.1.H" + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trip_df = pd.DataFrame(stop_times_routes.sort_values([\"route_int\", \"arrival_time\"]).trip_id.unique())\n", + "trip_df[\"trip_int\"] = trip_df.index\n", + "trip_df[\"trip_id\"] = trip_df.iloc[:,0]\n", + "trip_df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "19390" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#check number trip_id\n", + "trip_df.trip_id.nunique()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We join trip_id to stop_times dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#join to get trip_int in stop_times\n", + "stop_times_routes_trip = stop_times_routes.join(trip_df.set_index(\"trip_id\"), how=\"inner\", on=\"trip_id\" , lsuffix='_left', rsuffix='_right').drop_duplicates()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#save ordered stop_times\n", + "stop_times_routes_trip = stop_times_routes_trip.sort_values([\"route_int\", \"trip_int\", \"stop_sequence\"])\n", + "stop_times_routes_trip.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#check if manipulations did not destroy trips\n", + "stop_times_routes_trip.trip_id.nunique()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "#### Stop_int\n", + "Stop_int id needs to be ordered by route, trip and stop sequence" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#check number stops at entry\n", + "stop_times_routes_trip.stop_id.nunique()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "stop_times_routes_trip is already in the right order. We create a dataframe to create stop_int" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stops_df = pd.DataFrame(stop_times_routes_trip.stop_id.unique())\n", + "stops_df[\"stop_int\"] = stops_df.index\n", + "stops_df[\"stop_id\"] = stops_df.iloc[:,0]\n", + "stops_df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#check if number stop_int correct\n", + "stops_df.stop_int.nunique()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We add stop_int information to stop_times" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#join to get stop_int\n", + "stop_times_routes_trip_stop = stop_times_routes_trip.join(stops_df.set_index(\"stop_id\"), how=\"inner\", on=\"stop_id\", lsuffix='_left', rsuffix='_right').drop_duplicates()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stop_times_routes_trip_stop.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#check if no stops deleted during manipulation\n", + "stop_times_routes_trip_stop.stop_id.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stop_times_routes_trip_stop.stop_int.max()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {},
"outputs": [], + "source": [ + "#keep only useful columns \n", + "stop_times_int = stop_times_routes_trip_stop[[\"route_int\", \"trip_int\", \"stop_int\", \"stop_sequence\", \"arrival_time\", \"departure_time\",\\\n", + " \"route_id\", \"trip_id\", \"stop_id\", \\\n", + " \"route_desc\", \"stop_id_raw\", \"stop_name\"]].sort_values([\"route_int\", \"trip_int\", \"stop_sequence\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stop_times_int = stop_times_int.reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stop_times_int.loc[100:150].head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "An overview of number of routes, trips and stops" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stop_times_int.route_int.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stop_times_int.trip_int.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stop_times_int.stop_int.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stop_times_int.stop_int.count()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transfer: delete transfer to same stop & get stop_int & stop_int2\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "12564" + ] + }, + "execution_count": 7, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "#check number stops transfers\n", + "transfers.stop_id.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0stop_idstop_id2distanceTransfer_time_secstop_namestop_name2
00850092685906160.122430146Oetwil a.d.L., SchweizäckerGeroldswil, Schweizäcker
11850092685907370.300175360Oetwil a.d.L., SchweizäckerOetwil an der Limmat, Halde
2285021868502186:0:10.0067628Dietikon StoffelbachDietikon Stoffelbach
3385021868502186:0:20.01352416Dietikon StoffelbachDietikon Stoffelbach
4485021868502186P0.0000000Dietikon StoffelbachDietikon Stoffelbach
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 stop_id stop_id2 distance Transfer_time_sec \\\n", + "0 0 8500926 8590616 0.122430 146 \n", + "1 1 8500926 8590737 0.300175 360 \n", + "2 2 8502186 8502186:0:1 0.006762 8 \n", + "3 3 8502186 8502186:0:2 0.013524 16 \n", + "4 4 8502186 8502186P 0.000000 0 \n", + "\n", + " stop_name stop_name2 \n", + "0 Oetwil a.d.L., Schweizäcker Geroldswil, Schweizäcker \n", + "1 Oetwil a.d.L., Schweizäcker Oetwil an der Limmat, Halde \n", + "2 Dietikon Stoffelbach Dietikon Stoffelbach \n", + "3 Dietikon Stoffelbach Dietikon Stoffelbach \n", + "4 Dietikon Stoffelbach Dietikon Stoffelbach " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transfers.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We delete transfers to the same stop" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "transfers_df = transfers[transfers['stop_id'] != transfers['stop_id2']]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "12564" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transfers_df.stop_id.count()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We create the stop_int column in transfers. 
This action eliminates stops not in stop_times" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "stop_times_int = stop_times_curated" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "transfers_df = transfers_df.merge(stop_times_int[[\"stop_id\", \"stop_int\"]].set_index(\"stop_id\"), how=\"inner\", on = \"stop_id\").drop_duplicates()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "transfers_df.stop_id.count()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "transfers_df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#create dataframe with stops\n", + "df_stop_int2 = stop_times_int[[\"stop_id\", \"stop_int\"]].rename(columns={\"stop_id\": \"stop_id2\", \"stop_int\" : \"stop_int_2\"})\n", + "df_stop_int2.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We add the stop id for the arrival destination, stop_int2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "transfers_df_int = transfers_df.merge(df_stop_int2.set_index(\"stop_id2\"), how=\"inner\", on = \"stop_id2\").drop_duplicates()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "transfers_df_int.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "transfers_df_int.stop_id.count()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "transfers = transfers_df_int" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [
"#check number unique stops2 in transfers\n", + "transfers.stop_id2.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "transfers.stop_id.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0route_idstop_id_generaltrip_idstop_idarrival_timedeparture_timestop_sequencestop_namestop_latstop_lontrip_headsigntrip_short_namedirection_iddeparture_first_stoproute_intstop_countstop_introute_desc
0026-66-j19-1859120517.TA.26-66-j19-1.1.H859120517:00:0017:00:003Zürich, Hürlimannplatz47.3650668.526539Zürich, Neubühl3870016:55:001225121317Bus
1126-66-j19-1859141517.TA.26-66-j19-1.1.H859141517:02:0017:02:004Zürich, Waffenplatzstrasse47.3614828.525749Zürich, Neubühl3870016:55:001225121267Bus
2226-66-j19-1859120417.TA.26-66-j19-1.1.H859120417:03:0017:03:005Zürich, Hügelstrasse47.3585438.526997Zürich, Neubühl3870016:55:0012251267Bus
3326-66-j19-1859109817.TA.26-66-j19-1.1.H859109817:04:0017:04:006Zürich, Brunau/Mutschellenstr.47.3551478.527141Zürich, Neubühl3870016:55:00122512512Bus
4426-66-j19-1859139217.TA.26-66-j19-1.1.H859139217:05:0017:05:007Zürich, Thujastrasse47.3501878.527806Zürich, Neubühl3870016:55:00122512403Bus
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 route_id stop_id_general trip_id stop_id \\\n", + "0 0 26-66-j19-1 8591205 17.TA.26-66-j19-1.1.H 8591205 \n", + "1 1 26-66-j19-1 8591415 17.TA.26-66-j19-1.1.H 8591415 \n", + "2 2 26-66-j19-1 8591204 17.TA.26-66-j19-1.1.H 8591204 \n", + "3 3 26-66-j19-1 8591098 17.TA.26-66-j19-1.1.H 8591098 \n", + "4 4 26-66-j19-1 8591392 17.TA.26-66-j19-1.1.H 8591392 \n", + "\n", + " arrival_time departure_time stop_sequence stop_name \\\n", + "0 17:00:00 17:00:00 3 Zürich, Hürlimannplatz \n", + "1 17:02:00 17:02:00 4 Zürich, Waffenplatzstrasse \n", + "2 17:03:00 17:03:00 5 Zürich, Hügelstrasse \n", + "3 17:04:00 17:04:00 6 Zürich, Brunau/Mutschellenstr. \n", + "4 17:05:00 17:05:00 7 Zürich, Thujastrasse \n", + "\n", + " stop_lat stop_lon trip_headsign trip_short_name direction_id \\\n", + "0 47.365066 8.526539 Zürich, Neubühl 3870 0 \n", + "1 47.361482 8.525749 Zürich, Neubühl 3870 0 \n", + "2 47.358543 8.526997 Zürich, Neubühl 3870 0 \n", + "3 47.355147 8.527141 Zürich, Neubühl 3870 0 \n", + "4 47.350187 8.527806 Zürich, Neubühl 3870 0 \n", + "\n", + " departure_first_stop route_int stop_count stop_int route_desc \n", + "0 16:55:00 1225 12 1317 Bus \n", + "1 16:55:00 1225 12 1267 Bus \n", + "2 16:55:00 1225 12 67 Bus \n", + "3 16:55:00 1225 12 512 Bus \n", + "4 16:55:00 1225 12 403 Bus " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stop_times_ordered = stop_times_curated\n", + "stop_times_ordered.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We start by making sure the order is correct" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stop_times_ordered = stop_times_int.sort_values(by=[\"route_int\", \"trip_int\", \"stop_sequence\"])\n", + "stop_times_ordered.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + 
"source": [ + "stop_times_ordered[[\"arrival_time\", \"departure_time\"]].head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We add None to first arrival time and last departure time." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#adding a shift\n", + "stop_times_ordered[\"sequence_shift_1\"] = stop_times_ordered[\"stop_sequence\"].shift(-1, fill_value=0)\n", + "stop_times_ordered.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stop_times_ordered['departure_time'] = np.where((stop_times_ordered[\"stop_sequence\"] > stop_times_ordered[\"sequence_shift_1\"]), None, stop_times_ordered['departure_time'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stop_times_ordered[\"arrival_time\"] = np.where((stop_times_ordered[\"stop_sequence\"] == 1), None, stop_times_ordered['arrival_time'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stop_times_ordered[[\"arrival_time\",\"departure_time\", \"stop_sequence\", \"sequence_shift_1\"]].head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Array structure preparation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### StopTimes: \n", + "[[departure_route0_trip0_stop0, arrival_route0_trip0_stop_0], [departure_route0_trip0_stop1, arrival_route0_trip0_stop_1], …], [[departure_route0_trip1_stop0, arrival_route0_trip1_stop_0], …], ….], [[[departure_route1_trip0_stop0, arrival_route1_trip0_stop_0], …], [[departure_route1_trip1_stop0, arrival_route0_trip1_stop_0], …], ….], …]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We transform it in datetime as required by the raptor algorithm" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stop_times_ordered['arrival_time'] = pd.to_datetime(stop_times_ordered['arrival_time'])\n", + "stop_times_ordered['departure_time'] = pd.to_datetime(stop_times_ordered['departure_time'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stop_times_ordered[[\"arrival_time\", \"departure_time\"]].head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open('../data/stop_times_df.pkl','wb') as f: pickle.dump(stop_times_ordered, f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stop_times_ordered = stop_times_ordered.sort_values(by=[\"route_int\", \"trip_int\", \"stop_sequence\"])\n", + "stop_times_ordered.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And we transform it to array, ready to be used by raptor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stop_times_array = stop_times_ordered[[\"arrival_time\", \"departure_time\"]].to_numpy()\n", + "stop_times_array" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.size(stop_times_array,0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open('../data/stop_times_array.pkl','wb') as f: pickle.dump(stop_times_array, f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Routes: \n", + "[[route0_nr.Trips, route0_nr. Stops, route0_pointerRoutes, route0_pointerStops_times],[route1_nr.Trips, route1_nr. 
Stops, route1_pointerRoutes, route1_pointerStops_times],…]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We start by getting the number of trips and stops for each route" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "distinct_trips_stops = stop_times_ordered.groupby([\"route_int\"]).nunique()[[\"trip_int\",\"stop_int\"]].sort_index().rename(columns={\"trip_int\": \"n_Trips\", \"stop_int\": \"n_stops\"})\n", + "distinct_trips_stops.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "distinct_trips_stops.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We create the pointer for the route stops, by adding the unique stops for each route" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "distinct_trips_stops['pointer_routes_stops'] = distinct_trips_stops.n_stops.cumsum().shift(1, fill_value=0)\n", + "distinct_trips_stops.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We create the pointer for stop_times by adding the number of stops in each route, counting duplicates (due to several trips)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "distinct_trips_stops[\"pointer_stop_times\"] = (stop_times_ordered.groupby([\"route_int\"]).count().stop_id).cumsum().shift(1, fill_value=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "distinct_trips_stops[\"pointer_routes_stops_shift\"] = distinct_trips_stops['pointer_routes_stops'].shift(-1, fill_value=0)\n", + "distinct_trips_stops[\"pointer_stop_times_shift\"] = distinct_trips_stops['pointer_stop_times'].shift(-1, fill_value=0)\n", + "distinct_trips_stops.head(5)" + ] + }, + {
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "distinct_trips_stops['pointer_routes_stops'] = np.where((distinct_trips_stops[\"pointer_routes_stops\"] == distinct_trips_stops[\"pointer_routes_stops_shift\"]), None, distinct_trips_stops['pointer_routes_stops'])\n", + "distinct_trips_stops['pointer_stop_times'] = np.where((distinct_trips_stops[\"pointer_stop_times\"] == distinct_trips_stops[\"pointer_stop_times_shift\"]), None, distinct_trips_stops['pointer_stop_times'])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "distinct_trips_stops.isna().any()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open('../data/routes_array_df.pkl','wb') as f: pickle.dump(distinct_trips_stops[['n_Trips', 'n_stops', 'pointer_routes_stops', 'pointer_stop_times']], f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "distinct_trips_stops.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "routes_array = distinct_trips_stops[['n_Trips', 'n_stops', 'pointer_routes_stops', 'pointer_stop_times']].to_numpy()\n", + "routes_array" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.size(routes_array, 0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open('../data/routes_array.pkl','wb') as f: pickle.dump(routes_array, f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "RouteStops: [route0_stop0, route0_stop1,…, route1_stop0, route1_stop1,…, …]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "route_stops = 
stop_times_ordered.sort_values([\"route_int\", \"stop_sequence\"])\n", + "route_stops = route_stops[['route_int', 'stop_int']].drop_duplicates().reset_index()\n", + "route_stops.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "route_stops.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "route_stops.route_int.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open('../data/route_stops_df.pkl','wb') as f: pickle.dump(route_stops, f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "route_stops_array = route_stops.stop_int.to_numpy()\n", + "route_stops_array" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.size(np.unique(route_stops_array))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.size(route_stops_array, 0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "route_stops_array.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open('../data/route_stops_array.pkl','wb') as f: pickle.dump(route_stops_array, f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check if pointers are correct\n", + "It is fundamental that the indexes, that serve as pointers, in Routes are correct" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We start by looking at where the indexes for stop_times and route_stops diverge. This will allow us to change. 
We can see that Route stops should have a new route at 3 while stop_times should have it at 78, so we try with that" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "distinct_trips_stops.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can check if the pointer indicates the routes index number. The pointer_routes should indicate the first stop of a new route. We try with 3 to see if route_stops has a new route at this index. It does, so it works" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "route_stops.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We go and see if stop_times has a new route at 78. It does, so it works" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stop_times_ordered.loc[75:80].head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Stops: [[stop0_pointerRoutes, stop0_pointerTransfer], [stop1_pointerRoutes, stop1_pointerTransfer], …]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stops_join = route_stops.join(transfers.set_index(\"stop_int\"), how=\"left\", on=\"stop_int\").drop_duplicates()\n", + "stops_join.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stops_join.stop_int.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "distinct_route_transfers = stops_join.sort_values(\"stop_int\").groupby([\"stop_int\"]).nunique().rename(columns={\"route_int\": \"n_Routes\", \"stop_int_2\": \"n_Transfers\"})\n", + "distinct_route_transfers = distinct_route_transfers[[\"n_Routes\", \"n_Transfers\"]].sort_index()\n", + 
"distinct_route_transfers.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "distinct_route_transfers['pointer_stop_routes'] = distinct_route_transfers.n_Routes.cumsum().shift(1, fill_value=0)\n", + "distinct_route_transfers['pointer_transfers'] = distinct_route_transfers.n_Transfers.cumsum().shift(1, fill_value=0)\n", + "distinct_route_transfers.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "distinct_route_transfers[\"pointer_stop_routes_shift\"] = distinct_route_transfers['pointer_stop_routes'].shift(-1, fill_value=0)\n", + "distinct_route_transfers[\"pointer_transfers_shift\"] = distinct_route_transfers['pointer_transfers'].shift(-1, fill_value=0)\n", + "distinct_route_transfers.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "distinct_route_transfers['pointer_stop_routes'] = np.where((distinct_route_transfers[\"pointer_stop_routes\"] == distinct_route_transfers[\"pointer_stop_routes_shift\"]), None, distinct_route_transfers['pointer_stop_routes'])\n", + "distinct_route_transfers['pointer_transfers'] = np.where((distinct_route_transfers[\"pointer_transfers\"] == distinct_route_transfers[\"pointer_transfers_shift\"]), None, distinct_route_transfers['pointer_transfers'])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "distinct_route_transfers.isna().any()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stops_df = distinct_route_transfers[['pointer_stop_routes', 'pointer_transfers']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open('../data/stops_df.pkl','wb') as f: pickle.dump(stops_df, f)" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stops_array = stops_df.to_numpy()\n", + "stops_array" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.size(stops_array, 0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stops_array.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open('../data/stops_array.pkl','wb') as f: pickle.dump(stops_array, f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "StopRoutes: [stop0_route1, stop0_route3, stop1_route1, stop2_route1, stop1_route4, …]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stop_routes = stop_times_ordered[[\"route_int\", \"stop_int\", \"stop_id\"]].drop_duplicates().sort_values([\"stop_int\", \"route_int\"])\n", + "stop_routes = stop_routes.reset_index()\n", + "stop_routes.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stop_routes.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stop_times_curated.route_id.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stop_routes.route_int.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open('../data/stop_routes_df.pkl','wb') as f: pickle.dump(stop_routes, f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stop_routes_array = stop_routes[\"route_int\"].to_numpy()\n", + "stop_routes_array" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + 
"outputs": [], + "source": [ + "np.size(stop_routes_array, 0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stop_routes_array.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open('../data/stop_routes_array.pkl','wb') as f: pickle.dump(stop_routes_array, f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Transfer: [[[stop0_nameTargetStop1, transferTime1], [stop0_nameTargetStop2, transferTime2],….], [stop1_nameTargetStop1, transferTime1], [stop1_nameTargetStop2, transferTime2],….],…]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "transfers.stop_id.count()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "transfer_pandas = transfers[[\"stop_int\",\"stop_int_2\", \"Transfer_time_sec\", \"stop_id_raw\"]].sort_values([\"stop_int\", \"stop_int_2\", \"stop_id_raw\"]).drop_duplicates([\"stop_int\", \"stop_int_2\"])\n", + "transfer_pandas = transfer_pandas.reset_index(drop=True)\n", + "transfer_pandas.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "transfer_pandas.stop_int_2.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open('../data/transfer_df.pkl','wb') as f: pickle.dump(transfers.sort_values(\"stop_id\"), f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "transfer_array = transfer_pandas[[\"stop_int_2\", \"Transfer_time_sec\"]].to_numpy()\n", + "transfer_array" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open('../data/transfer_array.pkl','wb') as f: 
pickle.dump(transfer_array, f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.size(transfer_array, 0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Check if indexes in stops is correct" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see first the pointers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stops_df.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see that at the index 8 there should be a new stop. we check and it is false" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "transfer_pandas.loc[5:10].head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see that at index 4 we should have a new stop. we check and it true" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stop_routes.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stop_routes.loc[stop_routes['stop_int'] == 172]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "route_stops.loc[route_stops['stop_int'] == 172]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "read files as pickles" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open('../data/stop_times_array.pkl','rb') as f: arrayname1 = pickle.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open('../data/routes_array.pkl','rb') as f: arrayname2 = pickle.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": {}, + "outputs": [], + "source": [ + "with open('../data/route_stops_array.pkl','rb') as f: arrayname3 = pickle.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "arrayname1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "arrayname2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "arrayname3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/OLD_hdfs_data_processing_spark.ipynb b/notebooks/old_notebooks/OLD_hdfs_data_processing_spark.ipynb similarity index 100% rename from notebooks/OLD_hdfs_data_processing_spark.ipynb rename to notebooks/old_notebooks/OLD_hdfs_data_processing_spark.ipynb diff --git a/notebooks/old_notebooks/debugging.ipynb b/notebooks/old_notebooks/debugging.ipynb new file mode 100644 index 0000000..9a66390 --- /dev/null +++ b/notebooks/old_notebooks/debugging.ipynb @@ -0,0 +1,1469 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pickle\n", + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def pkload(path):\n", + " with open(path, 'rb') 
as f:\n", + " obj = pickle.load(f)\n", + " return obj" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Searching for journeys from Uster, Gschwader (stop 8588052) to Buchs ZH, Furttalstrasse (stop 8595356) with arrival at 17:30 leads to a footpath of over 3.2km + in reality, while the algorithm prints this:\n", + "\n", + "\" Walk 2.1 minutes from Birmensdorf ZH (stop 8502221)\n", + " to Dällikon, Industrie (stop 8576276)\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexroute_idstop_id_generaltrip_idstop_idarrival_timedeparture_timestop_sequencestop_namestop_latstop_lontrip_headsigntrip_short_namedirection_iddeparture_first_stoproute_intstop_countstop_introute_descmonotonically_increasing_id
2770819701326-5-A-j19-18502221114.TA.26-5-A-j19-1.37.R8502221:0:22020-05-24 19:35:002020-05-24 19:35:009Birmensdorf ZH47.3574968.437543Pfäffikon SZ18575119:22:0014910276S-Bahn180388626589
\n", + "
" + ], + "text/plain": [ + " index route_id stop_id_general trip_id \\\n", + "27708 197013 26-5-A-j19-1 8502221 114.TA.26-5-A-j19-1.37.R \n", + "\n", + " stop_id arrival_time departure_time stop_sequence \\\n", + "27708 8502221:0:2 2020-05-24 19:35:00 2020-05-24 19:35:00 9 \n", + "\n", + " stop_name stop_lat stop_lon trip_headsign trip_short_name \\\n", + "27708 Birmensdorf ZH 47.357496 8.437543 Pfäffikon SZ 18575 \n", + "\n", + " direction_id departure_first_stop route_int stop_count stop_int \\\n", + "27708 1 19:22:00 149 10 276 \n", + "\n", + " route_desc monotonically_increasing_id \n", + "27708 S-Bahn 180388626589 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Birmensdorf ZH\n", + "stop_times = pkload(\"../data/stop_times_df_cyril.pkl\")\n", + "stop_times[stop_times['stop_id_general']==8502221].head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexroute_idstop_id_generaltrip_idstop_idarrival_timedeparture_timestop_sequencestop_namestop_latstop_lontrip_headsigntrip_short_namedirection_iddeparture_first_stoproute_intstop_countstop_introute_descmonotonically_increasing_id
193522235226-449-j19-1857627617.TA.26-449-j19-1.1.H8576276NaT2020-05-24 07:00:002Dällikon, Industrie47.4447378.438783Buchs-Dällikon, Bahnhof4747007:00:00193474Bus8589935205
\n", + "
" + ], + "text/plain": [ + " index route_id stop_id_general trip_id stop_id \\\n", + "1935 222352 26-449-j19-1 8576276 17.TA.26-449-j19-1.1.H 8576276 \n", + "\n", + " arrival_time departure_time stop_sequence stop_name \\\n", + "1935 NaT 2020-05-24 07:00:00 2 Dällikon, Industrie \n", + "\n", + " stop_lat stop_lon trip_headsign trip_short_name \\\n", + "1935 47.444737 8.438783 Buchs-Dällikon, Bahnhof 4747 \n", + "\n", + " direction_id departure_first_stop route_int stop_count stop_int \\\n", + "1935 0 07:00:00 19 3 474 \n", + "\n", + " route_desc monotonically_increasing_id \n", + "1935 Bus 8589935205 " + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Dallikon Industrie\n", + "stop_times[stop_times['stop_id_general']==8576276].head(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A google maps footpaths using GPS coordinates yields a walk of 14.1 km. Is there a footpath defined between those two stops in transfers ?" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "transfers = pkload(\"../data/transfer_df_cyril.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexstop_id_generalstop_intstop_lat_firststop_lon_firststop_name_firststop_id_general_2stop_int_2stop_lat_first_2stop_lon_first_2stop_name_first_2distancewalking_timemonotonically_increasing_id
13003887850222127647.3575578.437543Birmensdorf ZH857371847347.3571258.438801Birmensdorf ZH, Bahnhof0.106219127352187318287
13013888850222127647.3575578.437543Birmensdorf ZH850295087747.3539368.437175Birmensdorf ZH, Zentrum0.403584484352187318288
13023889850222127647.3575578.437543Birmensdorf ZH8583870115447.3572348.437013Birmensdorf ZH, Bahnhof Süd0.05366664352187318289
\n", + "
" + ], + "text/plain": [ + " index stop_id_general stop_int stop_lat_first stop_lon_first \\\n", + "1300 3887 8502221 276 47.357557 8.437543 \n", + "1301 3888 8502221 276 47.357557 8.437543 \n", + "1302 3889 8502221 276 47.357557 8.437543 \n", + "\n", + " stop_name_first stop_id_general_2 stop_int_2 stop_lat_first_2 \\\n", + "1300 Birmensdorf ZH 8573718 473 47.357125 \n", + "1301 Birmensdorf ZH 8502950 877 47.353936 \n", + "1302 Birmensdorf ZH 8583870 1154 47.357234 \n", + "\n", + " stop_lon_first_2 stop_name_first_2 distance walking_time \\\n", + "1300 8.438801 Birmensdorf ZH, Bahnhof 0.106219 127 \n", + "1301 8.437175 Birmensdorf ZH, Zentrum 0.403584 484 \n", + "1302 8.437013 Birmensdorf ZH, Bahnhof Süd 0.053666 64 \n", + "\n", + " monotonically_increasing_id \n", + "1300 352187318287 \n", + "1301 352187318288 \n", + "1302 352187318289 " + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# transfers from Birmensdorf ZH\n", + "transfers[transfers['stop_id_general']==8502221]" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "# Transfers from Dallikon Industrie" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexstop_id_generalstop_intstop_lat_firststop_lon_firststop_name_firststop_id_general_2stop_int_2stop_lat_first_2stop_lon_first_2stop_name_first_2distancewalking_timemonotonically_increasing_id
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [index, stop_id_general, stop_int, stop_lat_first, stop_lon_first, stop_name_first, stop_id_general_2, stop_int_2, stop_lat_first_2, stop_lon_first_2, stop_name_first_2, distance, walking_time, monotonically_increasing_id]\n", + "Index: []" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transfers[transfers['stop_id_general']==8576276]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are no footpaths between Birmensdorf ZH and Dallikon Industrie, and interestingly, Dallikon Industrie has no footpaths at all. That points to a bug with the gestion of 'None' pointers in the array stops for footpaths" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "transfer_array = pkload(\"../data/transfer_array_cyril.pkl\")\n", + "stops = pkload(\"../data/stops_array_cyril.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[3075, 1297.0],\n", + " [3119, 1300.0],\n", + " [3131, 1303.0]], dtype=object)" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# transfers from Birmensdorf ZH\n", + "stop_int = 276\n", + "stops[stop_int-1:stop_int+2]" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 473, 127],\n", + " [ 877, 484],\n", + " [1154, 64]])" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transfer_array[int(stops[stop_int][1]):int(stops[stop_int+1][1])]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**The first entry of transfer_array[276] corresponding to Birmensdorf ZH is 473, which is one less than 474, the stop_id of Dallikon 
Industrie**. That may be a clue to what's going wrong with the algorithm." + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[5408, 2157.0],\n", + " [5417, nan],\n", + " [5420, 2160.0]], dtype=object)" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# transfers from Dallikon Industrie\n", + "stop_int = 474\n", + "stops[stop_int-1:stop_int+2]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "cannot convert float NaN to integer", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Trying to access the transfer for the nan pointer (SHOULD FAIL)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mtransfer_array\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstops\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstop_int\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstops\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstop_int\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mValueError\u001b[0m: cannot convert float NaN to integer" + ] + } + ], + "source": [ + "# Trying to access the transfer for the nan pointer (SHOULD FAIL)\n", + 
"transfer_array[int(stops[stop_int][1]):int(stops[stop_int+1][1])]" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [], + "source": [ + "# checking out what's around the nan pointer:" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 276, 127],\n", + " [ 877, 450],\n", + " [1154, 162]])" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transfer_array[int(stops[stop_int-1][1]):int(stops[stop_int+1][1])]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The first entry of transfer_array[stop_int-1] is 276, the stop_int of Birmensdorf ZH." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## recapitulating the bug with another journey\n", + "\n", + "Searching for journeys from Dübendorf, Branzenäsch (stop 8590551) to Kloten, Weinbergstrasse (stop 8579967) with arrival at 17:30 gives an impossible first walk from Dübendorf, Branzenäsch to Uetliberg (stop 8503057)." + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexroute_idstop_id_generaltrip_idstop_idarrival_timedeparture_timestop_sequencestop_namestop_latstop_lontrip_headsigntrip_short_namedirection_iddeparture_first_stoproute_intstop_countstop_introute_descmonotonically_increasing_id
9587311035026-752-j19-18590551190.TA.26-752-j19-1.4.R85905512020-05-24 07:16:002020-05-24 07:16:0011Dübendorf, Branzenäsch47.3946658.631157Dübendorf, Kunsteisbahn1420107:06:0056315503Bus627065226053
\n", + "
" + ], + "text/plain": [ + " index route_id stop_id_general trip_id \\\n", + "95873 110350 26-752-j19-1 8590551 190.TA.26-752-j19-1.4.R \n", + "\n", + " stop_id arrival_time departure_time stop_sequence \\\n", + "95873 8590551 2020-05-24 07:16:00 2020-05-24 07:16:00 11 \n", + "\n", + " stop_name stop_lat stop_lon trip_headsign \\\n", + "95873 Dübendorf, Branzenäsch 47.394665 8.631157 Dübendorf, Kunsteisbahn \n", + "\n", + " trip_short_name direction_id departure_first_stop route_int \\\n", + "95873 1420 1 07:06:00 563 \n", + "\n", + " stop_count stop_int route_desc monotonically_increasing_id \n", + "95873 15 503 Bus 627065226053 " + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Dübendorf, Branzenäsch\n", + "stop_times[stop_times['stop_id_general']== 8590551].head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexroute_idstop_id_generaltrip_idstop_idarrival_timedeparture_timestop_sequencestop_namestop_latstop_lontrip_headsigntrip_short_namedirection_iddeparture_first_stoproute_intstop_countstop_introute_descmonotonically_increasing_id
53045174926-10-B-j19-18503057181.TA.26-10-B-j19-1.9.H85030572020-05-24 07:14:00NaT8Uetliberg47.3523668.487651Uetliberg23511007:03:00624415S-Bahn25769805095
\n", + "
" + ], + "text/plain": [ + " index route_id stop_id_general trip_id \\\n", + "5304 51749 26-10-B-j19-1 8503057 181.TA.26-10-B-j19-1.9.H \n", + "\n", + " stop_id arrival_time departure_time stop_sequence stop_name \\\n", + "5304 8503057 2020-05-24 07:14:00 NaT 8 Uetliberg \n", + "\n", + " stop_lat stop_lon trip_headsign trip_short_name direction_id \\\n", + "5304 47.352366 8.487651 Uetliberg 23511 0 \n", + "\n", + " departure_first_stop route_int stop_count stop_int route_desc \\\n", + "5304 07:03:00 62 4 415 S-Bahn \n", + "\n", + " monotonically_increasing_id \n", + "5304 25769805095 " + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Uetliberg (stop 8503057)\n", + "stop_times[stop_times['stop_id_general']== 8503057].head(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A google maps search for footpaths between the coordinates yields a 14.5 km walk." + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexstop_id_generalstop_intstop_lat_firststop_lon_firststop_name_firststop_id_general_2stop_int_2stop_lat_first_2stop_lon_first_2stop_name_first_2distancewalking_timemonotonically_increasing_id
22562938859055150347.3946658.631157Dübendorf, Branzenäsch859058741447.3930538.633367Dübendorf, Sonnenberg0.244501293618475290624
22572939859055150347.3946658.631157Dübendorf, Branzenäsch859055559947.3948778.628714Dübendorf, Claridenstrasse0.185439222618475290625
22582940859055150347.3946658.631157Dübendorf, Branzenäsch859058188647.3915578.634876Dübendorf, Raubbühl0.444715533618475290626
22592941859055150347.3946658.631157Dübendorf, Branzenäsch859055093147.3970488.625614Dübendorf, Bettlistrasse0.494279593618475290627
22602942859055150347.3946658.631157Dübendorf, Branzenäsch8590590122647.3905358.632469Dübendorf, Sunnhalde0.469644563618475290628
22612943859055150347.3946658.631157Dübendorf, Branzenäsch8590562124447.3962408.629935Dübendorf, Glärnischstrasse0.197806237618475290629
22622944859055150347.3946658.631157Dübendorf, Branzenäsch8590549139247.3948178.634580Dübendorf, Bauhof0.258181309618475290630
\n", + "
" + ], + "text/plain": [ + " index stop_id_general stop_int stop_lat_first stop_lon_first \\\n", + "2256 2938 8590551 503 47.394665 8.631157 \n", + "2257 2939 8590551 503 47.394665 8.631157 \n", + "2258 2940 8590551 503 47.394665 8.631157 \n", + "2259 2941 8590551 503 47.394665 8.631157 \n", + "2260 2942 8590551 503 47.394665 8.631157 \n", + "2261 2943 8590551 503 47.394665 8.631157 \n", + "2262 2944 8590551 503 47.394665 8.631157 \n", + "\n", + " stop_name_first stop_id_general_2 stop_int_2 stop_lat_first_2 \\\n", + "2256 Dübendorf, Branzenäsch 8590587 414 47.393053 \n", + "2257 Dübendorf, Branzenäsch 8590555 599 47.394877 \n", + "2258 Dübendorf, Branzenäsch 8590581 886 47.391557 \n", + "2259 Dübendorf, Branzenäsch 8590550 931 47.397048 \n", + "2260 Dübendorf, Branzenäsch 8590590 1226 47.390535 \n", + "2261 Dübendorf, Branzenäsch 8590562 1244 47.396240 \n", + "2262 Dübendorf, Branzenäsch 8590549 1392 47.394817 \n", + "\n", + " stop_lon_first_2 stop_name_first_2 distance walking_time \\\n", + "2256 8.633367 Dübendorf, Sonnenberg 0.244501 293 \n", + "2257 8.628714 Dübendorf, Claridenstrasse 0.185439 222 \n", + "2258 8.634876 Dübendorf, Raubbühl 0.444715 533 \n", + "2259 8.625614 Dübendorf, Bettlistrasse 0.494279 593 \n", + "2260 8.632469 Dübendorf, Sunnhalde 0.469644 563 \n", + "2261 8.629935 Dübendorf, Glärnischstrasse 0.197806 237 \n", + "2262 8.634580 Dübendorf, Bauhof 0.258181 309 \n", + "\n", + " monotonically_increasing_id \n", + "2256 618475290624 \n", + "2257 618475290625 \n", + "2258 618475290626 \n", + "2259 618475290627 \n", + "2260 618475290628 \n", + "2261 618475290629 \n", + "2262 618475290630 " + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# footpaths from Dübendorf, Branzenäsch\n", + "transfers[transfers['stop_id_general']== 8590551]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "No sign of Uetliberg here." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexstop_id_generalstop_intstop_lat_firststop_lon_firststop_name_firststop_id_general_2stop_int_2stop_lat_first_2stop_lon_first_2stop_name_first_2distancewalking_timemonotonically_increasing_id
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [index, stop_id_general, stop_int, stop_lat_first, stop_lon_first, stop_name_first, stop_id_general_2, stop_int_2, stop_lat_first_2, stop_lon_first_2, stop_name_first_2, distance, walking_time, monotonically_increasing_id]\n", + "Index: []" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# transfers from Uetliberg (stop 8503057)\n", + "transfers[transfers['stop_id_general']== 8503057]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Again, utliberg has no footpaths, like Dallikon Industrie in the previous bug." + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[5599, 2253.0],\n", + " [5603, 2256.0],\n", + " [5606, 2263.0]], dtype=object)" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# footpaths from Dübendorf, Branzenäsch\n", + "stop_int = 503\n", + "stops[stop_int-1:stop_int+2]" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 414, 293],\n", + " [ 599, 222],\n", + " [ 886, 533],\n", + " [ 931, 593],\n", + " [1226, 563],\n", + " [1244, 237],\n", + " [1392, 309]])" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transfer_array[int(stops[stop_int][1]):int(stops[stop_int+1][1])]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice here again that the first transfer from Dübendorf, Branzenäsch goes to stop_int= 414, which is one less than the stop_int of Utliberg (415)" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[4850, 1901.0],\n", + " [4853, nan],\n", + " [4860, 1907.0]], dtype=object)" + ] + 
}, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# footpahts from Utliberg:\n", + "stop_int = 415\n", + "stops[stop_int-1:stop_int+2]" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "cannot convert float NaN to integer", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# expected to fail: accessing a none pointer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mtransfer_array\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstops\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstop_int\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstops\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstop_int\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mValueError\u001b[0m: cannot convert float NaN to integer" + ] + } + ], + "source": [ + "# expected to fail: accessing a none pointer\n", + "transfer_array[int(stops[stop_int][1]):int(stops[stop_int+1][1])]" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "cannot convert float NaN to integer", + "output_type": "error", + "traceback": [ + 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtransfer_array\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstops\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstop_int\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstops\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstop_int\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mValueError\u001b[0m: cannot convert float NaN to integer" + ] + } + ], + "source": [ + "transfer_array[int(stops[stop_int][1]):int(stops[stop_int+1][1])]" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 503, 293],\n", + " [ 599, 485],\n", + " [ 886, 241],\n", + " [1226, 345],\n", + " [1244, 526],\n", + " [1392, 259]])" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transfer_array[int(stops[stop_int-1][1]):int(stops[stop_int+1][1])]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Again, the first entry of transfer_array[stop_int-1] is 503, the stop_int of Dübendorf, Branzenäsch !" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Testing Felix's stop conversion:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([False])" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.isnan(np.arange(1))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0 75]\n", + "[0 0]\n", + "(1407, 2)\n", + "[ 0 0 0 75]\n" + ] + }, + { + "data": { + "text/plain": [ + "array([[ 0, 11, 0, 2],\n", + " [ 11, 20, 2, 7],\n", + " [ 20, 38, 7, 22],\n", + " ...,\n", + " [15303, 15334, 6242, 6250],\n", + " [15334, 15339, 6250, 6257],\n", + " [15339, 15344, 6257, 6264]], dtype=uint32)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stops = pkload(\"../data/stops_array_cyril.pkl\")\n", + "stopRoutes = pkload(\"../data/stop_routes_array_cyril.pkl\")\n", + "print(np.isnan(stops.astype(np.float64)).sum(axis=0))\n", + "print(np.equal(stops, None).sum(axis=0))\n", + "print(stops.shape)\n", + "stops = stops[:,[0,0,1,1]]\n", + "# Make column 1 contain the start_index of the next stop in stopRoutes\n", + "stops[:-1,1] = stops[1:,0]\n", + "stops[-1, 1] = stopRoutes.shape[0]\n", + "# Make column 3 contain the start_index of the next stop in stopRoutes\n", + "if np.isnan(stops[-1,2]).item():\n", + " stops[-1,2] = transfers.shape[0]\n", + "for i in np.isnan(stops[:-1,2].astype(np.float64)).nonzero()[0][::-1]:\n", + " stops[i,2] = stops[i+1,2]\n", + "print(np.isnan(stops.astype(np.float64)).sum(axis=0))\n", + "stops[:-1,3] = stops[1:,2]\n", + "stops[-1, 3] = transfers.shape[0]\n", + "# Convert to int\n", + "stops = stops.astype(np.uint32)\n", + "stops" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + 
{ + "data": { + "text/plain": [ + "array([[5599, 5603, 2253, 2256],\n", + " [5603, 5606, 2256, 2263],\n", + " [5606, 5615, 2263, 2266]], dtype=uint32)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# footpaths from Dübendorf, Branzenäsch\n", + "stop_int = 503\n", + "stops[stop_int-1:stop_int+2]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 414, 293],\n", + " [ 599, 222],\n", + " [ 886, 533],\n", + " [ 931, 593],\n", + " [1226, 563],\n", + " [1244, 237],\n", + " [1392, 309]])" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transfer_array[stops[stop_int][2]:stops[stop_int][3]]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[4850, 4853, 1901, 1907],\n", + " [4853, 4860, 1907, 1907],\n", + " [4860, 4885, 1907, 1914]], dtype=uint32)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# footpaths from utliberg\n", + "# footpahts from Utliberg:\n", + "stop_int = 415\n", + "stops[stop_int-1:stop_int+2]\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There's the bug ! The pointers for transfer_array in the first entry shows 1901, 1901 when it should in fact be the second entry that shows 1901, 1901. The first entry should show (1901, 1907) (see cell nr 81)." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/old_notebooks/raptor_toy_example_2020_05_21.ipynb b/notebooks/old_notebooks/raptor_toy_example_2020_05_21.ipynb new file mode 100644 index 0000000..0be9aca --- /dev/null +++ b/notebooks/old_notebooks/raptor_toy_example_2020_05_21.ipynb @@ -0,0 +1,1916 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Coding a RAPTOR toy example\n", + "\n", + "## Goal\n", + "\n", + "Learn the RAPTOR algorithm by coding it for a toy example with the data structures advised in the paper. We code RAPTOR for a super simple toy example with only two routes and two trips each.\n", + "\n", + "## Toy example\n", + "- TODO updates:\n", + " - additional route r2 that goes from A to E slowly\n", + " - walking paths\n", + "![toy_example](img/RAPTOR_example.png) \n", + "\n", + "## Encoding the data structures\n", + "### General considerations\n", + "We adhere to the data structures proposed by Delling et al. These structures aim to minimize read times in memory by making use of consecutive in-memory adresses. Thus, structures with varying dimensions (e.g dataframes, python lists) are excluded. We illustrate the difficulty with an example. \n", + "\n", + "Each route has a potentially unique number of stops. Therefore, we cannot store stops in a 2D array of routes by stops, as the number of stops is not the same for each route. 
We adress this problem by storing stops consecutively by route, and keeping track of the index of the first stop for each route.\n", + "\n", + "This general strategy is applied to all the required data structures.\n", + "\n", + "### routes\n", + "The `routes` array will contain arrays `[n_trips, n_stops, pt_1st_stop, pt_1st_trip]` where all four values are `int`. To avoid overcomplicating things and try to mimic pointers in python, `pt_1st_stop` and `pt_1st_trip` contain integer indices." + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "lines_to_next_cell": 0 + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "routes = np.array([[2, 3, 0, 0], #r0\n", + " [2, 3, 3, 6], #r1\n", + " [2, 2, 6, 12]]) # r2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### routeStops\n", + "`routeStops` is an array that contains the ordered lists of stops for each route. `pt_1st_stop` in `routes` is required to get to the first stop of the route. is itself an array that contains the sequence of stops for route $r_i$." + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "lines_to_next_cell": 0 + }, + "outputs": [], + "source": [ + "routeStops = np.array([0, 1, 2, # A, B, C\n", + " 3, 2, 4, # D, C, E\n", + " 0, 4]) # A, E" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### stopTimes\n", + "\n", + "The i-th entry in the `stopTimes` array is itself an array which contains the arrival and departure time at a particular stop for a particular trip. `stopTimes` is sorted by routes, and then by trips. We retrieve the index of the first (earliest) trip of the route with the pointer `pt_1st_trip` stored in `routes`. We may use the built-in `numpy` [date and time data structures](https://blog.finxter.com/how-to-work-with-dates-and-times-in-python/). In short, declaring dates and times is done like this: `np.datetime64('YYYY-MM-DDThh:mm')`. 
Entries with a `NaT` arrival or departure times correspond to beginning and end of trips respectively.\n", + "\n", + "Note that trips are indexed implicitely in stopTimes, but we decided to change a little bit from the paper and index them according to their parent route instead of giving them an absolute index. It makes things a bit easier when coding the algorithm." + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "stopTimes = np.array([\n", + " # r0, t0\n", + " [None, '2020-05-11T08:00'],\n", + " ['2020-05-11T08:25', '2020-05-11T08:30'],\n", + " ['2020-05-11T08:55', None],\n", + "\n", + " # ro, t1\n", + " [None, '2020-05-11T08:10'],\n", + " ['2020-05-11T08:35', '2020-05-11T08:40'],\n", + " ['2020-05-11T09:05', None],\n", + " \n", + " # r1, t0 \n", + " [None, '2020-05-11T08:00'],\n", + " ['2020-05-11T08:05', '2020-05-11T08:10'],\n", + " ['2020-05-11T08:15', None],\n", + "\n", + " # r1, t1\n", + " [None, '2020-05-11T09:00'],\n", + " ['2020-05-11T09:05', '2020-05-11T09:10'],\n", + " ['2020-05-11T09:15', None],\n", + " \n", + " #r2, t0\n", + " [None, '2020-05-11T08:20'],\n", + " ['2020-05-11T09:20', None],\n", + " \n", + " #r2, t1\n", + " [None, '2020-05-11T08:30'],\n", + " ['2020-05-11T09:30', None]],\n", + " dtype='datetime64')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`NaT` is the `None` equivalent for `numpy datetime64`." + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ True, False])" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.isnat(stopTimes[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### stopRoutes\n", + "\n", + "`stopRoutes` contains the routes associated with each stop. We need the pointer in `stops` to index `stopRoutes` correctly." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "stopRoutes = np.array([0, 2, # A\n", + " 0, # B\n", + " 0,1, # C\n", + " 1, # D\n", + " 1, 2]) # E" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We should also build an array for transfer times (including walking times), but for now let's ignore this additional complexity. Finally, the i-th entry in the `stops` array points to the first entry in `stopRoutes` (and `transfers` when that will be tried) associated with stop $p_i$" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "stops = np.array([[0, None],# A\n", + " [2, None], # B\n", + " [3, None],# C\n", + " [5, None], # D\n", + " [6, None], # E\n", + " [len(stopRoutes), None]]) # fictive stop to account for length of E" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Coding the standard RAPTOR\n", + "\n", + "Below, we code RAPTOR as it is described in the paper, with all optimizations. That corresponds to the pseudocode block in the article. It solves the earliest arrival time problem: we enter an start stop, a target stop and a departure time and it finds the earliest arrival time in k rounds (i.e taking at most k transports). Note that walking between stops is not considered a transport." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "p_s = 0 # start stop = A\n", + "p_t = 4 # target stop = E\n", + "tau_0 = np.datetime64('2020-05-11T08:05') # departure time 08:05\n", + "k_max = 10 # we set a maximum number of transports to pre-allocate memory for the numpy array tau_i" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "def raptor_standard(p_s, p_t, tau_0, routes, routeStops, stopTimes, stopRoutes, stops,\n", + " k_max=10):\n", + " \n", + " #******************************************initialization******************************************\n", + " n_stops = len(stops)-1 # to remove the fictive stop to account for all the routes belonging to the last stop\n", + "\n", + " # earliest arrival time at each stop for each round.\n", + " tau = np.full(shape=(k_max, n_stops), fill_value = np.datetime64('2100-01-01T00:00')) # 2100 instead of infinity # number of stops * max number of transports\n", + "\n", + " # earliest arrival time at each stop, indep. 
of round\n", + " tau_star = np.full(shape=n_stops, fill_value = np.datetime64('2100-01-01T00:00'))\n", + "\n", + " # to backtrack the journey of TRANSPORTS once it is finished\n", + " #[route, trip, boarding stop, exit stop]\n", + " # we will keep [r, t, p_b, p_e, p_f1, pf2, t_w] i.e \n", + " # [route, trip (offset by route, not absolute), boarding stop, exit stop, beginning stop of the walk, target stop of the walk, time walked]\n", + " journey = np.full(shape=(k_max, n_stops, 7), fill_value = -1, dtype=int)\n", + " \n", + " marked = [p_s]\n", + " q = []\n", + " tau[0, p_s] = tau_0\n", + " \n", + " #Maybe TODO (but not in original raptor): footpaths from the departure stop\n", + "\n", + " #****************************************** main loop******************************************\n", + " for k in np.arange(1, k_max+1):\n", + " print('\\n******************************STARTING round k={}******************************'.format(k))\n", + " # accumulate routes serving marked stops from previous rounds\n", + " q = []\n", + " marked = list(set(marked)) # removing potential duplicate stops in marked due to walking paths\n", + " print('Marked stops at the start of the round: {}'.format(marked))\n", + " for p in marked:\n", + " for r in stopRoutes[stops[p][0]:stops[p+1][0]]: # foreach route r serving p\n", + " print('Route considered for the queue: ({0}, {1})'.format(r, p))\n", + " inQueue = False\n", + " for idx, (rPrime, pPrime) in enumerate(q): \n", + " # is there already another stop from the same route in q ?\n", + " if (rPrime == r): \n", + " # is there already a later stop from the same route in q ?\n", + " if(np.where(routeStops[routes[r][2]:routes[r][2]+routes[r][1]] == pPrime)[0][0] >\\\n", + " np.where(routeStops[routes[r][2]:routes[r][2]+routes[r][1]] == p)[0][0]):\n", + " #in that case, replace the later stop pPrime by stop p in q\n", + " q[idx] = (r, p)\n", + " inQueue = True\n", + " # is there already an earlier stop from the same route in q ?\n", + " 
else:\n", + " # in that case, do not add p to the q.\n", + " inQueue=True\n", + " if not inQueue:\n", + " q.append((r, p))\n", + "\n", + " marked = [] # unmarking all stops\n", + "\n", + " print('Queue before traversing each route: {}'.format(q))\n", + " # traverse each route\n", + " for (r, p) in q:\n", + " print('\\n****TRAVERSING ROUTE r={0} from stop p={1}****'.format(r, p))\n", + " # t is the t-th trip in route r, not the t-th trip in all trips. This makes things easier\n", + " t = None\n", + " # we will keep [r, t, p_b, p_e, p_f, t_w] i.e \n", + " # [route, trip (offset by route, not absolute), boarding stop, exit stop, target stop of the walk, time walked]\n", + " t_journey = np.empty(4, dtype=int)# contains tripID, board and exit stops to backtrack the journey\n", + "\n", + "\n", + " # we only traverse the route starting at p, not from the beginning of the route\n", + " for p_i in routeStops[routes[r][2]+np.where(routeStops[routes[r][2]:routes[r][2]+routes[r][1]] == p)[0][0]:\\\n", + " routes[r][2]+routes[r][1]]:\n", + " print(\"p_i: {}\".format(p_i))\n", + "\n", + " if (t is not None):\n", + " # 1st trip of route + \n", + " # offset for the right trip + \n", + " # offset for the right stop\n", + " arr_t_p_i = stopTimes[routes[r][3] + \\\n", + " t * routes[r][1] + \\\n", + " np.where(routeStops[routes[r][2]:routes[r][2]+routes[r][1]] == p_i)[0][0]][0]\n", + " print(\"arr_t_p_i: {}\".format(arr_t_p_i))\n", + "\n", + " if arr_t_p_i < min(tau_star[p_i], tau_star[p_t]):\n", + " tau[k][p_i] = arr_t_p_i\n", + " tau_star[p_i] = arr_t_p_i\n", + " marked.append(p_i)\n", + " # keep a trace that we went down the trip taken before at this stop\n", + " t_journey[3] = p_i\n", + " journey[k][p_i][0:4] = t_journey\n", + " # Can we catch an earlier trip at p_i ?\n", + " print('\\n----scanning departure times for route r={0} at stop p_i={1}----'.format(r, p_i))\n", + " t_r = 0\n", + " while True:\n", + " t_r_dep = stopTimes[routes[r][3]+\\\n", + " # offset corresponding to 
stop p_i in route r\n", + " np.where(routeStops[routes[r][2]:routes[r][2]+routes[r][1]] == p_i)[0][0] + \\\n", + " routes[r][1]*t_r][1]\n", + "\n", + " print(\"Earliest arrival time at previous step: tau[k-1][p_i]: {}\".format(tau[k-1][p_i]))\n", + " print(\"Departure time considered: t_r_dep: {}\".format(t_r_dep))\n", + " # We hop on the first trip that departs later than our arrival time at p_i in k-1 transports\n", + " if t_r_dep > tau[k-1][p_i]:\n", + " t = t_r\n", + " print('\\n!!!!Hopped on route r={0}, trip t={1} at stop p_i={2}!!!!'.format(r, t, p_i))\n", + "\n", + " # here we probably need to save the trip and boarding stop (boarding time will not be useful)\n", + " t_journey[0] = r\n", + " t_journey[1] = t\n", + " t_journey[2] = p_i\n", + " break\n", + " t_r += 1\n", + "\n", + " # we could not hop on any trip at this stop\n", + " if t_r == routes[r][0]:\n", + " break\n", + " \n", + " print('\\n****FOOTPATHS****')\n", + " \n", + " marked_footpaths = [] # storing marked stops for footpaths in a separate list to avoid inifinite loops\n", + " for p in marked:\n", + " if stops[p][1] is not None:\n", + " print('checking walking paths from stop {}'.format(p))\n", + " # making sure there are footpaths for that stop\n", + " # finding the next stop where there are footpaths to find the next index\n", + " next_stop = p\n", + " next_stop_found = False\n", + " while next_stop < len(stops)-1: #carefully check that's the correct version\n", + " next_stop = next_stop+1\n", + " if stops[next_stop][1] is not None:\n", + " next_stop_found = True\n", + " break\n", + " \n", + " # reinitializing next_stop to p in case no next stop with not 'None' stops[p][1] is found\n", + " if not next_stop_found:\n", + " next_stop = p+1 # this works because transfers[p:None] is equivalent to transfers[p:]\n", + " \n", + " \n", + " for f in transfers[stops[p][1]:stops[next_stop][1]]:\n", + " print(\"Considering footpaths from {} to {}\".format(p, f[0]))\n", + " \n", + " # we only consider 
footpaths if they strictly ameliorate the arrival time at the arrival stop of the path.\n", + " if(tau[k][p]+np.timedelta64(f[1], 's') < min(tau_star[f[0]], tau_star[p_t])): \n", + " print(\"Walking to {} is faster !\".format(f[0]))\n", + " tau[k][f[0]] = tau[k][p]+np.timedelta64(f[1], 's')\n", + " tau_star[f[0]] = tau[k][p]+np.timedelta64(f[1], 's')\n", + " marked_footpaths.append(f[0])\n", + " \n", + " # keeping tracks of footpaths to backtrack the journey:\n", + " # [departure stop, arrival stop, walking time]\n", + " journey[k][f[0]][4:7] = [p, f[0], f[1]]\n", + " \n", + " marked.extend(marked_footpaths) # to avoid infinite loops if marked gets appended dynamically\n", + " # stopping criterion: no stops were marked\n", + " if not marked:\n", + " break\n", + " return(tau, tau_star, k, journey)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "******************************STARTING round k=1******************************\n", + "Marked stops at the start of the round: [0]\n", + "Route considered for the queue: (0, 0)\n", + "Route considered for the queue: (2, 0)\n", + "Queue before traversing each route: [(0, 0), (2, 0)]\n", + "\n", + "****TRAVERSING ROUTE r=0 from stop p=0****\n", + "p_i: 0\n", + "\n", + "----scanning departure times for route r=0 at stop p_i=0----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T08:05\n", + "Departure time considered: t_r_dep: 2020-05-11T08:00\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T08:05\n", + "Departure time considered: t_r_dep: 2020-05-11T08:10\n", + "\n", + "!!!!Hopped on route r=0, trip t=1 at stop p_i=0!!!!\n", + "p_i: 1\n", + "arr_t_p_i: 2020-05-11T08:35\n", + "\n", + "----scanning departure times for route r=0 at stop p_i=1----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n", + "Departure time considered: t_r_dep: 
2020-05-11T08:30\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n", + "Departure time considered: t_r_dep: 2020-05-11T08:40\n", + "p_i: 2\n", + "arr_t_p_i: 2020-05-11T09:05\n", + "\n", + "----scanning departure times for route r=0 at stop p_i=2----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n", + "Departure time considered: t_r_dep: NaT\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n", + "Departure time considered: t_r_dep: NaT\n", + "\n", + "****TRAVERSING ROUTE r=2 from stop p=0****\n", + "p_i: 0\n", + "\n", + "----scanning departure times for route r=2 at stop p_i=0----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T08:05\n", + "Departure time considered: t_r_dep: 2020-05-11T08:20\n", + "\n", + "!!!!Hopped on route r=2, trip t=0 at stop p_i=0!!!!\n", + "p_i: 4\n", + "arr_t_p_i: 2020-05-11T09:20\n", + "\n", + "----scanning departure times for route r=2 at stop p_i=4----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n", + "Departure time considered: t_r_dep: NaT\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n", + "Departure time considered: t_r_dep: NaT\n", + "\n", + "****FOOTPATHS****\n", + "\n", + "******************************STARTING round k=2******************************\n", + "Marked stops at the start of the round: [1, 2, 4]\n", + "Route considered for the queue: (0, 1)\n", + "Route considered for the queue: (0, 2)\n", + "Route considered for the queue: (1, 2)\n", + "Route considered for the queue: (1, 4)\n", + "Route considered for the queue: (2, 4)\n", + "Queue before traversing each route: [(0, 1), (1, 2), (2, 4)]\n", + "\n", + "****TRAVERSING ROUTE r=0 from stop p=1****\n", + "p_i: 1\n", + "\n", + "----scanning departure times for route r=0 at stop p_i=1----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T08:35\n", + "Departure time 
considered: t_r_dep: 2020-05-11T08:30\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T08:35\n", + "Departure time considered: t_r_dep: 2020-05-11T08:40\n", + "\n", + "!!!!Hopped on route r=0, trip t=1 at stop p_i=1!!!!\n", + "p_i: 2\n", + "arr_t_p_i: 2020-05-11T09:05\n", + "\n", + "----scanning departure times for route r=0 at stop p_i=2----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:05\n", + "Departure time considered: t_r_dep: NaT\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:05\n", + "Departure time considered: t_r_dep: NaT\n", + "\n", + "****TRAVERSING ROUTE r=1 from stop p=2****\n", + "p_i: 2\n", + "\n", + "----scanning departure times for route r=1 at stop p_i=2----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:05\n", + "Departure time considered: t_r_dep: 2020-05-11T08:10\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:05\n", + "Departure time considered: t_r_dep: 2020-05-11T09:10\n", + "\n", + "!!!!Hopped on route r=1, trip t=1 at stop p_i=2!!!!\n", + "p_i: 4\n", + "arr_t_p_i: 2020-05-11T09:15\n", + "\n", + "----scanning departure times for route r=1 at stop p_i=4----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:20\n", + "Departure time considered: t_r_dep: NaT\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:20\n", + "Departure time considered: t_r_dep: NaT\n", + "\n", + "****TRAVERSING ROUTE r=2 from stop p=4****\n", + "p_i: 4\n", + "\n", + "----scanning departure times for route r=2 at stop p_i=4----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:20\n", + "Departure time considered: t_r_dep: NaT\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:20\n", + "Departure time considered: t_r_dep: NaT\n", + "\n", + "****FOOTPATHS****\n", + "\n", + "******************************STARTING round 
k=3******************************\n", + "Marked stops at the start of the round: [4]\n", + "Route considered for the queue: (1, 4)\n", + "Route considered for the queue: (2, 4)\n", + "Queue before traversing each route: [(1, 4), (2, 4)]\n", + "\n", + "****TRAVERSING ROUTE r=1 from stop p=4****\n", + "p_i: 4\n", + "\n", + "----scanning departure times for route r=1 at stop p_i=4----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:15\n", + "Departure time considered: t_r_dep: NaT\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:15\n", + "Departure time considered: t_r_dep: NaT\n", + "\n", + "****TRAVERSING ROUTE r=2 from stop p=4****\n", + "p_i: 4\n", + "\n", + "----scanning departure times for route r=2 at stop p_i=4----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:15\n", + "Departure time considered: t_r_dep: NaT\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:15\n", + "Departure time considered: t_r_dep: NaT\n", + "\n", + "****FOOTPATHS****\n" + ] + } + ], + "source": [ + "tau, tau_star, k, journey = raptor_standard(p_s, p_t, tau_0, \n", + " routes = routes, routeStops = routeStops, stopTimes = stopTimes, stopRoutes = stopRoutes, stops = stops)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['2100-01-01T00:00', '2020-05-11T08:35', '2020-05-11T09:05',\n", + " '2100-01-01T00:00', '2020-05-11T09:15'], dtype='datetime64[m]')" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tau_star" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([['2020-05-11T08:05', '2100-01-01T00:00', '2100-01-01T00:00',\n", + " '2100-01-01T00:00', '2100-01-01T00:00'],\n", + " ['2100-01-01T00:00', '2020-05-11T08:35', 
'2020-05-11T09:05',\n", + " '2100-01-01T00:00', '2020-05-11T09:20'],\n", + " ['2100-01-01T00:00', '2100-01-01T00:00', '2100-01-01T00:00',\n", + " '2100-01-01T00:00', '2020-05-11T09:15'],\n", + " ['2100-01-01T00:00', '2100-01-01T00:00', '2100-01-01T00:00',\n", + " '2100-01-01T00:00', '2100-01-01T00:00']], dtype='datetime64[m]')" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "k_last = k\n", + "\n", + "tau[0:k_last+1]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`journey` contains all the necessary information to backtrack from the solution to the actual journey in terms of sequence of transports.\n", + "\n", + "`journey` has dimensions `k` by `n_stops` by 4+3.\n", + "- The 4 first values store the route and trip taken, the departure and arrival stops.\n", + "- The 3 last values are used by footpaths. They contain the departure stop for the walk, the arrival stop for the walk and the walking time in seconds.\n", + "\n", + "When we hop on a trip, we store the trip (with the route) and the boarding and exit stops as the array `t_journey`: `(r, t, p_boarding, p_exit)`. At each stop `p_i` where we ameliorate the arrival time in round `k`, we store `t_journey` in the first 4 cells of `journey[k][p_i]`. `p_i` corresponds to the exit stop when backtracking.\n", + "\n", + "When walking to stop `p_i` is shorter, we store the departure, arrival stops and walking time in the 3 last cells of `journey[k][p_i]`.\n", + "\n", + "The end result is a `journey` array which contains -1 values in all seven cells in `journey[k][p_i]` if the arrival time at `p_i` was not ameliorated at step `k`. `journey[k][p_i]` where there are values other than -1 indicate that the arrival time was ameliorated either by walking or by taking a transport. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[[-1, -1, -1, -1, -1, -1, -1],\n", + " [-1, -1, -1, -1, -1, -1, -1],\n", + " [-1, -1, -1, -1, -1, -1, -1],\n", + " [-1, -1, -1, -1, -1, -1, -1],\n", + " [-1, -1, -1, -1, -1, -1, -1]],\n", + "\n", + " [[-1, -1, -1, -1, -1, -1, -1],\n", + " [ 0, 1, 0, 1, -1, -1, -1],\n", + " [ 0, 1, 0, 2, -1, -1, -1],\n", + " [-1, -1, -1, -1, -1, -1, -1],\n", + " [ 2, 0, 0, 4, -1, -1, -1]],\n", + "\n", + " [[-1, -1, -1, -1, -1, -1, -1],\n", + " [-1, -1, -1, -1, -1, -1, -1],\n", + " [-1, -1, -1, -1, -1, -1, -1],\n", + " [-1, -1, -1, -1, -1, -1, -1],\n", + " [ 1, 1, 2, 4, -1, -1, -1]]])" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "journey[0:k_last]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Backtracking\n", + "\n", + "We reconstruct the actual journey from the `journey` array by backtracking from the arrival stop to the departure stop. At each round k where we notice that the arrival time for the target stop was ameliorated, we start a new leg corresponding to a journey reaching the target stop in k transports.\n", + "\n", + "When backtracking without footpaths, it is sufficient at each round k to check at which stop the trip at round k-1 began. 
\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "def backtrack_journey(k_last, p_t, journey):\n", + " # journey_act = actual journey, will contain the sequence of transports in the correct order\n", + " journey_act = [[] for k in range(0, k_last)] # there's maximum k routes to get to the final stop\n", + " p_board = p_t\n", + " n_legs = 1 # each leg is a journey to from the departure stop to the target stop in exactly k transports\n", + " journey_found = False\n", + "\n", + " # iterating backwards in rounds from k_last -1 to 1\n", + " for k in range(k_last-1, 0, -1): # second argument in range is not included in the boundaries\n", + " # Was the tarrival time at the target stop ameliorated at round k ? \n", + " if np.any(journey[k][p_t]!=np.array([-1, -1, -1, -1, -1, -1, -1])):\n", + "\n", + " # starting a new leg in the list of actual journeys\n", + " journey_found = True\n", + " # iterating from k to 0 to reconstruct the actual journey in k transports\n", + " p_board = p_t\n", + " for k_prime in range(k, 0, -1):\n", + "\n", + " # did we get to that stop by walking ?\n", + " if journey[k_prime][p_board][5] !=-1:\n", + "\n", + " # we keep track of the stop to which we walked to as well as the departure stop of the walk\n", + " stop_walk_dep = journey[k_prime][p_board][4]\n", + " journey_act[k].append([journey[k_prime][stop_walk_dep], journey[k_prime][p_board]])\n", + " p_board = journey[k_prime][stop_walk_dep][2]\n", + "\n", + " # we did not get to that stop by walking\n", + " else:\n", + "\n", + " journey_act[k].append(journey[k_prime][p_board])\n", + " p_board = journey[k_prime][p_board][2]\n", + "\n", + " # reversing the order of journey_act to get journeys from the start stop to the target stop\n", + " journey_act = [j[::-1] for j in journey_act]\n", + "\n", + " # building a human readable output for the trip:\n", + " for k, j in enumerate(journey_act):\n", + "\n", + " if j: # going only 
through non-empty journeys\n", + " print('******************JOURNEY IN {} TRIPS******************'.format(k))\n", + " print('raw representation of the journey in {} trips: {}'.format(k, j))\n", + "\n", + " for k_prime, t in enumerate(j):\n", + " # We did not walk at step k\n", + " if len(t) !=2:\n", + " p_boarding = t[2]\n", + " p_exit = t[3]\n", + " r_k = t[0]\n", + " time_boarding = stopTimes[routes[r_k][3] + \\\n", + " np.where(routeStops[routes[r_k][2]:routes[r_k][2]+routes[r_k][1]] == p_boarding)[0][0] + \\\n", + " t[1]*routes[r_k][1]][1]\n", + " time_exit = stopTimes[routes[r_k][3] + \\\n", + " np.where(routeStops[routes[r_k][2]:routes[r_k][2]+routes[r_k][1]] == p_exit)[0][0] + \\\n", + " t[1]*routes[r_k][1]][0]\n", + " print(\"At stop {}, take route {} leaving at time {} \\n...\".format(p_boarding, r_k, time_boarding))\n", + "\n", + " print(\" and exit at stop {} at time {}\".format(p_exit, time_exit))\n", + "\n", + " # We walked at step k\n", + " elif len(t)==2:\n", + " print(t)\n", + " p_boarding = t[0][2]\n", + " p_exit = t[0][3]\n", + " r_k = t[0][0]\n", + " time_boarding = stopTimes[routes[r_k][3] + \\\n", + " np.where(routeStops[routes[r_k][2]:routes[r_k][2]+routes[r_k][1]] == p_boarding)[0][0] + \\\n", + " t[0][1]*routes[r_k][1]][1]\n", + " time_exit = stopTimes[routes[r_k][3] + \\\n", + " np.where(routeStops[routes[r_k][2]:routes[r_k][2]+routes[r_k][1]] == p_exit)[0][0] + \\\n", + " t[0][1]*routes[r_k][1]][0]\n", + " p_start_walk = t[1][4]\n", + " p_end_walk = t[1][5]\n", + " walk_duration = t[1][6]/60\n", + "\n", + " print(\"At stop {}, take route {} leaving at time {} \\n...\".format(p_boarding, r_k, time_boarding))\n", + "\n", + " print(\"... exit at stop {} at time {}... 
\".format(p_exit, time_exit))\n", + "\n", + " print(\"and walk for {} minutes from stop {} to stop {}.\".format(walk_duration, p_start_walk, p_end_walk))\n", + " \n", + " if not journey_found:\n", + " print('No journey was found for this query')\n", + " return journey_found " + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "******************JOURNEY IN 1 TRIPS******************\n", + "raw representation of the journey in 1 trips: [array([ 2, 0, 0, 4, -1, -1, -1])]\n", + "At stop 0, take route 2 leaving at time 2020-05-11T08:20 \n", + "...\n", + " and exit at stop 4 at time 2020-05-11T09:20\n", + "******************JOURNEY IN 2 TRIPS******************\n", + "raw representation of the journey in 2 trips: [array([ 0, 1, 0, 2, -1, -1, -1]), array([ 1, 1, 2, 4, -1, -1, -1])]\n", + "At stop 0, take route 0 leaving at time 2020-05-11T08:10 \n", + "...\n", + " and exit at stop 2 at time 2020-05-11T09:05\n", + "At stop 2, take route 1 leaving at time 2020-05-11T09:10 \n", + "...\n", + " and exit at stop 4 at time 2020-05-11T09:15\n" + ] + } + ], + "source": [ + "backtrack_journey(k_last, p_t, journey);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Let's add footpaths\n", + "\n", + "For now, we have not tried including footpaths in our dataset. Below, we modify the timetable by adding a new route r3, which links a new stop F to E in a single travel. 
F may be reached in a very long time from A, but in a short time from B, meaning that it should become shorter to:\n", + "\n", + "- Take a trip from A to B\n", + "- Walk from B to F\n", + "- Take a trip from F to E\n", + "\n", + "rather than the current best trip:\n", + "- Take a trip from A to C\n", + "- Take a trip from C to E\n", + "\n", + "\n", + "Note that the single transport solution:\n", + "- Take a trip from A to E\n", + "\n", + "should still appear as the optimal solution for k = 1, i.e one transport is taken." + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "routes = np.array([[2, 3, 0, 0], #r0\n", + " [2, 3, 3, 6], #r1\n", + " [2, 2, 6, 12], #r2\n", + " [2, 2, 8, 16]]) # r3" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "routeStops = np.array([0, 1, 2, # A, B, C\n", + " 3, 2, 4, # D, C, E\n", + " 0, 4, # A, E\n", + " 5, 4]) #F, E" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "stopTimes = np.array([\n", + " # r0, t0\n", + " [None, '2020-05-11T08:00'],\n", + " ['2020-05-11T08:25', '2020-05-11T08:30'],\n", + " ['2020-05-11T08:55', None],\n", + "\n", + " # ro, t1\n", + " [None, '2020-05-11T08:10'],\n", + " ['2020-05-11T08:35', '2020-05-11T08:40'],\n", + " ['2020-05-11T09:05', None],\n", + " \n", + " # r1, t0 \n", + " [None, '2020-05-11T08:00'],\n", + " ['2020-05-11T08:05', '2020-05-11T08:10'],\n", + " ['2020-05-11T08:15', None],\n", + "\n", + " # r1, t1\n", + " [None, '2020-05-11T09:00'],\n", + " ['2020-05-11T09:05', '2020-05-11T09:10'],\n", + " ['2020-05-11T09:15', None],\n", + " \n", + " #r2, t0\n", + " [None, '2020-05-11T08:20'],\n", + " ['2020-05-11T09:20', None],\n", + " \n", + " #r2, t1\n", + " [None, '2020-05-11T08:30'],\n", + " ['2020-05-11T09:30', None],\n", + " \n", + " #r3, t0\n", + " [None, '2020-05-11T08:05'],\n", + " ['2020-05-11T08:25', None],\n", + 
"\n", + " #r3, t1\n", + " [None, '2020-05-11T08:45'],\n", + " ['2020-05-11T09:05', None]],\n", + " dtype='datetime64')" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "stopRoutes = np.array([0, 2, # A\n", + " 0, # B\n", + " 0,1, # C\n", + " 1, # D\n", + " 1, 2, 3, # E\n", + " 3]) # F" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## transfers\n", + "The `transfers` is a 2D `np.ndarray` where each entry `[p_j, time]` represents the time it takes to reach p_j from stop p_i. The correspondence between the indexing of `transfers` and p_i is done via `stops[p_i][1]`, i.e. the first entry in `transfers` containing a connection from stop p_i.\n", + "\n", + "As we cannot store different data types in numpy arrays, `time` will have to be converted to `np.timedelta64`, the format used to make differences between `np.datetime64` variables. We will consider all `time` values as **positive values in seconds**."
+ ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "numpy.timedelta64(-30,'m')" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stopTimes[0][1] - stopTimes[1][1]" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "numpy.timedelta64(30,'s')" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.timedelta64(30, 's')" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "transfers = np.array([[5, 3600], # A -> F\n", + " [5, 300], # B -> F\n", + " [0, 3600], # F -> A\n", + " [1, 300] # F -> A\n", + " ])" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "stops = np.array([[0, 0],# A\n", + " [2, 1], # B\n", + " [3, None],# C\n", + " [5, None], # D\n", + " [6, None], # E\n", + " [9, 2], # F\n", + " [len(stopRoutes), None]]) # fictive stop to account for length of E" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "******************************STARTING round k=1******************************\n", + "Marked stops at the start of the round: [0]\n", + "Route considered for the queue: (0, 0)\n", + "Route considered for the queue: (2, 0)\n", + "Queue before traversing each route: [(0, 0), (2, 0)]\n", + "\n", + "****TRAVERSING ROUTE r=0 from stop p=0****\n", + "p_i: 0\n", + "\n", + "----scanning departure times for route r=0 at stop p_i=0----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T08:05\n", + "Departure time considered: t_r_dep: 2020-05-11T08:00\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T08:05\n", + 
"Departure time considered: t_r_dep: 2020-05-11T08:10\n", + "\n", + "!!!!Hopped on route r=0, trip t=1 at stop p_i=0!!!!\n", + "p_i: 1\n", + "arr_t_p_i: 2020-05-11T08:35\n", + "\n", + "----scanning departure times for route r=0 at stop p_i=1----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n", + "Departure time considered: t_r_dep: 2020-05-11T08:30\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n", + "Departure time considered: t_r_dep: 2020-05-11T08:40\n", + "p_i: 2\n", + "arr_t_p_i: 2020-05-11T09:05\n", + "\n", + "----scanning departure times for route r=0 at stop p_i=2----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n", + "Departure time considered: t_r_dep: NaT\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n", + "Departure time considered: t_r_dep: NaT\n", + "\n", + "****TRAVERSING ROUTE r=2 from stop p=0****\n", + "p_i: 0\n", + "\n", + "----scanning departure times for route r=2 at stop p_i=0----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T08:05\n", + "Departure time considered: t_r_dep: 2020-05-11T08:20\n", + "\n", + "!!!!Hopped on route r=2, trip t=0 at stop p_i=0!!!!\n", + "p_i: 4\n", + "arr_t_p_i: 2020-05-11T09:20\n", + "\n", + "----scanning departure times for route r=2 at stop p_i=4----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n", + "Departure time considered: t_r_dep: NaT\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n", + "Departure time considered: t_r_dep: NaT\n", + "\n", + "****FOOTPATHS****\n", + "checking walking paths from stop 1\n", + "Considering footpaths from 1 to 5\n", + "Walking to 5 is faster !\n", + "\n", + "******************************STARTING round k=2******************************\n", + "Marked stops at the start of the round: [1, 2, 4, 5]\n", + "Route considered for the queue: (0, 1)\n", + "Route 
considered for the queue: (0, 2)\n", + "Route considered for the queue: (1, 2)\n", + "Route considered for the queue: (1, 4)\n", + "Route considered for the queue: (2, 4)\n", + "Route considered for the queue: (3, 4)\n", + "Route considered for the queue: (3, 5)\n", + "Queue before traversing each route: [(0, 1), (1, 2), (2, 4), (3, 5)]\n", + "\n", + "****TRAVERSING ROUTE r=0 from stop p=1****\n", + "p_i: 1\n", + "\n", + "----scanning departure times for route r=0 at stop p_i=1----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T08:35\n", + "Departure time considered: t_r_dep: 2020-05-11T08:30\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T08:35\n", + "Departure time considered: t_r_dep: 2020-05-11T08:40\n", + "\n", + "!!!!Hopped on route r=0, trip t=1 at stop p_i=1!!!!\n", + "p_i: 2\n", + "arr_t_p_i: 2020-05-11T09:05\n", + "\n", + "----scanning departure times for route r=0 at stop p_i=2----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:05\n", + "Departure time considered: t_r_dep: NaT\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:05\n", + "Departure time considered: t_r_dep: NaT\n", + "\n", + "****TRAVERSING ROUTE r=1 from stop p=2****\n", + "p_i: 2\n", + "\n", + "----scanning departure times for route r=1 at stop p_i=2----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:05\n", + "Departure time considered: t_r_dep: 2020-05-11T08:10\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:05\n", + "Departure time considered: t_r_dep: 2020-05-11T09:10\n", + "\n", + "!!!!Hopped on route r=1, trip t=1 at stop p_i=2!!!!\n", + "p_i: 4\n", + "arr_t_p_i: 2020-05-11T09:15\n", + "\n", + "----scanning departure times for route r=1 at stop p_i=4----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:20\n", + "Departure time considered: t_r_dep: NaT\n", + "Earliest arrival time at previous step: 
tau[k-1][p_i]: 2020-05-11T09:20\n", + "Departure time considered: t_r_dep: NaT\n", + "\n", + "****TRAVERSING ROUTE r=2 from stop p=4****\n", + "p_i: 4\n", + "\n", + "----scanning departure times for route r=2 at stop p_i=4----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:20\n", + "Departure time considered: t_r_dep: NaT\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:20\n", + "Departure time considered: t_r_dep: NaT\n", + "\n", + "****TRAVERSING ROUTE r=3 from stop p=5****\n", + "p_i: 5\n", + "\n", + "----scanning departure times for route r=3 at stop p_i=5----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T08:40\n", + "Departure time considered: t_r_dep: 2020-05-11T08:05\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T08:40\n", + "Departure time considered: t_r_dep: 2020-05-11T08:45\n", + "\n", + "!!!!Hopped on route r=3, trip t=1 at stop p_i=5!!!!\n", + "p_i: 4\n", + "arr_t_p_i: 2020-05-11T09:05\n", + "\n", + "----scanning departure times for route r=3 at stop p_i=4----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:20\n", + "Departure time considered: t_r_dep: NaT\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:20\n", + "Departure time considered: t_r_dep: NaT\n", + "\n", + "****FOOTPATHS****\n", + "\n", + "******************************STARTING round k=3******************************\n", + "Marked stops at the start of the round: [4]\n", + "Route considered for the queue: (1, 4)\n", + "Route considered for the queue: (2, 4)\n", + "Route considered for the queue: (3, 4)\n", + "Queue before traversing each route: [(1, 4), (2, 4), (3, 4)]\n", + "\n", + "****TRAVERSING ROUTE r=1 from stop p=4****\n", + "p_i: 4\n", + "\n", + "----scanning departure times for route r=1 at stop p_i=4----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:05\n", + "Departure time considered: 
t_r_dep: NaT\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:05\n", + "Departure time considered: t_r_dep: NaT\n", + "\n", + "****TRAVERSING ROUTE r=2 from stop p=4****\n", + "p_i: 4\n", + "\n", + "----scanning departure times for route r=2 at stop p_i=4----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:05\n", + "Departure time considered: t_r_dep: NaT\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:05\n", + "Departure time considered: t_r_dep: NaT\n", + "\n", + "****TRAVERSING ROUTE r=3 from stop p=4****\n", + "p_i: 4\n", + "\n", + "----scanning departure times for route r=3 at stop p_i=4----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:05\n", + "Departure time considered: t_r_dep: NaT\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-11T09:05\n", + "Departure time considered: t_r_dep: NaT\n", + "\n", + "****FOOTPATHS****\n" + ] + } + ], + "source": [ + "tau, tau_star, k, journey = raptor_standard(p_s, p_t, tau_0, \n", + " routes = routes, routeStops = routeStops, stopTimes = stopTimes, stopRoutes = stopRoutes, stops = stops)" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([['2020-05-11T08:05', '2100-01-01T00:00', '2100-01-01T00:00',\n", + " '2100-01-01T00:00', '2100-01-01T00:00', '2100-01-01T00:00'],\n", + " ['2100-01-01T00:00', '2020-05-11T08:35', '2020-05-11T09:05',\n", + " '2100-01-01T00:00', '2020-05-11T09:20', '2020-05-11T08:40'],\n", + " ['2100-01-01T00:00', '2100-01-01T00:00', '2100-01-01T00:00',\n", + " '2100-01-01T00:00', '2020-05-11T09:05', '2100-01-01T00:00'],\n", + " ['2100-01-01T00:00', '2100-01-01T00:00', '2100-01-01T00:00',\n", + " '2100-01-01T00:00', '2100-01-01T00:00', '2100-01-01T00:00']],\n", + " dtype='datetime64[m]')" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + 
"source": [ + "k_last = k\n", + "tau[0:k_last+1]" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['2100-01-01T00:00', '2020-05-11T08:35', '2020-05-11T09:05',\n", + " '2100-01-01T00:00', '2020-05-11T09:05', '2020-05-11T08:40'],\n", + " dtype='datetime64[m]')" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tau_star" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[[ -1, -1, -1, -1, -1, -1, -1],\n", + " [ -1, -1, -1, -1, -1, -1, -1],\n", + " [ -1, -1, -1, -1, -1, -1, -1],\n", + " [ -1, -1, -1, -1, -1, -1, -1],\n", + " [ -1, -1, -1, -1, -1, -1, -1],\n", + " [ -1, -1, -1, -1, -1, -1, -1]],\n", + "\n", + " [[ -1, -1, -1, -1, -1, -1, -1],\n", + " [ 0, 1, 0, 1, -1, -1, -1],\n", + " [ 0, 1, 0, 2, -1, -1, -1],\n", + " [ -1, -1, -1, -1, -1, -1, -1],\n", + " [ 2, 0, 0, 4, -1, -1, -1],\n", + " [ -1, -1, -1, -1, 1, 5, 300]],\n", + "\n", + " [[ -1, -1, -1, -1, -1, -1, -1],\n", + " [ -1, -1, -1, -1, -1, -1, -1],\n", + " [ -1, -1, -1, -1, -1, -1, -1],\n", + " [ -1, -1, -1, -1, -1, -1, -1],\n", + " [ 3, 1, 5, 4, -1, -1, -1],\n", + " [ -1, -1, -1, -1, -1, -1, -1]]])" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "journey[0:k_last]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Backtracking with footpaths\n", + "\n", + "When backtracking, with footpaths, we first look through the footpaths to backtrack to the departure stop for the walk, and then use the departure stop of the walk as an arrival stop for a transport.\n", + "\n", + "But with footpaths added, it is possible to reach a stop C from stop A by:\n", + "- first taking a transport to a stop B\n", + "- walking from stop B to stop C.\n", + "\n", + "Therefore, we need to 
keep track of all the footpaths taken at step i that ameliorated arrival times at the target stop.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "******************JOURNEY IN 1 TRIPS******************\n", + "raw representation of the journey in 1 trips: [array([ 2, 0, 0, 4, -1, -1, -1])]\n", + "At stop 0, take route 2 leaving at time 2020-05-11T08:20 \n", + "...\n", + " and exit at stop 4 at time 2020-05-11T09:20\n", + "******************JOURNEY IN 2 TRIPS******************\n", + "raw representation of the journey in 2 trips: [[array([ 0, 1, 0, 1, -1, -1, -1]), array([ -1, -1, -1, -1, 1, 5, 300])], array([ 3, 1, 5, 4, -1, -1, -1])]\n", + "[array([ 0, 1, 0, 1, -1, -1, -1]), array([ -1, -1, -1, -1, 1, 5, 300])]\n", + "At stop 0, take route 0 leaving at time 2020-05-11T08:10 \n", + "...\n", + "... exit at stop 1 at time 2020-05-11T08:35... \n", + "and walk for 5.0 minutes from stop 1 to stop 5.\n", + "At stop 5, take route 3 leaving at time 2020-05-11T08:45 \n", + "...\n", + " and exit at stop 4 at time 2020-05-11T09:05\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "backtrack_journey(k_last, p_t, journey)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Trying to run the standard RAPTOR on real size data\n", + "### Loading real sized data" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[1 11 0 0]\n", + " [1 11 11 11]\n", + " [1 11 22 22]\n", + " ...\n", + " [1 6 237432 245713]\n", + " [1 13 237438 245719]\n", + " [3 2 237451 245732]]\n", + "We find 16210 routes in the data\n" + ] + } + ], + "source": [ + "import pickle\n", + "# step 1 convert 
the data from string to numpy series\n", + "routes_real = pickle.load( open( \"../data/routes_array2.pkl\", \"rb\" ) )\n", + "print(routes_real)\n", + "print('We find {} routes in the data'.format(len(routes_real)))" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0 None]\n", + " [4 None]\n", + " [7 None]\n", + " ...\n", + " [7841 None]\n", + " [7844 None]\n", + " [7847 None]]\n", + "We find 1407 stops in the data\n" + ] + } + ], + "source": [ + "stops_real = pickle.load(open( \"../data/stops_array.pkl\", \"rb\" ) )\n", + "print(stops_real)\n", + "print('We find {} stops in the data'.format(len(stops_real)))" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 'NaT' '2020-05-21T16:53:00.000000000']\n", + " ['2020-05-21T16:55:00.000000000' '2020-05-21T16:55:00.000000000']\n", + " ['2020-05-21T16:57:00.000000000' '2020-05-21T16:57:00.000000000']\n", + " ...\n", + " ['2020-05-21T15:10:00.000000000' 'NaT']\n", + " [ 'NaT' '2020-05-21T16:45:00.000000000']\n", + " ['2020-05-21T17:05:00.000000000' 'NaT']]\n", + "We find 245738 arrival/departure times for stops in the data\n" + ] + } + ], + "source": [ + "stopTimes_real = pickle.load(open( \"../data/stop_times_array1.pkl\", \"rb\" ) )\n", + "print(stopTimes_real)\n", + "print('We find {} arrival/departure times for stops in the data'.format(len(stopTimes_real)))" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[1166 146]\n", + " [1270 360]\n", + " [ 2 8]\n", + " ...\n", + " [ 108 371]\n", + " [ 102 439]\n", + " [1739 519]]\n", + "We find 12564 footpaths (bidirectional) in the data\n" + ] + } + ], + "source": [ + "transfer_real = pickle.load(open( \"../data/transfer_array.pkl\", \"rb\" ) 
)\n", + "print(transfer_real)\n", + "print('We find {} footpaths (bidirectional) in the data'.format(len(transfer_real)))" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0 1 88 ... 736 735 736]\n", + "We find 8000 (r, p) route stops combinations in the data\n" + ] + } + ], + "source": [ + "stopRoutes_real = pickle.load(open( \"../data/stop_routes_array.pkl\", \"rb\" ) )\n", + "# The route index alone was not selected:\n", + "#stopRoutes_real = stopRoutes_real[:, 1]\n", + "print(stopRoutes_real)\n", + "print('We find {} (r, p) route stops combinations in the data'.format(len(stopRoutes_real)))\n", + "#print('We find {} unique routes desserving stops'.format(len(np.unique(stopRoutes_real))))" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0 1 2 ... 
759 554 493]\n", + "We find 7849 route, stops combinations\n", + "We find 1407 unique stops desserving routes\n" + ] + } + ], + "source": [ + "routeStops_real = pickle.load(open( \"../data/route_stops_array.pkl\", \"rb\" ) )\n", + "print(routeStops_real)\n", + "print('We find {} route, stops combinations'.format(len(routeStops_real)))\n", + "print('We find {} unique stops desserving routes'.format(len(np.unique(routeStops_real))))" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "******************************STARTING round k=1******************************\n", + "Marked stops at the start of the round: [0]\n", + "Route considered for the queue: (0, 0)\n", + "Route considered for the queue: (1, 0)\n", + "Route considered for the queue: (88, 0)\n", + "Route considered for the queue: (89, 0)\n", + "Queue before traversing each route: [(0, 0), (1, 0), (88, 0), (89, 0)]\n", + "\n", + "****TRAVERSING ROUTE r=0 from stop p=0****\n", + "p_i: 0\n", + "\n", + "----scanning departure times for route r=0 at stop p_i=0----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-21T12:00\n", + "Departure time considered: t_r_dep: 2020-05-21T16:53:00.000000000\n", + "\n", + "!!!!Hopped on route r=0, trip t=0 at stop p_i=0!!!!\n", + "p_i: 1\n", + "arr_t_p_i: 2020-05-21T16:55:00.000000000\n", + "\n", + "----scanning departure times for route r=0 at stop p_i=1----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n", + "Departure time considered: t_r_dep: 2020-05-21T16:55:00.000000000\n", + "p_i: 2\n", + "arr_t_p_i: 2020-05-21T16:57:00.000000000\n", + "\n", + "----scanning departure times for route r=0 at stop p_i=2----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n", + "Departure time considered: t_r_dep: 2020-05-21T16:57:00.000000000\n", + "p_i: 0\n", + "arr_t_p_i: NaT\n", + "\n", + 
"----scanning departure times for route r=0 at stop p_i=0----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2020-05-21T12:00\n", + "Departure time considered: t_r_dep: 2020-05-21T16:53:00.000000000\n", + "\n", + "!!!!Hopped on route r=0, trip t=0 at stop p_i=0!!!!\n", + "p_i: 1\n", + "arr_t_p_i: 2020-05-21T16:55:00.000000000\n", + "\n", + "----scanning departure times for route r=0 at stop p_i=1----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n", + "Departure time considered: t_r_dep: 2020-05-21T16:55:00.000000000\n", + "p_i: 2\n", + "arr_t_p_i: 2020-05-21T16:57:00.000000000\n", + "\n", + "----scanning departure times for route r=0 at stop p_i=2----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n", + "Departure time considered: t_r_dep: 2020-05-21T16:57:00.000000000\n", + "p_i: 3\n", + "arr_t_p_i: 2020-05-21T17:01:00.000000000\n", + "\n", + "----scanning departure times for route r=0 at stop p_i=3----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n", + "Departure time considered: t_r_dep: 2020-05-21T17:01:00.000000000\n", + "p_i: 4\n", + "arr_t_p_i: 2020-05-21T17:01:00.000000000\n", + "\n", + "----scanning departure times for route r=0 at stop p_i=4----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n", + "Departure time considered: t_r_dep: 2020-05-21T17:01:00.000000000\n", + "p_i: 5\n", + "arr_t_p_i: 2020-05-21T17:03:00.000000000\n", + "\n", + "----scanning departure times for route r=0 at stop p_i=5----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n", + "Departure time considered: t_r_dep: 2020-05-21T17:03:00.000000000\n", + "p_i: 6\n", + "arr_t_p_i: 2020-05-21T17:03:00.000000000\n", + "\n", + "----scanning departure times for route r=0 at stop p_i=6----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n", + "Departure time considered: t_r_dep: 
2020-05-21T17:03:00.000000000\n", + "p_i: 7\n", + "arr_t_p_i: 2020-05-21T17:04:00.000000000\n", + "\n", + "----scanning departure times for route r=0 at stop p_i=7----\n", + "Earliest arrival time at previous step: tau[k-1][p_i]: 2100-01-01T00:00\n", + "Departure time considered: t_r_dep: NaT\n", + "\n", + "****TRAVERSING ROUTE r=1 from stop p=0****\n" + ] + }, + { + "ename": "IndexError", + "evalue": "index 0 is out of bounds for axis 0 with size 0", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m tau, tau_star, k, journey = raptor_standard(p_s, p_t, tau_0, \n\u001b[0;32m----> 7\u001b[0;31m routes = routes_real, routeStops = routeStops_real, stopTimes = stopTimes_real, stopRoutes = stopRoutes_real, stops = stops_real)\n\u001b[0m", + "\u001b[0;32m\u001b[0m in \u001b[0;36mraptor_standard\u001b[0;34m(p_s, p_t, tau_0, routes, routeStops, stopTimes, stopRoutes, stops, k_max)\u001b[0m\n\u001b[1;32m 64\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0;31m# we only traverse the route starting at p, not from the beginning of the route\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 66\u001b[0;31m for p_i in routeStops[routes[r][2]+np.where(routeStops[routes[r][2]:routes[r][2]+routes[r][1]] == p)[0][0]:\\\n\u001b[0m\u001b[1;32m 67\u001b[0m routes[r][2]+routes[r][1]]:\n\u001b[1;32m 68\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"p_i: {}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp_i\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mIndexError\u001b[0m: index 
0 is out of bounds for axis 0 with size 0" + ] + } + ], + "source": [ + "p_s_real = 10 # start stop = A\n", + "p_t = 4 # target stop = E\n", + "tau_0 = np.datetime64('2020-05-21T12:00:00') # departure time 08:05\n", + "k_max = 10 # we set a maximum number of transports to pre-allocate memory for the numpy array tau_i\n", + "\n", + "tau, tau_star, k, journey = raptor_standard(p_s, p_t, tau_0, \n", + " routes = routes_real, routeStops = routeStops_real, stopTimes = stopTimes_real, stopRoutes = stopRoutes_real, stops = stops_real)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Code for prototyping and debugging:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "p_s = 0 # start stop = A\n", + "p_t = 4 # target stop = E\n", + "tau_0 = np.datetime64('2020-05-11T08:05') # departure time 08:05\n", + "k_max = 10 # we set a maximum number of transports to pre-allocate memory for the numpy array tau_i" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# initialization\n", + "n_stops = len(stops)\n", + "\n", + "# earliest arrival time at each stop for each round.\n", + "tau = np.full(shape=(k_max, n_stops), fill_value = np.datetime64('2100-01-01T00:00')) # 2100 instead of infinity # number of stops * max number of transports\n", + "\n", + "# earliest arrival time at each stop, indep. 
of round\n", + "tau_star = np.full(shape=n_stops, fill_value = np.datetime64('2100-01-01T00:00'))\n", + "\n", + "marked = [p_s]\n", + "q = []\n", + "tau[0, p_s] = tau_0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.where(routeStops[routes[r][2]:routes[r][2]+routes[r][1]] == p_i)[0][0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "routeStops[routes[r][2]:routes[r][2]+routes[r][1]] == p_i" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "p_i" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t_r_dep = stopTimes[routes[r][3]+\\\n", + " # offset corresponding to stop p_i in route r\n", + " np.where(routeStops[routes[r][2]:routes[r][2]+routes[r][1]] == p_i)[0][0] + \\\n", + " routes[r][1]*t_r][1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if np.where(routeStops[routes[1][2]:routes[1][2]+routes[1][1]] == 2) <\\\n", + "np.where(routeStops[routes[1][2]:routes[1][2]+routes[1][1]] == 3):\n", + " print(\"hello\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "routeStops[routes[1][2] + np.where(routeStops[routes[1][2]:routes[1][2]+routes[1][1]] == 2)[0][0]:routes[1][2]+routes[1][1]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "routeStops[routes[1][2] + np.where(routeStops[routes[1][2]:routes[1][2]+routes[1][1]] == 2)[0][0]:6]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "routeStops[routes[1][2]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"routeStops[np.where(routeStops[routes[1][2]:routes[1][2]+routes[1][1]] == 2)[0][0]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if True and \\\n", + " True:\n", + " print(\"hello\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tau[0][0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "stopTimes[3][1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "a = np.arange(1, 10)\n", + "a" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "a[1:10:2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stopTimes[routes[0][3]+\\\n", + " # offset corresponding to stop p_i in route r\n", + " np.where(routeStops[routes[0][2]:routes[0][2]+routes[0][1]] == 0)[0][0]:\\\n", + " # end of the trips of r\n", + " routes[0][3]+routes[0][0]*routes[0][1]:\\\n", + " # we can jump from the number of stops in r to find the next departure of route r at p_i\n", + " routes[0][1]\n", + " ]\n", + "# we may more simply loop through all trips, and stop as soon as the departure time is after the arrival time\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stopTimes[routes[0][3]+\\\n", + " # offset corresponding to stop p_i in route r\n", + " np.where(routeStops[routes[0][2]:routes[0][2]+routes[0][1]] == 0)[0][0]][1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stopTimes[routes[1][3]+\\\n", + " # offset corresponding to stop p_i in route r\n", + " np.where(routeStops[routes[1][2]:routes[1][2]+routes[1][1]] == 3)[0][0] + \\\n", + " 
routes[1][1]*1][1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# t_r is a trip that belongs to route r. t_r can take value 0 to routes[r][0]-1\n", + "t = None\n", + "r = 1\n", + "tau_k_1 = tau[0][0]\n", + "p_i = 3\n", + "\n", + "t_r = 0\n", + "while True:\n", + " \n", + " t_r_dep = stopTimes[routes[r][3]+\\\n", + " # offset corresponding to stop p_i in route r\n", + " np.where(routeStops[routes[r][2]:routes[r][2]+routes[r][1]] == p_i)[0][0] + \\\n", + " routes[r][1]*t_r][1]\n", + " \n", + " if t_r_dep > tau_k_1:\n", + " # retrieving the index of the departure time of the trip in stopTimes\n", + " #t = routes[r][3] + t_r * routes[r][1]\n", + " t = t_r\n", + " break\n", + " t_r += 1\n", + " # we could not hop on any trip at this stop\n", + " if t_r == routes[r][0]:\n", + " break\n", + " \n", + "print(\"done\")\n", + "print(t)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "r = 1\n", + "t = 1\n", + "p_i = 2\n", + "# 1st trip of route + offset for the right trip + offset for the right stop\n", + "stopTimes[routes[r][3] + t * routes[r][1] + np.where(routeStops[routes[r][2]:routes[r][2]+routes[r][1]] == p_i)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "d = []\n", + "not d" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "r = 1\n", + "t = 0\n", + "p_i = 4\n", + "arr_t_p_i = stopTimes[routes[r][3] + \\\n", + " t * routes[r][1] + \\\n", + " np.where(routeStops[routes[r][2]:routes[r][2]+routes[r][1]] == p_i)[0][0]][0]\n", + "arr_t_p_i" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.datetime64('NaT') > np.datetime64('2100-01-01')" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [ + "np.datetime64('NaT') < np.datetime64('2100-01-01')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,md,py:percent" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}