diff --git a/.gitattributes b/.gitattributes index 101dcef..699bf1a 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,37 +1,38 @@ data/distributions.pickle filter=lfs diff=lfs merge=lfs -text data/stop_times_array_version2.csv filter=lfs diff=lfs merge=lfs -text data/transfer_array_version2.csv filter=lfs diff=lfs merge=lfs -text data/routes_array_version2.csv filter=lfs diff=lfs merge=lfs -text data/route_stops_array_version2.csv filter=lfs diff=lfs merge=lfs -text data/stop_routes_array_version3.csv filter=lfs diff=lfs merge=lfs -text data/stops_array_version2.csv filter=lfs diff=lfs merge=lfs -text object.data filter=lfs diff=lfs merge=lfs -text *.pkl filter=lfs diff=lfs merge=lfs -text data/*.pkl filter=lfs diff=lfs merge=lfs -text data/dere.pkl filter=lfs diff=lfs merge=lfs -text data/transfer_array.pkl filter=lfs diff=lfs merge=lfs -text data/stops_array.pkl filter=lfs diff=lfs merge=lfs -text data/stop_times_array.pkl filter=lfs diff=lfs merge=lfs -text data/stop_routes_array.pkl filter=lfs diff=lfs merge=lfs -text data/routes_array.pkl filter=lfs diff=lfs merge=lfs -text data/route_stops_array.pkl filter=lfs diff=lfs merge=lfs -text data/route_stops_df.pkl filter=lfs diff=lfs merge=lfs -text data/routes_array_df.pkl filter=lfs diff=lfs merge=lfs -text data/stop_routes_df.pkl filter=lfs diff=lfs merge=lfs -text data/stop_times_df.pkl filter=lfs diff=lfs merge=lfs -text data/stops_df.pkl filter=lfs diff=lfs merge=lfs -text data/transfer_df.pkl filter=lfs diff=lfs merge=lfs -text data/distrib_recov_tab_stopID_hour.pkl.gz filter=lfs diff=lfs merge=lfs -text data/join_distribution_all.pkl.gz filter=lfs diff=lfs merge=lfs -text data/join_distribution_cumulative_p.pkl.gz filter=lfs diff=lfs merge=lfs -text data/join_distribution_cumulative_p_2.pkl.gz filter=lfs diff=lfs merge=lfs -text data/route_stops_array_cyril.pkl filter=lfs diff=lfs merge=lfs -text data/route_stops_df_cyril.pkl filter=lfs diff=lfs merge=lfs -text data/routes_array_cyril.pkl filter=lfs 
diff=lfs merge=lfs -text data/routes_array_df_cyril.pkl filter=lfs diff=lfs merge=lfs -text data/stop_routes_array_cyril.pkl filter=lfs diff=lfs merge=lfs -text data/stop_routes_df_cyril.pkl filter=lfs diff=lfs merge=lfs -text data/stop_times_array_cyril.pkl filter=lfs diff=lfs merge=lfs -text data/transfer_array_cyril.pkl filter=lfs diff=lfs merge=lfs -text data/transfer_df_cyril.pkl filter=lfs diff=lfs merge=lfs -text data/stops_array_cyril.pkl filter=lfs diff=lfs merge=lfs -text +data/stop_times_df_cyril.pkl filter=lfs diff=lfs merge=lfs -text diff --git a/data/route_stops_array.pkl b/data/route_stops_array.pkl deleted file mode 100644 index 3a462f4..0000000 --- a/data/route_stops_array.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:33781be33c690045bf3ac6f60923baf31e26b2f06d4af4bfc06f0073b009b105 -size 62951 diff --git a/data/route_stops_array2.pkl b/data/route_stops_array2.pkl deleted file mode 100644 index 6103926..0000000 --- a/data/route_stops_array2.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec9af358ad5e0802d624b2795ac20f3282fb689c21752d71f9fef698cbc3785b -size 1899785 diff --git a/data/route_stops_df.pkl b/data/route_stops_df.pkl deleted file mode 100644 index 540c4ae..0000000 --- a/data/route_stops_df.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a55d740bc2affad2b88f0e0a6725b3a8edf8e41feda7e65858bc7b2466c7c997 -size 189233 diff --git a/data/route_stops_df_cyril.pkl b/data/route_stops_df_cyril.pkl index d22734d..70461ea 100644 --- a/data/route_stops_df_cyril.pkl +++ b/data/route_stops_df_cyril.pkl @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c62d0046e831eee39e5ee0605aa0eac60d6bad85c0c5d386d91ee81380216265 +oid sha256:3f86289015b10f212f8e2415b29cf5bd584a575fdd860d701cef87a46edb0bb8 size 369113 diff --git a/data/routes_array.pkl b/data/routes_array.pkl deleted file mode 100644 index 8c462d7..0000000 
--- a/data/routes_array.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4edf41375449fd8f7c047b87e7eefabf62ce9bcd207501afffcb41289b3f0a38 -size 8349 diff --git a/data/routes_array2.pkl b/data/routes_array2.pkl deleted file mode 100644 index dd063c1..0000000 --- a/data/routes_array2.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:76c403daeb38714084b99e1ecc0dd151ac2977d3445120df958715ebaf3478d8 -size 206451 diff --git a/data/routes_array_df.pkl b/data/routes_array_df.pkl deleted file mode 100644 index af1e073..0000000 --- a/data/routes_array_df.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7383cccd985e005f6d42317719be584862bacb677b45da30fd478fa92ab6a4a4 -size 24047 diff --git a/data/stop_routes_array.pkl b/data/stop_routes_array.pkl deleted file mode 100644 index 3b3b087..0000000 --- a/data/stop_routes_array.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dacc5dff452323522fc787b66a55e8704e72726298d6de190e8a6a2f493dd48a -size 62951 diff --git a/data/stop_routes_df.pkl b/data/stop_routes_df.pkl deleted file mode 100644 index 0db7ae1..0000000 --- a/data/stop_routes_df.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e47a3f2e849d4d5a2e20f232ff02c4f08e78c996b934a80af06d152425541ab2 -size 252167 diff --git a/data/stop_routes_df_cyril.pkl b/data/stop_routes_df_cyril.pkl index 960efe5..484e287 100644 --- a/data/stop_routes_df_cyril.pkl +++ b/data/stop_routes_df_cyril.pkl @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b563d73903bf423b3c49fa4987691f28ee05261baad5595f19c46b56cf2a382f +oid sha256:53fe1e7c5b985f8d14528085832799ded6cf202c28872d6ff1e63953c2c20716 size 538099 diff --git a/data/stop_times_array.pkl b/data/stop_times_array.pkl deleted file mode 100644 index 8e5c056..0000000 --- a/data/stop_times_array.pkl +++ /dev/null @@ -1,3 +0,0 @@ 
-version https://git-lfs.github.com/spec/v1 -oid sha256:3c38ce272796cd8dcc9f560ba57840e43ce167893d43f4840087be1652bd25a4 -size 4167529 diff --git a/data/stop_times_array1.pkl b/data/stop_times_array1.pkl deleted file mode 100644 index b54a600..0000000 --- a/data/stop_times_array1.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9a5b61142e94d00f3e247423d80655e35601baffb7d63810446145e37bcdf775 -size 3931993 diff --git a/data/stop_times_array2.pkl b/data/stop_times_array2.pkl deleted file mode 100644 index b54a600..0000000 --- a/data/stop_times_array2.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9a5b61142e94d00f3e247423d80655e35601baffb7d63810446145e37bcdf775 -size 3931993 diff --git a/data/stop_times_array_cyril.pkl b/data/stop_times_array_cyril.pkl index 8e5c056..524a974 100644 --- a/data/stop_times_array_cyril.pkl +++ b/data/stop_times_array_cyril.pkl @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3c38ce272796cd8dcc9f560ba57840e43ce167893d43f4840087be1652bd25a4 +oid sha256:279ef5c4d19dc2f4d4194553ac95b57364efa7fdf19636831fcdc39d2b2a127d size 4167529 diff --git a/data/stop_times_df.pkl b/data/stop_times_df.pkl deleted file mode 100644 index 87e4d93..0000000 --- a/data/stop_times_df.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cd7f45ef2b0660ddc15bc6d752daa248ab85298e925c40bf4f393e6ae1edb084 -size 36954121 diff --git a/data/stop_times_df_cyril.pkl b/data/stop_times_df_cyril.pkl new file mode 100644 index 0000000..daea263 --- /dev/null +++ b/data/stop_times_df_cyril.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b44372accdcd34651a05e4ce97ff137a448fc5f8d87fb5ea97ffe1fd13a1ad0e +size 36173268 diff --git a/data/stops_array.pkl b/data/stops_array.pkl deleted file mode 100644 index 94cf569..0000000 --- a/data/stops_array.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:c26208697b3262e9ee15bd779c2c700b2f2fbf9cfd7e4af84679b0a5d1e4e538 -size 8397 diff --git a/data/stops_df.pkl b/data/stops_df.pkl deleted file mode 100644 index 4ac6057..0000000 --- a/data/stops_df.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fec751e9c11205049c3f1947efea0daeaf480b6abec757de949bfb799b11b79e -size 20373 diff --git a/data/transfer_array.pkl b/data/transfer_array.pkl deleted file mode 100644 index 0a980a5..0000000 --- a/data/transfer_array.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:aefdfcb53b474746b34a7585bb0439091929fbbaa2ee91223a61bedea4386684 -size 98045 diff --git a/data/transfer_df.pkl b/data/transfer_df.pkl deleted file mode 100644 index 6127bbb..0000000 --- a/data/transfer_df.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ff059f33c5f06de1afb7222683b4d58d01f279abb6025642d899319bc3fea3fe -size 763207 diff --git a/notebooks/Arrays_Cyrill_data.ipynb b/notebooks/Arrays_Cyrill_data.ipynb index c7cfa64..cd664e2 100644 --- a/notebooks/Arrays_Cyrill_data.ipynb +++ b/notebooks/Arrays_Cyrill_data.ipynb @@ -1,5236 +1,5236 @@ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Preprocessing part 2: preparing the arrays\n", "In this notebook we take 2 datasets prepared in spark: stop_times and transfers, and prepare them into the array format needed to run RAPTOR" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Outline\n", "In this notebook the following actions are performed:\n", "- create array" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Import packages" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import pickle\n", "import itertools" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read 
files\n", "Before running make sure the .csv files are in /data . If not run notebook \"transfer_to_local\"" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0route_idstop_id_generaltrip_idstop_idarrival_timedeparture_timestop_sequencestop_namestop_latstop_lontrip_headsigntrip_short_namedirection_iddeparture_first_stoproute_intstop_countstop_introute_desc
0026-66-j19-1859120517.TA.26-66-j19-1.1.H859120517:00:0017:00:003Zürich, Hürlimannplatz47.3650668.526539Zürich, Neubühl3870016:55:001225121317Bus
1126-66-j19-1859141517.TA.26-66-j19-1.1.H859141517:02:0017:02:004Zürich, Waffenplatzstrasse47.3614828.525749Zürich, Neubühl3870016:55:001225121267Bus
2226-66-j19-1859120417.TA.26-66-j19-1.1.H859120417:03:0017:03:005Zürich, Hügelstrasse47.3585438.526997Zürich, Neubühl3870016:55:0012251267Bus
3326-66-j19-1859109817.TA.26-66-j19-1.1.H859109817:04:0017:04:006Zürich, Brunau/Mutschellenstr.47.3551478.527141Zürich, Neubühl3870016:55:00122512512Bus
4426-66-j19-1859139217.TA.26-66-j19-1.1.H859139217:05:0017:05:007Zürich, Thujastrasse47.3501878.527806Zürich, Neubühl3870016:55:00122512403Bus
\n", "
" ], "text/plain": [ " Unnamed: 0 route_id stop_id_general trip_id stop_id \\\n", "0 0 26-66-j19-1 8591205 17.TA.26-66-j19-1.1.H 8591205 \n", "1 1 26-66-j19-1 8591415 17.TA.26-66-j19-1.1.H 8591415 \n", "2 2 26-66-j19-1 8591204 17.TA.26-66-j19-1.1.H 8591204 \n", "3 3 26-66-j19-1 8591098 17.TA.26-66-j19-1.1.H 8591098 \n", "4 4 26-66-j19-1 8591392 17.TA.26-66-j19-1.1.H 8591392 \n", "\n", " arrival_time departure_time stop_sequence stop_name \\\n", "0 17:00:00 17:00:00 3 Zürich, Hürlimannplatz \n", "1 17:02:00 17:02:00 4 Zürich, Waffenplatzstrasse \n", "2 17:03:00 17:03:00 5 Zürich, Hügelstrasse \n", "3 17:04:00 17:04:00 6 Zürich, Brunau/Mutschellenstr. \n", "4 17:05:00 17:05:00 7 Zürich, Thujastrasse \n", "\n", " stop_lat stop_lon trip_headsign trip_short_name direction_id \\\n", "0 47.365066 8.526539 Zürich, Neubühl 3870 0 \n", "1 47.361482 8.525749 Zürich, Neubühl 3870 0 \n", "2 47.358543 8.526997 Zürich, Neubühl 3870 0 \n", "3 47.355147 8.527141 Zürich, Neubühl 3870 0 \n", "4 47.350187 8.527806 Zürich, Neubühl 3870 0 \n", "\n", " departure_first_stop route_int stop_count stop_int route_desc \n", "0 16:55:00 1225 12 1317 Bus \n", "1 16:55:00 1225 12 1267 Bus \n", "2 16:55:00 1225 12 67 Bus \n", "3 16:55:00 1225 12 512 Bus \n", "4 16:55:00 1225 12 403 Bus " ] }, - "execution_count": 2, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#stop_times\n", "stop_times_curated = pd.read_csv(\"../data/stop_times_final_cyril.csv\")\n", "stop_times_curated.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We drop columns not useful to us" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "stop_times_curated = stop_times_curated.drop(columns=[\"Unnamed: 0\"])" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0stop_idstop_id2distanceTransfer_time_secstop_namestop_name2
00850092685906160.122430146Oetwil a.d.L., SchweizäckerGeroldswil, Schweizäcker
11850092685907370.300175360Oetwil a.d.L., SchweizäckerOetwil an der Limmat, Halde
2285021868502186:0:10.0067628Dietikon StoffelbachDietikon Stoffelbach
3385021868502186:0:20.01352416Dietikon StoffelbachDietikon Stoffelbach
4485021868502186P0.0000000Dietikon StoffelbachDietikon Stoffelbach
\n", "
" ], "text/plain": [ " Unnamed: 0 stop_id stop_id2 distance Transfer_time_sec \\\n", "0 0 8500926 8590616 0.122430 146 \n", "1 1 8500926 8590737 0.300175 360 \n", "2 2 8502186 8502186:0:1 0.006762 8 \n", "3 3 8502186 8502186:0:2 0.013524 16 \n", "4 4 8502186 8502186P 0.000000 0 \n", "\n", " stop_name stop_name2 \n", "0 Oetwil a.d.L., Schweizäcker Geroldswil, Schweizäcker \n", "1 Oetwil a.d.L., Schweizäcker Oetwil an der Limmat, Halde \n", "2 Dietikon Stoffelbach Dietikon Stoffelbach \n", "3 Dietikon Stoffelbach Dietikon Stoffelbach \n", "4 Dietikon Stoffelbach Dietikon Stoffelbach " ] }, - "execution_count": 4, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#transfers\n", "transfers = pd.read_csv(\"../data/transfers.csv\")\n", "transfers.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Transfer: delete transfer to same stop & get stop_int & stop_int2\n" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "12564" ] }, - "execution_count": 5, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#check number stops transfers\n", "transfers.stop_id.count()" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0stop_idstop_id2distanceTransfer_time_secstop_namestop_name2
00850092685906160.122430146Oetwil a.d.L., SchweizäckerGeroldswil, Schweizäcker
11850092685907370.300175360Oetwil a.d.L., SchweizäckerOetwil an der Limmat, Halde
2285021868502186:0:10.0067628Dietikon StoffelbachDietikon Stoffelbach
3385021868502186:0:20.01352416Dietikon StoffelbachDietikon Stoffelbach
4485021868502186P0.0000000Dietikon StoffelbachDietikon Stoffelbach
\n", "
" ], "text/plain": [ " Unnamed: 0 stop_id stop_id2 distance Transfer_time_sec \\\n", "0 0 8500926 8590616 0.122430 146 \n", "1 1 8500926 8590737 0.300175 360 \n", "2 2 8502186 8502186:0:1 0.006762 8 \n", "3 3 8502186 8502186:0:2 0.013524 16 \n", "4 4 8502186 8502186P 0.000000 0 \n", "\n", " stop_name stop_name2 \n", "0 Oetwil a.d.L., Schweizäcker Geroldswil, Schweizäcker \n", "1 Oetwil a.d.L., Schweizäcker Oetwil an der Limmat, Halde \n", "2 Dietikon Stoffelbach Dietikon Stoffelbach \n", "3 Dietikon Stoffelbach Dietikon Stoffelbach \n", "4 Dietikon Stoffelbach Dietikon Stoffelbach " ] }, - "execution_count": 6, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transfers.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We delete transfers to the same stop" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "transfers_df = transfers[transfers['stop_id'] != transfers['stop_id2']]" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "12564" ] }, - "execution_count": 8, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transfers_df.stop_id.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We create the stop_int column in transfers. This action eliminates stops not in stop_times" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
route_idstop_id_generaltrip_idstop_idarrival_timedeparture_timestop_sequencestop_namestop_latstop_lontrip_headsigntrip_short_namedirection_iddeparture_first_stoproute_intstop_countstop_introute_desc
026-66-j19-1859120517.TA.26-66-j19-1.1.H859120517:00:0017:00:003Zürich, Hürlimannplatz47.3650668.526539Zürich, Neubühl3870016:55:001225121317Bus
126-66-j19-1859141517.TA.26-66-j19-1.1.H859141517:02:0017:02:004Zürich, Waffenplatzstrasse47.3614828.525749Zürich, Neubühl3870016:55:001225121267Bus
226-66-j19-1859120417.TA.26-66-j19-1.1.H859120417:03:0017:03:005Zürich, Hügelstrasse47.3585438.526997Zürich, Neubühl3870016:55:0012251267Bus
326-66-j19-1859109817.TA.26-66-j19-1.1.H859109817:04:0017:04:006Zürich, Brunau/Mutschellenstr.47.3551478.527141Zürich, Neubühl3870016:55:00122512512Bus
426-66-j19-1859139217.TA.26-66-j19-1.1.H859139217:05:0017:05:007Zürich, Thujastrasse47.3501878.527806Zürich, Neubühl3870016:55:00122512403Bus
\n", "
" ], "text/plain": [ " route_id stop_id_general trip_id stop_id arrival_time \\\n", "0 26-66-j19-1 8591205 17.TA.26-66-j19-1.1.H 8591205 17:00:00 \n", "1 26-66-j19-1 8591415 17.TA.26-66-j19-1.1.H 8591415 17:02:00 \n", "2 26-66-j19-1 8591204 17.TA.26-66-j19-1.1.H 8591204 17:03:00 \n", "3 26-66-j19-1 8591098 17.TA.26-66-j19-1.1.H 8591098 17:04:00 \n", "4 26-66-j19-1 8591392 17.TA.26-66-j19-1.1.H 8591392 17:05:00 \n", "\n", " departure_time stop_sequence stop_name stop_lat \\\n", "0 17:00:00 3 Zürich, Hürlimannplatz 47.365066 \n", "1 17:02:00 4 Zürich, Waffenplatzstrasse 47.361482 \n", "2 17:03:00 5 Zürich, Hügelstrasse 47.358543 \n", "3 17:04:00 6 Zürich, Brunau/Mutschellenstr. 47.355147 \n", "4 17:05:00 7 Zürich, Thujastrasse 47.350187 \n", "\n", " stop_lon trip_headsign trip_short_name direction_id \\\n", "0 8.526539 Zürich, Neubühl 3870 0 \n", "1 8.525749 Zürich, Neubühl 3870 0 \n", "2 8.526997 Zürich, Neubühl 3870 0 \n", "3 8.527141 Zürich, Neubühl 3870 0 \n", "4 8.527806 Zürich, Neubühl 3870 0 \n", "\n", " departure_first_stop route_int stop_count stop_int route_desc \n", "0 16:55:00 1225 12 1317 Bus \n", "1 16:55:00 1225 12 1267 Bus \n", "2 16:55:00 1225 12 67 Bus \n", "3 16:55:00 1225 12 512 Bus \n", "4 16:55:00 1225 12 403 Bus " ] }, - "execution_count": 9, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_int = stop_times_curated\n", "stop_times_int.head(5)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "transfers_df = transfers_df.merge(stop_times_int[[\"stop_id\", \"stop_int\"]].set_index(\"stop_id\"), how=\"inner\", on = \"stop_id\").drop_duplicates()" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "10707" ] }, - "execution_count": 11, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"transfers_df.stop_id.count()" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0stop_idstop_id2distanceTransfer_time_secstop_namestop_name2stop_int
00850092685906160.122430146Oetwil a.d.L., SchweizäckerGeroldswil, Schweizäcker1392
381850092685907370.300175360Oetwil a.d.L., SchweizäckerOetwil an der Limmat, Halde1392
7698502186:0:185021860.0067628Dietikon StoffelbachDietikon Stoffelbach1394
128108502186:0:18502186:0:20.0067628Dietikon StoffelbachDietikon Stoffelbach1394
180118502186:0:18502186P0.0067628Dietikon StoffelbachDietikon Stoffelbach1394
\n", "
" ], "text/plain": [ " Unnamed: 0 stop_id stop_id2 distance Transfer_time_sec \\\n", "0 0 8500926 8590616 0.122430 146 \n", "38 1 8500926 8590737 0.300175 360 \n", "76 9 8502186:0:1 8502186 0.006762 8 \n", "128 10 8502186:0:1 8502186:0:2 0.006762 8 \n", "180 11 8502186:0:1 8502186P 0.006762 8 \n", "\n", " stop_name stop_name2 stop_int \n", "0 Oetwil a.d.L., Schweizäcker Geroldswil, Schweizäcker 1392 \n", "38 Oetwil a.d.L., Schweizäcker Oetwil an der Limmat, Halde 1392 \n", "76 Dietikon Stoffelbach Dietikon Stoffelbach 1394 \n", "128 Dietikon Stoffelbach Dietikon Stoffelbach 1394 \n", "180 Dietikon Stoffelbach Dietikon Stoffelbach 1394 " ] }, - "execution_count": 12, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transfers_df.head(5)" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stop_id2stop_int_2
085912051317
185914151267
2859120467
38591098512
48591392403
\n", "
" ], "text/plain": [ " stop_id2 stop_int_2\n", "0 8591205 1317\n", "1 8591415 1267\n", "2 8591204 67\n", "3 8591098 512\n", "4 8591392 403" ] }, - "execution_count": 13, + "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#create dataframe with stops\n", "df_stop_int2 = stop_times_int[[\"stop_id\", \"stop_int\"]].rename(columns={\"stop_id\": \"stop_id2\", \"stop_int\" : \"stop_int_2\"})\n", "df_stop_int2.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We add the the stop id for the arrival destination, stop_int2" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "transfers_df_int = transfers_df.merge(df_stop_int2.set_index(\"stop_id2\"), how=\"inner\", on = \"stop_id2\").drop_duplicates()" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0stop_idstop_id2distanceTransfer_time_secstop_namestop_name2stop_intstop_int_2
00850092685906160.122430146Oetwil a.d.L., SchweizäckerGeroldswil, Schweizäcker13921310
378193859061885906160.412676495Geroldswil, ZentrumGeroldswil, Schweizäcker5901310
748821859073785906160.422521507Oetwil an der Limmat, HaldeGeroldswil, Schweizäcker9011310
1111850092685907370.300175360Oetwil a.d.L., SchweizäckerOetwil an der Limmat, Halde1392901
1868189859061685907370.422521507Geroldswil, SchweizäckerOetwil an der Limmat, Halde1310901
\n", "
" ], "text/plain": [ " Unnamed: 0 stop_id stop_id2 distance Transfer_time_sec \\\n", "0 0 8500926 8590616 0.122430 146 \n", "37 8193 8590618 8590616 0.412676 495 \n", "74 8821 8590737 8590616 0.422521 507 \n", "111 1 8500926 8590737 0.300175 360 \n", "186 8189 8590616 8590737 0.422521 507 \n", "\n", " stop_name stop_name2 stop_int \\\n", "0 Oetwil a.d.L., Schweizäcker Geroldswil, Schweizäcker 1392 \n", "37 Geroldswil, Zentrum Geroldswil, Schweizäcker 590 \n", "74 Oetwil an der Limmat, Halde Geroldswil, Schweizäcker 901 \n", "111 Oetwil a.d.L., Schweizäcker Oetwil an der Limmat, Halde 1392 \n", "186 Geroldswil, Schweizäcker Oetwil an der Limmat, Halde 1310 \n", "\n", " stop_int_2 \n", "0 1310 \n", "37 1310 \n", "74 1310 \n", "111 901 \n", "186 901 " ] }, - "execution_count": 15, + "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transfers_df_int.head(5)" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "9434" ] }, - "execution_count": 16, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transfers_df_int.stop_id.count()" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "transfers = transfers_df_int" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1503" ] }, - "execution_count": 18, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#check number unique stops2 in transfers\n", "transfers.stop_id2.nunique()" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1503" ] }, - "execution_count": 19, + "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transfers.stop_id.nunique()" ] }, { 
"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
route_idstop_id_generaltrip_idstop_idarrival_timedeparture_timestop_sequencestop_namestop_latstop_lontrip_headsigntrip_short_namedirection_iddeparture_first_stoproute_intstop_countstop_introute_desc
026-66-j19-1859120517.TA.26-66-j19-1.1.H859120517:00:0017:00:003Zürich, Hürlimannplatz47.3650668.526539Zürich, Neubühl3870016:55:001225121317Bus
126-66-j19-1859141517.TA.26-66-j19-1.1.H859141517:02:0017:02:004Zürich, Waffenplatzstrasse47.3614828.525749Zürich, Neubühl3870016:55:001225121267Bus
226-66-j19-1859120417.TA.26-66-j19-1.1.H859120417:03:0017:03:005Zürich, Hügelstrasse47.3585438.526997Zürich, Neubühl3870016:55:0012251267Bus
326-66-j19-1859109817.TA.26-66-j19-1.1.H859109817:04:0017:04:006Zürich, Brunau/Mutschellenstr.47.3551478.527141Zürich, Neubühl3870016:55:00122512512Bus
426-66-j19-1859139217.TA.26-66-j19-1.1.H859139217:05:0017:05:007Zürich, Thujastrasse47.3501878.527806Zürich, Neubühl3870016:55:00122512403Bus
\n", "
" ], "text/plain": [ " route_id stop_id_general trip_id stop_id arrival_time \\\n", "0 26-66-j19-1 8591205 17.TA.26-66-j19-1.1.H 8591205 17:00:00 \n", "1 26-66-j19-1 8591415 17.TA.26-66-j19-1.1.H 8591415 17:02:00 \n", "2 26-66-j19-1 8591204 17.TA.26-66-j19-1.1.H 8591204 17:03:00 \n", "3 26-66-j19-1 8591098 17.TA.26-66-j19-1.1.H 8591098 17:04:00 \n", "4 26-66-j19-1 8591392 17.TA.26-66-j19-1.1.H 8591392 17:05:00 \n", "\n", " departure_time stop_sequence stop_name stop_lat \\\n", "0 17:00:00 3 Zürich, Hürlimannplatz 47.365066 \n", "1 17:02:00 4 Zürich, Waffenplatzstrasse 47.361482 \n", "2 17:03:00 5 Zürich, Hügelstrasse 47.358543 \n", "3 17:04:00 6 Zürich, Brunau/Mutschellenstr. 47.355147 \n", "4 17:05:00 7 Zürich, Thujastrasse 47.350187 \n", "\n", " stop_lon trip_headsign trip_short_name direction_id \\\n", "0 8.526539 Zürich, Neubühl 3870 0 \n", "1 8.525749 Zürich, Neubühl 3870 0 \n", "2 8.526997 Zürich, Neubühl 3870 0 \n", "3 8.527141 Zürich, Neubühl 3870 0 \n", "4 8.527806 Zürich, Neubühl 3870 0 \n", "\n", " departure_first_stop route_int stop_count stop_int route_desc \n", "0 16:55:00 1225 12 1317 Bus \n", "1 16:55:00 1225 12 1267 Bus \n", "2 16:55:00 1225 12 67 Bus \n", "3 16:55:00 1225 12 512 Bus \n", "4 16:55:00 1225 12 403 Bus " ] }, - "execution_count": 20, + "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_ordered = stop_times_curated\n", "stop_times_ordered.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We start by making sure the order is correct" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 54, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
route_idstop_id_generaltrip_idstop_idarrival_timedeparture_timestop_sequencestop_namestop_latstop_lontrip_headsigntrip_short_namedirection_iddeparture_first_stoproute_intstop_countstop_introute_desc
9311126-10-j19-185732051672.TA.26-10-j19-1.11.R857320507:00:0007:01:0027Zürich Flughafen, Bahnhof47.4504418.563729Zürich Flughafen, Fracht4096107:01:0002298Tram
9311226-10-j19-185885531672.TA.26-10-j19-1.11.R858855307:02:0007:02:0028Zürich Flughafen, Fracht47.4524948.572057Zürich Flughafen, Fracht4096107:01:00021295Tram
9311326-13-j19-185762402064.TA.26-13-j19-1.24.H857624007:00:0007:00:005Zürich, Meierhofplatz47.4020108.499374Zürich, Albisgütli1831007:00:001261222Tram
9311426-13-j19-185913532064.TA.26-13-j19-1.24.H859135307:01:0007:01:006Zürich, Schwert47.3997308.504611Zürich, Albisgütli1831007:00:00126816Tram
9311526-13-j19-185910392064.TA.26-13-j19-1.24.H859103907:02:0007:02:007Zürich, Alte Trotte47.3977668.507252Zürich, Albisgütli1831007:00:00126778Tram
\n", "
" ], "text/plain": [ " route_id stop_id_general trip_id stop_id \\\n", "93111 26-10-j19-1 8573205 1672.TA.26-10-j19-1.11.R 8573205 \n", "93112 26-10-j19-1 8588553 1672.TA.26-10-j19-1.11.R 8588553 \n", "93113 26-13-j19-1 8576240 2064.TA.26-13-j19-1.24.H 8576240 \n", "93114 26-13-j19-1 8591353 2064.TA.26-13-j19-1.24.H 8591353 \n", "93115 26-13-j19-1 8591039 2064.TA.26-13-j19-1.24.H 8591039 \n", "\n", " arrival_time departure_time stop_sequence stop_name \\\n", "93111 07:00:00 07:01:00 27 Zürich Flughafen, Bahnhof \n", "93112 07:02:00 07:02:00 28 Zürich Flughafen, Fracht \n", "93113 07:00:00 07:00:00 5 Zürich, Meierhofplatz \n", "93114 07:01:00 07:01:00 6 Zürich, Schwert \n", "93115 07:02:00 07:02:00 7 Zürich, Alte Trotte \n", "\n", " stop_lat stop_lon trip_headsign trip_short_name \\\n", "93111 47.450441 8.563729 Zürich Flughafen, Fracht 4096 \n", "93112 47.452494 8.572057 Zürich Flughafen, Fracht 4096 \n", "93113 47.402010 8.499374 Zürich, Albisgütli 1831 \n", "93114 47.399730 8.504611 Zürich, Albisgütli 1831 \n", "93115 47.397766 8.507252 Zürich, Albisgütli 1831 \n", "\n", " direction_id departure_first_stop route_int stop_count stop_int \\\n", "93111 1 07:01:00 0 2 298 \n", "93112 1 07:01:00 0 2 1295 \n", "93113 0 07:00:00 1 26 1222 \n", "93114 0 07:00:00 1 26 816 \n", "93115 0 07:00:00 1 26 778 \n", "\n", " route_desc \n", "93111 Tram \n", "93112 Tram \n", "93113 Tram \n", "93114 Tram \n", "93115 Tram " ] }, - "execution_count": 23, + "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "stop_times_ordered = stop_times_int.sort_values(by=[\"route_int\", \"departure_first_stop\", \"stop_sequence\"])\n", + "stop_times_ordered = stop_times_int.sort_values(by=[\"route_int\", \"departure_first_stop\", \"departure_time\"])\n", "stop_times_ordered.head(5)" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
arrival_timedeparture_time
9311107:00:0007:01:00
9311207:02:0007:02:00
9311307:00:0007:00:00
9311407:01:0007:01:00
9311507:02:0007:02:00
\n", "
" ], "text/plain": [ " arrival_time departure_time\n", "93111 07:00:00 07:01:00\n", "93112 07:02:00 07:02:00\n", "93113 07:00:00 07:00:00\n", "93114 07:01:00 07:01:00\n", "93115 07:02:00 07:02:00" ] }, - "execution_count": 24, + "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_ordered[[\"arrival_time\", \"departure_time\"]].head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We add None to first arrival time and last departure time." ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
route_idstop_id_generaltrip_idstop_idarrival_timedeparture_timestop_sequencestop_namestop_latstop_lontrip_headsigntrip_short_namedirection_iddeparture_first_stoproute_intstop_countstop_introute_descsequence_shift_1departure_first_shift_1
9311126-10-j19-185732051672.TA.26-10-j19-1.11.R857320507:00:0007:01:0027Zürich Flughafen, Bahnhof47.4504418.563729Zürich Flughafen, Fracht4096107:01:0002298Tram2807:01:00
9311226-10-j19-185885531672.TA.26-10-j19-1.11.R858855307:02:0007:02:0028Zürich Flughafen, Fracht47.4524948.572057Zürich Flughafen, Fracht4096107:01:00021295Tram507:00:00
9311326-13-j19-185762402064.TA.26-13-j19-1.24.H857624007:00:0007:00:005Zürich, Meierhofplatz47.4020108.499374Zürich, Albisgütli1831007:00:001261222Tram607:00:00
9311426-13-j19-185913532064.TA.26-13-j19-1.24.H859135307:01:0007:01:006Zürich, Schwert47.3997308.504611Zürich, Albisgütli1831007:00:00126816Tram707:00:00
9311526-13-j19-185910392064.TA.26-13-j19-1.24.H859103907:02:0007:02:007Zürich, Alte Trotte47.3977668.507252Zürich, Albisgütli1831007:00:00126778Tram807:00:00
\n", "
" ], "text/plain": [ " route_id stop_id_general trip_id stop_id \\\n", "93111 26-10-j19-1 8573205 1672.TA.26-10-j19-1.11.R 8573205 \n", "93112 26-10-j19-1 8588553 1672.TA.26-10-j19-1.11.R 8588553 \n", "93113 26-13-j19-1 8576240 2064.TA.26-13-j19-1.24.H 8576240 \n", "93114 26-13-j19-1 8591353 2064.TA.26-13-j19-1.24.H 8591353 \n", "93115 26-13-j19-1 8591039 2064.TA.26-13-j19-1.24.H 8591039 \n", "\n", " arrival_time departure_time stop_sequence stop_name \\\n", "93111 07:00:00 07:01:00 27 Zürich Flughafen, Bahnhof \n", "93112 07:02:00 07:02:00 28 Zürich Flughafen, Fracht \n", "93113 07:00:00 07:00:00 5 Zürich, Meierhofplatz \n", "93114 07:01:00 07:01:00 6 Zürich, Schwert \n", "93115 07:02:00 07:02:00 7 Zürich, Alte Trotte \n", "\n", " stop_lat stop_lon trip_headsign trip_short_name \\\n", "93111 47.450441 8.563729 Zürich Flughafen, Fracht 4096 \n", "93112 47.452494 8.572057 Zürich Flughafen, Fracht 4096 \n", "93113 47.402010 8.499374 Zürich, Albisgütli 1831 \n", "93114 47.399730 8.504611 Zürich, Albisgütli 1831 \n", "93115 47.397766 8.507252 Zürich, Albisgütli 1831 \n", "\n", " direction_id departure_first_stop route_int stop_count stop_int \\\n", "93111 1 07:01:00 0 2 298 \n", "93112 1 07:01:00 0 2 1295 \n", "93113 0 07:00:00 1 26 1222 \n", "93114 0 07:00:00 1 26 816 \n", "93115 0 07:00:00 1 26 778 \n", "\n", - " route_desc sequence_shift_1 \n", - "93111 Tram 28 \n", - "93112 Tram 5 \n", - "93113 Tram 6 \n", - "93114 Tram 7 \n", - "93115 Tram 8 " + " route_desc departure_first_shift_1 \n", + "93111 Tram 07:01:00 \n", + "93112 Tram 07:00:00 \n", + "93113 Tram 07:00:00 \n", + "93114 Tram 07:00:00 \n", + "93115 Tram 07:00:00 " ] }, - "execution_count": 25, + "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#adding a shift\n", - "stop_times_ordered[\"sequence_shift_1\"] = stop_times_ordered[\"stop_sequence\"].shift(-1, fill_value=0)\n", + "stop_times_ordered[\"departure_first_shift_1\"] = 
stop_times_ordered[\"departure_first_stop\"].shift(-1, fill_value=0)\n", "stop_times_ordered.head(5)" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 57, "metadata": {}, "outputs": [], "source": [ - "stop_times_ordered['departure_time'] = np.where((stop_times_ordered[\"stop_sequence\"] > stop_times_ordered[\"sequence_shift_1\"]), None, stop_times_ordered['departure_time'])" + "stop_times_ordered['departure_time'] = np.where((stop_times_ordered[\"departure_first_stop\"] != stop_times_ordered[\"departure_first_shift_1\"]), None, stop_times_ordered['departure_time'])" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 58, "metadata": {}, "outputs": [], "source": [ - "stop_times_ordered[\"arrival_time\"] = np.where((stop_times_ordered[\"stop_sequence\"] == 1), None, stop_times_ordered['arrival_time'])" + "stop_times_ordered[\"arrival_time\"] = np.where((stop_times_ordered[\"departure_first_stop\"] == stop_times_ordered[\"departure_time\"]), None, stop_times_ordered['arrival_time'])" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", + " \n", " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", " \n", " \n", "
arrival_timedeparture_timestop_sequencesequence_shift_1
9311107:00:00None07:01:002728
9311207:02:00None285
93113None07:00:0007:00:0056
9311407:01:0007:01:0067
9311507:02:0007:02:0078
\n", "
" ], "text/plain": [ - " arrival_time departure_time stop_sequence sequence_shift_1\n", - "93111 07:00:00 07:01:00 27 28\n", - "93112 07:02:00 None 28 5\n", - "93113 07:00:00 07:00:00 5 6\n", - "93114 07:01:00 07:01:00 6 7\n", - "93115 07:02:00 07:02:00 7 8" + " arrival_time departure_time\n", + "93111 None 07:01:00\n", + "93112 07:02:00 None\n", + "93113 None 07:00:00\n", + "93114 07:01:00 07:01:00\n", + "93115 07:02:00 07:02:00" ] }, - "execution_count": 28, + "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "stop_times_ordered[[\"arrival_time\",\"departure_time\", \"stop_sequence\", \"sequence_shift_1\"]].head(5)" + "stop_times_ordered[[\"arrival_time\",\"departure_time\"]].head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Array structure preparation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### StopTimes: \n", "[[departure_route0_trip0_stop0, arrival_route0_trip0_stop_0], [departure_route0_trip0_stop1, arrival_route0_trip0_stop_1], …], [[departure_route0_trip1_stop0, arrival_route0_trip1_stop_0], …], ….], [[[departure_route1_trip0_stop0, arrival_route1_trip0_stop_0], …], [[departure_route1_trip1_stop0, arrival_route0_trip1_stop_0], …], ….], …]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We transform it in datetime as required by the raptor algorithm" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 62, "metadata": {}, "outputs": [], "source": [ "stop_times_ordered['arrival_time'] = pd.to_datetime(stop_times_ordered['arrival_time'])\n", "stop_times_ordered['departure_time'] = pd.to_datetime(stop_times_ordered['departure_time'])" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", "
arrival_timedeparture_time
931112020-05-22 07:00:002020-05-22 07:01:00NaT2020-05-23 07:01:00
931122020-05-22 07:02:002020-05-23 07:02:00NaT
931132020-05-22 07:00:002020-05-22 07:00:00NaT2020-05-23 07:00:00
931142020-05-22 07:01:002020-05-22 07:01:002020-05-23 07:01:002020-05-23 07:01:00
931152020-05-22 07:02:002020-05-22 07:02:002020-05-23 07:02:002020-05-23 07:02:00
\n", "
" ], "text/plain": [ " arrival_time departure_time\n", - "93111 2020-05-22 07:00:00 2020-05-22 07:01:00\n", - "93112 2020-05-22 07:02:00 NaT\n", - "93113 2020-05-22 07:00:00 2020-05-22 07:00:00\n", - "93114 2020-05-22 07:01:00 2020-05-22 07:01:00\n", - "93115 2020-05-22 07:02:00 2020-05-22 07:02:00" + "93111 NaT 2020-05-23 07:01:00\n", + "93112 2020-05-23 07:02:00 NaT\n", + "93113 NaT 2020-05-23 07:00:00\n", + "93114 2020-05-23 07:01:00 2020-05-23 07:01:00\n", + "93115 2020-05-23 07:02:00 2020-05-23 07:02:00" ] }, - "execution_count": 30, + "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_ordered[[\"arrival_time\", \"departure_time\"]].head(5)" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 129, "metadata": {}, "outputs": [], "source": [ - "with open('../data/stop_times_df.pkl','wb') as f: pickle.dump(stop_times_ordered, f)" + "with open('../data/stop_times_df_cyril.pkl','wb') as f: pickle.dump(stop_times_ordered, f)" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 66, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
route_idstop_id_generaltrip_idstop_idarrival_timedeparture_timestop_sequencestop_namestop_latstop_lontrip_headsigntrip_short_namedirection_iddeparture_first_stoproute_intstop_countstop_introute_descsequence_shift_1departure_first_shift_1
9311126-10-j19-185732051672.TA.26-10-j19-1.11.R85732052020-05-22 07:00:002020-05-22 07:01:00NaT2020-05-23 07:01:0027Zürich Flughafen, Bahnhof47.4504418.563729Zürich Flughafen, Fracht4096107:01:0002298Tram2807:01:00
9311226-10-j19-185885531672.TA.26-10-j19-1.11.R85885532020-05-22 07:02:002020-05-23 07:02:00NaT28Zürich Flughafen, Fracht47.4524948.572057Zürich Flughafen, Fracht4096107:01:00021295Tram507:00:00
9311326-13-j19-185762402064.TA.26-13-j19-1.24.H85762402020-05-22 07:00:002020-05-22 07:00:00NaT2020-05-23 07:00:005Zürich, Meierhofplatz47.4020108.499374Zürich, Albisgütli1831007:00:001261222Tram607:00:00
9311426-13-j19-185913532064.TA.26-13-j19-1.24.H85913532020-05-22 07:01:002020-05-22 07:01:002020-05-23 07:01:002020-05-23 07:01:006Zürich, Schwert47.3997308.504611Zürich, Albisgütli1831007:00:00126816Tram707:00:00
9311526-13-j19-185910392064.TA.26-13-j19-1.24.H85910392020-05-22 07:02:002020-05-22 07:02:002020-05-23 07:02:002020-05-23 07:02:007Zürich, Alte Trotte47.3977668.507252Zürich, Albisgütli1831007:00:00126778Tram807:00:00
\n", "
" ], "text/plain": [ " route_id stop_id_general trip_id stop_id \\\n", "93111 26-10-j19-1 8573205 1672.TA.26-10-j19-1.11.R 8573205 \n", "93112 26-10-j19-1 8588553 1672.TA.26-10-j19-1.11.R 8588553 \n", "93113 26-13-j19-1 8576240 2064.TA.26-13-j19-1.24.H 8576240 \n", "93114 26-13-j19-1 8591353 2064.TA.26-13-j19-1.24.H 8591353 \n", "93115 26-13-j19-1 8591039 2064.TA.26-13-j19-1.24.H 8591039 \n", "\n", " arrival_time departure_time stop_sequence \\\n", - "93111 2020-05-22 07:00:00 2020-05-22 07:01:00 27 \n", - "93112 2020-05-22 07:02:00 NaT 28 \n", - "93113 2020-05-22 07:00:00 2020-05-22 07:00:00 5 \n", - "93114 2020-05-22 07:01:00 2020-05-22 07:01:00 6 \n", - "93115 2020-05-22 07:02:00 2020-05-22 07:02:00 7 \n", + "93111 NaT 2020-05-23 07:01:00 27 \n", + "93112 2020-05-23 07:02:00 NaT 28 \n", + "93113 NaT 2020-05-23 07:00:00 5 \n", + "93114 2020-05-23 07:01:00 2020-05-23 07:01:00 6 \n", + "93115 2020-05-23 07:02:00 2020-05-23 07:02:00 7 \n", "\n", " stop_name stop_lat stop_lon \\\n", "93111 Zürich Flughafen, Bahnhof 47.450441 8.563729 \n", "93112 Zürich Flughafen, Fracht 47.452494 8.572057 \n", "93113 Zürich, Meierhofplatz 47.402010 8.499374 \n", "93114 Zürich, Schwert 47.399730 8.504611 \n", "93115 Zürich, Alte Trotte 47.397766 8.507252 \n", "\n", " trip_headsign trip_short_name direction_id \\\n", "93111 Zürich Flughafen, Fracht 4096 1 \n", "93112 Zürich Flughafen, Fracht 4096 1 \n", "93113 Zürich, Albisgütli 1831 0 \n", "93114 Zürich, Albisgütli 1831 0 \n", "93115 Zürich, Albisgütli 1831 0 \n", "\n", " departure_first_stop route_int stop_count stop_int route_desc \\\n", "93111 07:01:00 0 2 298 Tram \n", "93112 07:01:00 0 2 1295 Tram \n", "93113 07:00:00 1 26 1222 Tram \n", "93114 07:00:00 1 26 816 Tram \n", "93115 07:00:00 1 26 778 Tram \n", "\n", - " sequence_shift_1 \n", - "93111 28 \n", - "93112 5 \n", - "93113 6 \n", - "93114 7 \n", - "93115 8 " + " departure_first_shift_1 \n", + "93111 07:01:00 \n", + "93112 07:00:00 \n", + "93113 07:00:00 \n", + "93114 
07:00:00 \n", + "93115 07:00:00 " ] }, - "execution_count": 33, + "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_ordered = stop_times_ordered.sort_values(by=[\"route_int\", \"departure_first_stop\", \"stop_sequence\"])\n", "stop_times_ordered.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "And we transform it to array, ready ti be used by raptor" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 67, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([['2020-05-22T07:00:00.000000000', '2020-05-22T07:01:00.000000000'],\n", - " ['2020-05-22T07:02:00.000000000', 'NaT'],\n", - " ['2020-05-22T07:00:00.000000000', '2020-05-22T07:00:00.000000000'],\n", + "array([[ 'NaT', '2020-05-23T07:01:00.000000000'],\n", + " ['2020-05-23T07:02:00.000000000', 'NaT'],\n", + " [ 'NaT', '2020-05-23T07:00:00.000000000'],\n", " ...,\n", - " ['2020-05-22T07:35:00.000000000', '2020-05-22T07:35:00.000000000'],\n", - " ['2020-05-22T07:36:00.000000000', '2020-05-22T07:36:00.000000000'],\n", - " ['2020-05-22T07:37:00.000000000', 'NaT']],\n", + " ['2020-05-23T07:35:00.000000000', '2020-05-23T07:35:00.000000000'],\n", + " ['2020-05-23T07:36:00.000000000', '2020-05-23T07:36:00.000000000'],\n", + " ['2020-05-23T07:37:00.000000000', 'NaT']],\n", " dtype='datetime64[ns]')" ] }, - "execution_count": 34, + "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_array = stop_times_ordered[[\"arrival_time\", \"departure_time\"]].to_numpy()\n", "stop_times_array" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 68, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "260459" ] }, - "execution_count": 35, + "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.size(stop_times_array,0)" ] }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 69, "metadata": {}, 
"outputs": [], "source": [ "with open('../data/stop_times_array_cyril.pkl','wb') as f: pickle.dump(stop_times_array, f)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Routes: \n", "[[route0_nr.Trips, route0_nr. Stops, route0_pointerRoutes, route0_pointerStops_times],[route1_nr.Trips, route1_nr. Stops,, route1_pointerRoutes, route1_pointerStops_times],…]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We start by getting the number of trips and stops there is for each route" ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 70, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
n_Tripsn_stops
route_int
012
1126
218
3117
415
\n", "
" ], "text/plain": [ " n_Trips n_stops\n", "route_int \n", "0 1 2\n", "1 1 26\n", "2 1 8\n", "3 1 17\n", "4 1 5" ] }, - "execution_count": 40, + "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct_trips_stops = stop_times_ordered.groupby([\"route_int\"]).nunique()[[\"trip_id\",\"stop_int\"]].sort_index().rename(columns={\"trip_id\": \"n_Trips\", \"stop_int\": \"n_stops\"})\n", "distinct_trips_stops.head(5)" ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 71, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1461, 2)" ] }, - "execution_count": 41, + "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct_trips_stops.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We create the pointer for the route stops, by adding the unique stops for each route" ] }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 72, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
n_Tripsn_stopspointer_routes_stops
route_int
0120
11262
21828
311736
41553
\n", "
" ], "text/plain": [ " n_Trips n_stops pointer_routes_stops\n", "route_int \n", "0 1 2 0\n", "1 1 26 2\n", "2 1 8 28\n", "3 1 17 36\n", "4 1 5 53" ] }, - "execution_count": 42, + "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct_trips_stops['pointer_routes_stops'] = distinct_trips_stops.n_stops.cumsum().shift(1, fill_value=0)\n", "distinct_trips_stops.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We create the pointer for stop_times by adding the number of stops in each route, counting duplicates (due to several trips)" ] }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 73, "metadata": {}, "outputs": [], "source": [ "distinct_trips_stops[\"pointer_stop_times\"] = (stop_times_ordered.groupby([\"route_int\"]).count().stop_id).cumsum().shift(1, fill_value=0)" ] }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 74, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
n_Tripsn_stopspointer_routes_stopspointer_stop_timespointer_routes_stops_shiftpointer_stop_times_shift
route_int
0120022
1126222828
21828283636
311736365353
41553535858
\n", "
" ], "text/plain": [ " n_Trips n_stops pointer_routes_stops pointer_stop_times \\\n", "route_int \n", "0 1 2 0 0 \n", "1 1 26 2 2 \n", "2 1 8 28 28 \n", "3 1 17 36 36 \n", "4 1 5 53 53 \n", "\n", " pointer_routes_stops_shift pointer_stop_times_shift \n", "route_int \n", "0 2 2 \n", "1 28 28 \n", "2 36 36 \n", "3 53 53 \n", "4 58 58 " ] }, - "execution_count": 44, + "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct_trips_stops[\"pointer_routes_stops_shift\"] = distinct_trips_stops['pointer_routes_stops'].shift(-1, fill_value=0)\n", "distinct_trips_stops[\"pointer_stop_times_shift\"] = distinct_trips_stops['pointer_stop_times'].shift(-1, fill_value=0)\n", "distinct_trips_stops.head(5)" ] }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 75, "metadata": {}, "outputs": [], "source": [ "distinct_trips_stops['pointer_routes_stops'] = np.where((distinct_trips_stops[\"pointer_routes_stops\"] == distinct_trips_stops[\"pointer_routes_stops_shift\"]), None, distinct_trips_stops['pointer_routes_stops'])\n", "distinct_trips_stops['pointer_stop_times'] = np.where((distinct_trips_stops[\"pointer_stop_times\"] == distinct_trips_stops[\"pointer_stop_times_shift\"]), None, distinct_trips_stops['pointer_stop_times'])\n" ] }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 76, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "n_Trips False\n", "n_stops False\n", "pointer_routes_stops False\n", "pointer_stop_times False\n", "pointer_routes_stops_shift False\n", "pointer_stop_times_shift False\n", "dtype: bool" ] }, - "execution_count": 46, + "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct_trips_stops.isna().any()" ] }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 77, "metadata": {}, "outputs": [], "source": [ "with open('../data/routes_array_df_cyril.pkl','wb') as f: pickle.dump(distinct_trips_stops[['n_Trips', 
'n_stops', 'pointer_routes_stops', 'pointer_stop_times']], f)" ] }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 78, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 1461 entries, 0 to 1460\n", "Data columns (total 6 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 n_Trips 1461 non-null int64 \n", " 1 n_stops 1461 non-null int64 \n", " 2 pointer_routes_stops 1461 non-null object\n", " 3 pointer_stop_times 1461 non-null object\n", " 4 pointer_routes_stops_shift 1461 non-null int64 \n", " 5 pointer_stop_times_shift 1461 non-null int64 \n", "dtypes: int64(4), object(2)\n", "memory usage: 79.9+ KB\n" ] } ], "source": [ "distinct_trips_stops.info()" ] }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 79, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[1, 2, 0, 0],\n", " [1, 26, 2, 2],\n", " [1, 8, 28, 28],\n", " ...,\n", " [1, 3, 15297, 260396],\n", " [2, 16, 15300, 260399],\n", " [1, 28, 15316, 260431]], dtype=object)" ] }, - "execution_count": 49, + "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "routes_array = distinct_trips_stops[['n_Trips', 'n_stops', 'pointer_routes_stops', 'pointer_stop_times']].to_numpy()\n", "routes_array" ] }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 80, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1461" ] }, - "execution_count": 50, + "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.size(routes_array, 0)" ] }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 81, "metadata": {}, "outputs": [], "source": [ "with open('../data/routes_array_cyril.pkl','wb') as f: pickle.dump(routes_array, f)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "RouteStops: [route0_stop0, route0_stop1,…, route1_stop0, route1_stop1,…, …]\n" ] }, { 
"cell_type": "code", - "execution_count": 52, + "execution_count": 82, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexroute_intstop_int
0931110298
19311201295
29311311222
3931141816
4931151778
\n", "
" ], "text/plain": [ " index route_int stop_int\n", "0 93111 0 298\n", "1 93112 0 1295\n", "2 93113 1 1222\n", "3 93114 1 816\n", "4 93115 1 778" ] }, - "execution_count": 52, + "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "route_stops = stop_times_ordered.sort_values([\"route_int\", \"stop_sequence\"])\n", "route_stops = route_stops[['route_int', 'stop_int']].drop_duplicates().reset_index()\n", "route_stops.head(5)" ] }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 83, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 15344 entries, 0 to 15343\n", "Data columns (total 3 columns):\n", " # Column Non-Null Count Dtype\n", "--- ------ -------------- -----\n", " 0 index 15344 non-null int64\n", " 1 route_int 15344 non-null int64\n", " 2 stop_int 15344 non-null int64\n", "dtypes: int64(3)\n", "memory usage: 359.8 KB\n" ] } ], "source": [ "route_stops.info()" ] }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 84, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1461" ] }, - "execution_count": 54, + "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "route_stops.route_int.nunique()" ] }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 85, "metadata": {}, "outputs": [], "source": [ "with open('../data/route_stops_df_cyril.pkl','wb') as f: pickle.dump(route_stops, f)" ] }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 86, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 298, 1295, 1222, ..., 1349, 1042, 549])" ] }, - "execution_count": 56, + "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "route_stops_array = route_stops.stop_int.to_numpy()\n", "route_stops_array" ] }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 87, "metadata": {}, "outputs": [ { "data": { 
"text/plain": [ "1407" ] }, - "execution_count": 57, + "execution_count": 87, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.size(np.unique(route_stops_array))" ] }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 88, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "15344" ] }, - "execution_count": 58, + "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.size(route_stops_array, 0)" ] }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 89, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(15344,)" ] }, - "execution_count": 59, + "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "route_stops_array.shape" ] }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 90, "metadata": {}, "outputs": [], "source": [ "with open('../data/route_stops_array_cyril.pkl','wb') as f: pickle.dump(route_stops_array, f)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check if pointers are correct\n", "It is fundamental that the indexes, that serve as pointers, in Routes are correct" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We start by looking at where the indexes for stop_times and route_stops diverge. This will allow us to change. We can see that Route stops should have a new route at 3 while stop_times should have it at 78, so we try with that" ] }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 91, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
n_Tripsn_stopspointer_routes_stopspointer_stop_timespointer_routes_stops_shiftpointer_stop_times_shift
route_int
0120022
1126222828
21828283636
311736365353
41553535858
\n", "
" ], "text/plain": [ " n_Trips n_stops pointer_routes_stops pointer_stop_times \\\n", "route_int \n", "0 1 2 0 0 \n", "1 1 26 2 2 \n", "2 1 8 28 28 \n", "3 1 17 36 36 \n", "4 1 5 53 53 \n", "\n", " pointer_routes_stops_shift pointer_stop_times_shift \n", "route_int \n", "0 2 2 \n", "1 28 28 \n", "2 36 36 \n", "3 53 53 \n", "4 58 58 " ] }, - "execution_count": 61, + "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct_trips_stops.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can check if the pointer indicates the routes index number. At the pointer_routes should indicate the first stop of a new route. We try with 3 to see if route_stops has a new route at this index. It does so it works" ] }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 92, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexroute_intstop_int
0931110298
19311201295
29311311222
3931141816
4931151778
\n", "
" ], "text/plain": [ " index route_int stop_int\n", "0 93111 0 298\n", "1 93112 0 1295\n", "2 93113 1 1222\n", "3 93114 1 816\n", "4 93115 1 778" ] }, - "execution_count": 62, + "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ "route_stops.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We go and see if stop_times has a new route at 78. It does, so it works" ] }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 93, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
route_idstop_id_generaltrip_idstop_idarrival_timedeparture_timestop_sequencestop_namestop_latstop_lontrip_headsigntrip_short_namedirection_iddeparture_first_stoproute_intstop_countstop_introute_descsequence_shift_1departure_first_shift_1
7526-66-j19-185910988.TA.26-66-j19-1.1.H85910982020-05-22 18:04:002020-05-22 18:04:002020-05-23 18:04:002020-05-23 18:04:006Zürich, Brunau/Mutschellenstr.47.3551478.527141Zürich, Neubühl3762017:55:00122512512Bus717:55:00
7626-66-j19-185913928.TA.26-66-j19-1.1.H85913922020-05-22 18:05:002020-05-22 18:05:002020-05-23 18:05:002020-05-23 18:05:007Zürich, Thujastrasse47.3501878.527806Zürich, Neubühl3762017:55:00122512403Bus817:55:00
7726-66-j19-185912168.TA.26-66-j19-1.1.H85912162020-05-22 18:06:002020-05-22 18:06:002020-05-23 18:06:002020-05-23 18:06:008Zürich, Jugendherberge47.3480028.528210Zürich, Neubühl3762017:55:001225121375Bus917:55:00
7826-66-j19-185912798.TA.26-66-j19-1.1.H85912792020-05-22 18:08:002020-05-22 18:08:002020-05-23 18:08:002020-05-23 18:08:009Zürich, Morgental47.3439488.530141Zürich, Neubühl3762017:55:001225121349Bus1017:55:00
7926-66-j19-185912178.TA.26-66-j19-1.1.H85912172020-05-22 18:09:002020-05-22 18:09:002020-05-23 18:09:002020-05-23 18:09:0010Zürich, Kalchbühlweg47.3418188.531049Zürich, Neubühl3762017:55:001225121303Bus1117:55:00
\n", "
" ], "text/plain": [ " route_id stop_id_general trip_id stop_id \\\n", "75 26-66-j19-1 8591098 8.TA.26-66-j19-1.1.H 8591098 \n", "76 26-66-j19-1 8591392 8.TA.26-66-j19-1.1.H 8591392 \n", "77 26-66-j19-1 8591216 8.TA.26-66-j19-1.1.H 8591216 \n", "78 26-66-j19-1 8591279 8.TA.26-66-j19-1.1.H 8591279 \n", "79 26-66-j19-1 8591217 8.TA.26-66-j19-1.1.H 8591217 \n", "\n", " arrival_time departure_time stop_sequence \\\n", - "75 2020-05-22 18:04:00 2020-05-22 18:04:00 6 \n", - "76 2020-05-22 18:05:00 2020-05-22 18:05:00 7 \n", - "77 2020-05-22 18:06:00 2020-05-22 18:06:00 8 \n", - "78 2020-05-22 18:08:00 2020-05-22 18:08:00 9 \n", - "79 2020-05-22 18:09:00 2020-05-22 18:09:00 10 \n", + "75 2020-05-23 18:04:00 2020-05-23 18:04:00 6 \n", + "76 2020-05-23 18:05:00 2020-05-23 18:05:00 7 \n", + "77 2020-05-23 18:06:00 2020-05-23 18:06:00 8 \n", + "78 2020-05-23 18:08:00 2020-05-23 18:08:00 9 \n", + "79 2020-05-23 18:09:00 2020-05-23 18:09:00 10 \n", "\n", " stop_name stop_lat stop_lon trip_headsign \\\n", "75 Zürich, Brunau/Mutschellenstr. 
47.355147 8.527141 Zürich, Neubühl \n", "76 Zürich, Thujastrasse 47.350187 8.527806 Zürich, Neubühl \n", "77 Zürich, Jugendherberge 47.348002 8.528210 Zürich, Neubühl \n", "78 Zürich, Morgental 47.343948 8.530141 Zürich, Neubühl \n", "79 Zürich, Kalchbühlweg 47.341818 8.531049 Zürich, Neubühl \n", "\n", " trip_short_name direction_id departure_first_stop route_int stop_count \\\n", "75 3762 0 17:55:00 1225 12 \n", "76 3762 0 17:55:00 1225 12 \n", "77 3762 0 17:55:00 1225 12 \n", "78 3762 0 17:55:00 1225 12 \n", "79 3762 0 17:55:00 1225 12 \n", "\n", - " stop_int route_desc sequence_shift_1 \n", - "75 512 Bus 7 \n", - "76 403 Bus 8 \n", - "77 1375 Bus 9 \n", - "78 1349 Bus 10 \n", - "79 1303 Bus 11 " + " stop_int route_desc departure_first_shift_1 \n", + "75 512 Bus 17:55:00 \n", + "76 403 Bus 17:55:00 \n", + "77 1375 Bus 17:55:00 \n", + "78 1349 Bus 17:55:00 \n", + "79 1303 Bus 17:55:00 " ] }, - "execution_count": 63, + "execution_count": 93, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_ordered.loc[75:80].head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Stops: [[stop0_pointerRoutes, stop0_pointerTransfer], [stop1_pointerRoutes, stop1_pointerTransfer], …]" ] }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 94, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexroute_intstop_intUnnamed: 0stop_idstop_id2distanceTransfer_time_secstop_namestop_name2stop_int_2
09311102984536.085732058503016:0:20.101546121.0Zürich Flughafen, BahnhofZürich Flughafen1218.0
09311102984558.08573205:0:A8503016:0:20.118159141.0Zürich Flughafen, BahnhofZürich Flughafen1218.0
09311102984580.08573205:0:B8503016:0:20.104861125.0Zürich Flughafen, BahnhofZürich Flughafen1218.0
09311102984624.08573205:0:D8503016:0:20.103327123.0Zürich Flughafen, BahnhofZürich Flughafen1218.0
09311102984646.08573205:0:E8503016:0:20.101546121.0Zürich Flughafen, BahnhofZürich Flughafen1218.0
\n", "
" ], "text/plain": [ " index route_int stop_int Unnamed: 0 stop_id stop_id2 distance \\\n", "0 93111 0 298 4536.0 8573205 8503016:0:2 0.101546 \n", "0 93111 0 298 4558.0 8573205:0:A 8503016:0:2 0.118159 \n", "0 93111 0 298 4580.0 8573205:0:B 8503016:0:2 0.104861 \n", "0 93111 0 298 4624.0 8573205:0:D 8503016:0:2 0.103327 \n", "0 93111 0 298 4646.0 8573205:0:E 8503016:0:2 0.101546 \n", "\n", " Transfer_time_sec stop_name stop_name2 stop_int_2 \n", "0 121.0 Zürich Flughafen, Bahnhof Zürich Flughafen 1218.0 \n", "0 141.0 Zürich Flughafen, Bahnhof Zürich Flughafen 1218.0 \n", "0 125.0 Zürich Flughafen, Bahnhof Zürich Flughafen 1218.0 \n", "0 123.0 Zürich Flughafen, Bahnhof Zürich Flughafen 1218.0 \n", "0 121.0 Zürich Flughafen, Bahnhof Zürich Flughafen 1218.0 " ] }, - "execution_count": 64, + "execution_count": 94, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stops_join = route_stops.join(transfers.set_index(\"stop_int\"), how=\"left\", on=\"stop_int\").drop_duplicates()\n", "stops_join.head(5)" ] }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 95, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1407" ] }, - "execution_count": 65, + "execution_count": 95, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stops_join.stop_int.nunique()" ] }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 96, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
n_Routesn_Transfers
stop_int
01816
1112
2234
366
460
\n", "
" ], "text/plain": [ " n_Routes n_Transfers\n", "stop_int \n", "0 18 16\n", "1 11 2\n", "2 23 4\n", "3 6 6\n", "4 6 0" ] }, - "execution_count": 66, + "execution_count": 96, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct_route_transfers = stops_join.sort_values(\"stop_int\").groupby([\"stop_int\"]).nunique().rename(columns={\"route_int\": \"n_Routes\", \"stop_int_2\": \"n_Transfers\"})\n", "distinct_route_transfers = distinct_route_transfers[[\"n_Routes\", \"n_Transfers\"]].sort_index()\n", "distinct_route_transfers.head(5)" ] }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 97, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
n_Routesn_Transferspointer_stop_routespointer_transfers
stop_int
0181600
11121816
22342918
3665222
4605828
\n", "
" ], "text/plain": [ " n_Routes n_Transfers pointer_stop_routes pointer_transfers\n", "stop_int \n", "0 18 16 0 0\n", "1 11 2 18 16\n", "2 23 4 29 18\n", "3 6 6 52 22\n", "4 6 0 58 28" ] }, - "execution_count": 67, + "execution_count": 97, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct_route_transfers['pointer_stop_routes'] = distinct_route_transfers.n_Routes.cumsum().shift(1, fill_value=0)\n", "distinct_route_transfers['pointer_transfers'] = distinct_route_transfers.n_Transfers.cumsum().shift(1, fill_value=0)\n", "distinct_route_transfers.head(5)" ] }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 98, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
n_Routesn_Transferspointer_stop_routespointer_transferspointer_stop_routes_shiftpointer_transfers_shift
stop_int
01816001816
111218162918
223429185222
36652225828
46058286428
\n", "
" ], "text/plain": [ " n_Routes n_Transfers pointer_stop_routes pointer_transfers \\\n", "stop_int \n", "0 18 16 0 0 \n", "1 11 2 18 16 \n", "2 23 4 29 18 \n", "3 6 6 52 22 \n", "4 6 0 58 28 \n", "\n", " pointer_stop_routes_shift pointer_transfers_shift \n", "stop_int \n", "0 18 16 \n", "1 29 18 \n", "2 52 22 \n", "3 58 28 \n", "4 64 28 " ] }, - "execution_count": 68, + "execution_count": 98, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct_route_transfers[\"pointer_stop_routes_shift\"] = distinct_route_transfers['pointer_stop_routes'].shift(-1, fill_value=0)\n", "distinct_route_transfers[\"pointer_transfers_shift\"] = distinct_route_transfers['pointer_transfers'].shift(-1, fill_value=0)\n", "distinct_route_transfers.head(5)" ] }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 99, "metadata": {}, "outputs": [], "source": [ "distinct_route_transfers['pointer_stop_routes'] = np.where((distinct_route_transfers[\"pointer_stop_routes\"] == distinct_route_transfers[\"pointer_stop_routes_shift\"]), None, distinct_route_transfers['pointer_stop_routes'])\n", "distinct_route_transfers['pointer_transfers'] = np.where((distinct_route_transfers[\"pointer_transfers\"] == distinct_route_transfers[\"pointer_transfers_shift\"]), None, distinct_route_transfers['pointer_transfers'])\n" ] }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 100, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "n_Routes False\n", "n_Transfers False\n", "pointer_stop_routes False\n", "pointer_transfers True\n", "pointer_stop_routes_shift False\n", "pointer_transfers_shift False\n", "dtype: bool" ] }, - "execution_count": 70, + "execution_count": 100, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct_route_transfers.isna().any()" ] }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 101, "metadata": {}, "outputs": [], "source": [ "stops_df = 
distinct_route_transfers[['pointer_stop_routes', 'pointer_transfers']]" ] }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 102, "metadata": {}, "outputs": [], "source": [ "with open('../data/stops_df.pkl','wb') as f: pickle.dump(stops_df, f)" ] }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 103, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0, 0],\n", " [18, 16],\n", " [29, 18],\n", " ...,\n", " [15329, 6322],\n", " [15334, 6329],\n", " [15339, 6334]], dtype=object)" ] }, - "execution_count": 73, + "execution_count": 103, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stops_array = stops_df.to_numpy()\n", "stops_array" ] }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 104, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1407" ] }, - "execution_count": 74, + "execution_count": 104, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.size(stops_array, 0)" ] }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 105, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1407, 2)" ] }, - "execution_count": 75, + "execution_count": 105, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stops_array.shape" ] }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 106, "metadata": {}, "outputs": [], "source": [ "with open('../data/stops_array_cyril.pkl','wb') as f: pickle.dump(stops_array, f)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "StopRoutes: [stop0_route1, stop0_route3, stop1_route1, stop2_route1, stop1_route4, …]" ] }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 107, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexroute_intstop_intstop_id
0871638208503088:0:22
19554312908503088:0:21
212933218708503088:0:21
37384821108503088:0:22
414728525108503088:0:21
\n", "
" ], "text/plain": [ " index route_int stop_int stop_id\n", "0 87163 82 0 8503088:0:22\n", "1 95543 129 0 8503088:0:21\n", "2 129332 187 0 8503088:0:21\n", "3 73848 211 0 8503088:0:22\n", "4 147285 251 0 8503088:0:21" ] }, - "execution_count": 77, + "execution_count": 107, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_routes = stop_times_ordered[[\"route_int\", \"stop_int\", \"stop_id\"]].drop_duplicates().sort_values([\"stop_int\", \"route_int\"])\n", "stop_routes = stop_routes.reset_index()\n", "stop_routes.head(5)" ] }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 108, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(15486, 4)" ] }, - "execution_count": 78, + "execution_count": 108, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_routes.shape" ] }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 109, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "249" ] }, - "execution_count": 79, + "execution_count": 109, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_curated.route_id.nunique()" ] }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 110, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1461" ] }, - "execution_count": 80, + "execution_count": 110, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_routes.route_int.nunique()" ] }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 111, "metadata": {}, "outputs": [], "source": [ "with open('../data/stop_routes_df_cyril.pkl','wb') as f: pickle.dump(stop_routes, f)" ] }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 112, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 82, 129, 187, ..., 855, 977, 1087])" ] }, - "execution_count": 82, + "execution_count": 112, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_routes_array = 
stop_routes[\"route_int\"].to_numpy()\n", "stop_routes_array" ] }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 113, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "15486" ] }, - "execution_count": 83, + "execution_count": 113, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.size(stop_routes_array, 0)" ] }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 114, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(15486,)" ] }, - "execution_count": 84, + "execution_count": 114, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_routes_array.shape" ] }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 115, "metadata": {}, "outputs": [], "source": [ "with open('../data/stop_routes_array_cyril.pkl','wb') as f: pickle.dump(stop_routes_array, f)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Transfer: [[[stop0_nameTargetStop1, transferTime1], [stop0_nameTargetStop2, transferTime2],….], [stop1_nameTargetStop1, transferTime1], [stop1_nameTargetStop2, transferTime2],….],…]" ] }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 116, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "9434" ] }, - "execution_count": 86, + "execution_count": 116, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transfers.stop_id.count()" ] }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 134, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stop_intstop_int_2Transfer_time_sec
0008
1051564
20274441
30375594
40462489
\n", "
" ], "text/plain": [ " stop_int stop_int_2 Transfer_time_sec\n", "0 0 0 8\n", "1 0 51 564\n", "2 0 274 441\n", "3 0 375 594\n", "4 0 462 489" ] }, - "execution_count": 89, + "execution_count": 134, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transfer_pandas = transfers[[\"stop_int\",\"stop_int_2\", \"Transfer_time_sec\"]].sort_values([\"stop_int\", \"stop_int_2\"]).drop_duplicates([\"stop_int\", \"stop_int_2\"])\n", "transfer_pandas = transfer_pandas.reset_index(drop=True)\n", "transfer_pandas.head()" ] }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 135, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1342" ] }, - "execution_count": 90, + "execution_count": 135, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transfer_pandas.stop_int_2.nunique()" ] }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 136, "metadata": {}, "outputs": [], "source": [ "with open('../data/transfer_df_cyril.pkl','wb') as f: pickle.dump(transfers.sort_values(\"stop_id\"), f)" ] }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 137, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0, 8],\n", " [ 51, 564],\n", " [ 274, 441],\n", " ...,\n", " [1120, 345],\n", " [1266, 561],\n", " [1406, 8]])" ] }, - "execution_count": 92, + "execution_count": 137, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transfer_array = transfer_pandas[[\"stop_int_2\", \"Transfer_time_sec\"]].to_numpy()\n", "transfer_array" ] }, { "cell_type": "code", - "execution_count": 93, + "execution_count": 138, "metadata": {}, "outputs": [], "source": [ "with open('../data/transfer_array_cyril.pkl','wb') as f: pickle.dump(transfer_array, f)" ] }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 139, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "6342" + "(6342, 2)" ] }, - "execution_count": 94, + "execution_count": 139, "metadata": {}, 
"output_type": "execute_result" } ], "source": [ - "np.size(transfer_array, 0)" + "transfer_array.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Check if indexes in stops is correct" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We see first the pointers" ] }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 123, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pointer_stop_routespointer_transfers
stop_int
000
11816
22918
35222
458None
\n", "
" ], "text/plain": [ " pointer_stop_routes pointer_transfers\n", "stop_int \n", "0 0 0\n", "1 18 16\n", "2 29 18\n", "3 52 22\n", "4 58 None" ] }, - "execution_count": 95, + "execution_count": 123, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stops_df.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We see that at the index 16 there should be a new stop. we check and it is false" ] }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 124, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stop_intstop_int_2Transfer_time_sec
1501289460
161814267
1711350569
18238346
1921062413
\n", "
" ], "text/plain": [ " stop_int stop_int_2 Transfer_time_sec\n", "15 0 1289 460\n", "16 1 814 267\n", "17 1 1350 569\n", "18 2 38 346\n", "19 2 1062 413" ] }, - "execution_count": 97, + "execution_count": 124, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transfer_pandas.loc[15:20].head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We see that at index 18 we should have a new stop. we check and it true" ] }, { "cell_type": "code", - "execution_count": 99, + "execution_count": 125, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexroute_intstop_intstop_id
15240179103908503088:0:21
16150919107808503088:0:21
1726670131608503088:0:21
18938571818502508
1923650811418502508
\n", "
" ], "text/plain": [ " index route_int stop_int stop_id\n", "15 240179 1039 0 8503088:0:21\n", "16 150919 1078 0 8503088:0:21\n", "17 26670 1316 0 8503088:0:21\n", "18 93857 18 1 8502508\n", "19 236508 114 1 8502508" ] }, - "execution_count": 99, + "execution_count": 125, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_routes.loc[15:20].head(5)" ] }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 126, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexroute_intstop_intstop_id
70241089033826178503006:0:5
82041089013827248503011:0:2
1259910890038211388503010:0:2
1294010890238211768503000:0:33
1359010890438212188503016:0:3
\n", "
" ], "text/plain": [ " index route_int stop_int stop_id\n", "7024 108903 382 617 8503006:0:5\n", "8204 108901 382 724 8503011:0:2\n", "12599 108900 382 1138 8503010:0:2\n", "12940 108902 382 1176 8503000:0:33\n", "13590 108904 382 1218 8503016:0:3" ] }, - "execution_count": 106, + "execution_count": 126, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_routes.loc[stop_routes['route_int'] == 382]" ] }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 127, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexroute_intstop_int
40241089003821138
4025108901382724
40261089023821176
4027108903382617
40281089043821218
\n", "
" ], "text/plain": [ " index route_int stop_int\n", "4024 108900 382 1138\n", "4025 108901 382 724\n", "4026 108902 382 1176\n", "4027 108903 382 617\n", "4028 108904 382 1218" ] }, - "execution_count": 108, + "execution_count": 127, "metadata": {}, "output_type": "execute_result" } ], "source": [ "route_stops.loc[route_stops['route_int'] == 382]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "read files as pickles" ] }, { "cell_type": "code", - "execution_count": 102, + "execution_count": 128, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '../data/stop_times_array.pkl'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'../data/stop_times_array.pkl'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'rb'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0marrayname1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../data/stop_times_array.pkl'" + ] + } + ], "source": [ - "with open('../data/stop_times_array.pkl','rb') as f: arrayname1 = pickle.load(f)" + "with open('../data/stop_times_array_cyril.pkl','rb') as f: arrayname1 = pickle.load(f)" ] }, { "cell_type": "code", "execution_count": 103, "metadata": {}, "outputs": [ { "ename": "UnpicklingError", "evalue": "invalid load key, 'v'.", "output_type": "error", 
"traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mUnpicklingError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'../data/routes_array.pkl'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'rb'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0marrayname2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mUnpicklingError\u001b[0m: invalid load key, 'v'." ] } ], "source": [ - "with open('../data/routes_array.pkl','rb') as f: arrayname2 = pickle.load(f)" + "with open('../data/routes_array_cyril.pkl','rb') as f: arrayname2 = pickle.load(f)" ] }, { "cell_type": "code", "execution_count": 104, "metadata": {}, "outputs": [ { "ename": "UnpicklingError", "evalue": "invalid load key, 'v'.", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mUnpicklingError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'../data/route_stops_array.pkl'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'rb'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0marrayname3\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 
"\u001b[0;31mUnpicklingError\u001b[0m: invalid load key, 'v'." ] } ], "source": [ - "with open('../data/route_stops_array.pkl','rb') as f: arrayname3 = pickle.load(f)" + "with open('../data/route_stops_array_cyril.pkl','rb') as f: arrayname3 = pickle.load(f)" ] }, { "cell_type": "code", "execution_count": 105, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([['2020-05-22T07:00:00.000000000', '2020-05-22T07:01:00.000000000'],\n", " ['2020-05-22T07:02:00.000000000', 'NaT'],\n", " ['2020-05-22T07:00:00.000000000', '2020-05-22T07:00:00.000000000'],\n", " ...,\n", " ['2020-05-22T07:35:00.000000000', '2020-05-22T07:35:00.000000000'],\n", " ['2020-05-22T07:36:00.000000000', '2020-05-22T07:36:00.000000000'],\n", " ['2020-05-22T07:37:00.000000000', 'NaT']],\n", " dtype='datetime64[ns]')" ] }, "execution_count": 105, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arrayname1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "arrayname2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "arrayname3" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 } diff --git a/notebooks/From_Stop_id_to_stop_int.ipynb b/notebooks/From_Stop_id_to_stop_int.ipynb new file mode 100644 index 0000000..1591163 --- /dev/null +++ b/notebooks/From_Stop_id_to_stop_int.ipynb @@ -0,0 +1,386 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + 
"# Functions to convert between stop_id and stop_int\n", + "Notebook to be deleted once these functions are incorporated into MC RAPTOR." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import pickle" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Import the stop_times dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "with open('../data/stop_times_df_cyril.pkl','rb') as f: stop_times_df = pickle.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
route_idstop_id_generaltrip_idstop_idarrival_timedeparture_timestop_sequencestop_namestop_latstop_lontrip_headsigntrip_short_namedirection_iddeparture_first_stoproute_intstop_countstop_introute_descdeparture_first_shift_1
9311126-10-j19-185732051672.TA.26-10-j19-1.11.R8573205NaT2020-05-23 07:01:0027Zürich Flughafen, Bahnhof47.4504418.563729Zürich Flughafen, Fracht4096107:01:0002298Tram07:01:00
9311226-10-j19-185885531672.TA.26-10-j19-1.11.R85885532020-05-23 07:02:00NaT28Zürich Flughafen, Fracht47.4524948.572057Zürich Flughafen, Fracht4096107:01:00021295Tram07:00:00
9311326-13-j19-185762402064.TA.26-13-j19-1.24.H8576240NaT2020-05-23 07:00:005Zürich, Meierhofplatz47.4020108.499374Zürich, Albisgütli1831007:00:001261222Tram07:00:00
9311426-13-j19-185913532064.TA.26-13-j19-1.24.H85913532020-05-23 07:01:002020-05-23 07:01:006Zürich, Schwert47.3997308.504611Zürich, Albisgütli1831007:00:00126816Tram07:00:00
9311526-13-j19-185910392064.TA.26-13-j19-1.24.H85910392020-05-23 07:02:002020-05-23 07:02:007Zürich, Alte Trotte47.3977668.507252Zürich, Albisgütli1831007:00:00126778Tram07:00:00
\n", + "
" + ], + "text/plain": [ + " route_id stop_id_general trip_id stop_id \\\n", + "93111 26-10-j19-1 8573205 1672.TA.26-10-j19-1.11.R 8573205 \n", + "93112 26-10-j19-1 8588553 1672.TA.26-10-j19-1.11.R 8588553 \n", + "93113 26-13-j19-1 8576240 2064.TA.26-13-j19-1.24.H 8576240 \n", + "93114 26-13-j19-1 8591353 2064.TA.26-13-j19-1.24.H 8591353 \n", + "93115 26-13-j19-1 8591039 2064.TA.26-13-j19-1.24.H 8591039 \n", + "\n", + " arrival_time departure_time stop_sequence \\\n", + "93111 NaT 2020-05-23 07:01:00 27 \n", + "93112 2020-05-23 07:02:00 NaT 28 \n", + "93113 NaT 2020-05-23 07:00:00 5 \n", + "93114 2020-05-23 07:01:00 2020-05-23 07:01:00 6 \n", + "93115 2020-05-23 07:02:00 2020-05-23 07:02:00 7 \n", + "\n", + " stop_name stop_lat stop_lon \\\n", + "93111 Zürich Flughafen, Bahnhof 47.450441 8.563729 \n", + "93112 Zürich Flughafen, Fracht 47.452494 8.572057 \n", + "93113 Zürich, Meierhofplatz 47.402010 8.499374 \n", + "93114 Zürich, Schwert 47.399730 8.504611 \n", + "93115 Zürich, Alte Trotte 47.397766 8.507252 \n", + "\n", + " trip_headsign trip_short_name direction_id \\\n", + "93111 Zürich Flughafen, Fracht 4096 1 \n", + "93112 Zürich Flughafen, Fracht 4096 1 \n", + "93113 Zürich, Albisgütli 1831 0 \n", + "93114 Zürich, Albisgütli 1831 0 \n", + "93115 Zürich, Albisgütli 1831 0 \n", + "\n", + " departure_first_stop route_int stop_count stop_int route_desc \\\n", + "93111 07:01:00 0 2 298 Tram \n", + "93112 07:01:00 0 2 1295 Tram \n", + "93113 07:00:00 1 26 1222 Tram \n", + "93114 07:00:00 1 26 816 Tram \n", + "93115 07:00:00 1 26 778 Tram \n", + "\n", + " departure_first_shift_1 \n", + "93111 07:01:00 \n", + "93112 07:00:00 \n", + "93113 07:00:00 \n", + "93114 07:00:00 \n", + "93115 07:00:00 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stop_times_df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1222" + ] + 
}, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p = 8576240\n", + "stop_times_df[stop_times_df.stop_id == str(p)].iloc[0].stop_int" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1222" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stop_int = stop_times_df[stop_times_df.stop_id == str(8576240)].iloc[0].stop_int\n", + "stop_int" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "def get_stop_int_from_id(p):\n", + " stop_int = stop_times_df[stop_times_df.stop_id == str(p)].iloc[0].stop_int\n", + " return stop_int" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "def get_stop_id_from_int(p):\n", + " stop_id = stop_times_df[stop_times_df.stop_int == p].iloc[0].stop_id\n", + " return stop_id" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1222" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_stop_int_from_id(p)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'8576240'" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_stop_id_from_int(1222)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + 
"nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/data_cyril.ipynb b/notebooks/data_cyril.ipynb index a734be7..a6b574c 100644 --- a/notebooks/data_cyril.ipynb +++ b/notebooks/data_cyril.ipynb @@ -1,4236 +1,5004 @@ { "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "Current session configs: {'conf': {'spark.app.name': 'lgptguys_final'}, 'kind': 'pyspark'}
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", - "
IDYARN Application IDKindStateSpark UIDriver logCurrent session?
7676application_1589299642358_2172pysparkidleLinkLink
7684application_1589299642358_2180pysparkidleLinkLink
7686application_1589299642358_2182pysparkidleLinkLink
7691application_1589299642358_2187pysparkidleLinkLink
7694application_1589299642358_2190pysparkidleLinkLink
7699application_1589299642358_2195pysparkbusyLinkLink
7700application_1589299642358_2196pysparkidleLinkLink
7701application_1589299642358_2197pysparkidleLinkLink
7704application_1589299642358_2200pysparkidleLinkLink
7705application_1589299642358_2201pysparkidleLinkLink
7711application_1589299642358_2207pysparkidleLinkLink
7718application_1589299642358_2214pysparkidleLinkLink
7719application_1589299642358_2215pysparkidleLinkLink
7720application_1589299642358_2216pysparkidleLinkLink
7721application_1589299642358_2217pysparkidleLinkLink
7722application_1589299642358_2218pysparkbusyLinkLink
7724application_1589299642358_2220pysparkidleLinkLink
7725application_1589299642358_2221pysparkbusyLinkLink
7727application_1589299642358_2223pysparkidleLinkLink
7728application_1589299642358_2224pysparkbusyLinkLink
" + "IDYARN Application IDKindStateSpark UIDriver logCurrent session?7933application_1589299642358_2451pysparkidleLinkLink7946application_1589299642358_2464pysparkidleLinkLink7951application_1589299642358_2469pysparkidleLinkLink7958application_1589299642358_2476pysparkidleLinkLink7959application_1589299642358_2477pysparkidleLinkLink7962application_1589299642358_2480pysparkidleLinkLink7965application_1589299642358_2485pysparkidleLinkLink7968application_1589299642358_2488pysparkidleLinkLink7971application_1589299642358_2491pysparkidleLinkLink7972application_1589299642358_2492pysparkidleLinkLink7973application_1589299642358_2493pysparkidleLinkLink7975application_1589299642358_2495pysparkidleLinkLink7977application_1589299642358_2497pysparkidleLinkLink7978application_1589299642358_2498pysparkbusyLinkLink7980application_1589299642358_2501pysparkidleLinkLink7981application_1589299642358_2502pysparkidleLinkLink7982application_1589299642358_2503pysparkbusyLinkLink7983application_1589299642358_2504pysparkidleLinkLink7984application_1589299642358_2505pysparkbusyLinkLink" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%%configure\n", "{\"conf\": {\n", " \"spark.app.name\": \"lgptguys_final\"\n", "}}" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Starting Spark application\n" ] }, { "data": { "text/html": [ "\n", - "
IDYARN Application IDKindStateSpark UIDriver logCurrent session?
7729application_1589299642358_2225pysparkidleLinkLink
" + "IDYARN Application IDKindStateSpark UIDriver logCurrent session?7985application_1589299642358_2506pysparkidleLinkLink✔" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "SparkSession available as 'spark'.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "An error was encountered:\n", "unknown magic command '%spark'\n", "UnknownMagic: unknown magic command '%spark'\n", "\n" ] } ], "source": [ "# Initialization\n", "%%spark" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Below, we pre-process the data to generate the files required to run RAPTOR.\n", "\n", "The main reasoning behind this way of cleaning the data is the following:\n", "**Given a cleaned stop_times.txt file, it is possible to reconstruct everything required to run RAPTOR** to the exception of footpaths. In particular, routes are reconstructed from the cleaned stop_times.txt and not from routes.txt.\n", "\n", "We use the following strategy:\n", "\n", "- 1) Filter out stops out of 15km of ZH HB\n", " - Done on stops\n", "- 2) Merge stops that share a parent stop to the first 7 characters of the stop name\n", " - Done on stops (add a general_stop column)\n", " - Used as an input for stopTimes (add a general_stop column). 
At this point, stop_times contains only stops within 15km of ZH HB\n", " \n", "- 3) keep only services that run each day of the business week:\n", " - Obtain the list of services from calendar\n", " - Serves as an input to filter trips\n", " - Serves as an input to filter stop_times\n", "- 4) keep only stop times between 7am and 7pm\n", " - Do that on stop_times\n", " \n", "- 5) Find unique trips, based on the stops sequence and the departure times sequence\n", " - sort by trip, arrival_time (which is the same as stop_sequence)\n", " - build a (sorted) all_stops column for each trip\n", " - build a (sorted) all_departure_times column for each trip\n", " - keep only one trip that has the same all_stops and all_departure_times column\n", "- 6) building routes based on unique trips\n", " - order unique_trips by stop_sequence, earliest departure time\n", " - each window with the same stop_sequence gets a unique routeID\n", " \n", "- 7) giving unique integer indices to stops\n", " - get unique general (parent) stop names from stop times\n", " - assign an index with zipWithIndex\n", " - stop_times -> inner join on result\n", "- 8) indicating transport type from the route\n", " - inner join stop_times with routes.txt on route_id" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from geopy.distance import great_circle\n", "from pyspark.sql.functions import *\n", "import numpy as np\n", "import pandas as pd\n", "from geopy.distance import great_circle\n", "from pyspark.sql.types import DoubleType\n", "from pyspark.sql.types import DateType" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 1) Filtering out stops not within 
15km of ZH HB" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-------+--------------------+----------------+-----------------+-------------+--------------+\n", "|stop_id| stop_name| stop_lat| stop_lon|location_type|parent_station|\n", "+-------+--------------------+----------------+-----------------+-------------+--------------+\n", "|1322000| Altoggio|46.1672513851495| 8.345807131427| null| null|\n", "|1322001| Antronapiana| 46.060121674738| 8.11361957990831| null| null|\n", "|1322002| Anzola|45.9898698225697| 8.34571729989858| null| null|\n", "|1322003| Baceno|46.2614983591677| 8.31925293162473| null| null|\n", "|1322004|Beura Cardezza, c...|46.0790618438814| 8.29927439970313| null| null|\n", "|1322005|Bognanco, T. 
Vill...|46.1222963432243| 8.21077237789936| null| null|\n", "|1322006| Boschetto|46.0656504576122| 8.26113193273411| null| null|\n", "|1322007| Cadarese|46.2978807772998| 8.3626325767009| null| null|\n", "|1322010| Campioli|45.9695691829797| 8.04585965801774| null| null|\n", "|1322011| Cascate del Toce|46.4091810825782| 8.4117524564434| null| null|\n", "|1322012| Castiglione|46.0205875326422| 8.2148866619012| null| null|\n", "|1322013| Ceppo Morelli|45.9710364221151| 8.06992552448265| null| null|\n", "|1322014|Chiesa (Val Forma...|46.3530849443472| 8.42787721579558| null| null|\n", "|1322015| Cosasca di Trontano|46.0967496675661| 8.31182386422403| null| null|\n", "|1322016| Cresti|46.0664046229574| 8.2328978833503| null| null|\n", "|1322017| Crevoladossola|46.1562758593614| 8.30343359946918| null| null|\n", "|1322018| Crodo, Bagni|46.2141837457637| 8.32131905677849| null| null|\n", "|1322019| Crodo, paese| 46.224016613202| 8.3235648449891| null| null|\n", "|1322021| Croppo di Trontano|46.1103590121829| 8.31194064521098| null| null|\n", "|1322022| Crusinallo|45.6945937446539|0.595870494345107| null| null|\n", "+-------+--------------------+----------------+-----------------+-------------+--------------+\n", "only showing top 20 rows" ] } ], "source": [ "stops = spark.read.csv(\"/data/sbb/timetables/csv/stops/2019/05/14/stops.txt\", header=True, sep = \",\")\n", "stops.show()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "30631" ] } ], "source": [ "stops.count()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { 
"application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#defining udf function\n", "@udf(\"float\")\n", "def great_circle_udf(x, y):\n", " return great_circle(x, y).kilometers" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-----------+--------------------+----------------+----------------+-------------+--------------+\n", "| stop_id| stop_name| stop_lat| stop_lon|location_type|parent_station|\n", "+-----------+--------------------+----------------+----------------+-------------+--------------+\n", "| 8500926|Oetwil a.d.L., Sc...|47.4236270123012| 8.4031825286317| null| null|\n", "| 8502186|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| null| 8502186P|\n", "|8502186:0:1|Dietikon Stoffelbach|47.3934666445388|8.39894248049007| null| 8502186P|\n", "|8502186:0:2|Dietikon Stoffelbach|47.3935274568464|8.39894248049007| null| 8502186P|\n", "| 8502186P|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| 1| null|\n", "| 8502187|Rudolfstetten Hof...|47.3646945560768|8.37709545277724| null| 8502187P|\n", "|8502187:0:1|Rudolfstetten Hof...|47.3647554015789|8.37709545277724| null| 8502187P|\n", "|8502187:0:2|Rudolfstetten Hof...|47.3648162470108|8.37709545277724| null| 8502187P|\n", "| 8502187P|Rudolfstetten Hof...|47.3646945560768|8.37709545277724| 1| null|\n", "| 8502188| Zufikon Hammergut|47.3558347019549|8.35472740219955| null| 
8502188P|\n", "|8502188:0:1| Zufikon Hammergut|47.3558955576756|8.35472740219955| null| 8502188P|\n", "|8502188:0:2| Zufikon Hammergut|47.3559564133261|8.35472740219955| null| 8502188P|\n", "| 8502188P| Zufikon Hammergut|47.3558347019549|8.35472740219955| 1| null|\n", "| 8502208| Horgen Oberdorf|47.2587475534877|8.58979854578067| null| 8502208P|\n", "|8502208:0:2| Horgen Oberdorf|47.2589304560815|8.58979854578067| null| 8502208P|\n", "|8502208:0:3| Horgen Oberdorf|47.2588085210892|8.58979854578067| null| 8502208P|\n", "|8502208:0:4| Horgen Oberdorf|47.2588694886204|8.58979854578067| null| 8502208P|\n", "| 8502208P| Horgen Oberdorf|47.2587475534877|8.58979854578067| 1| null|\n", "| 8502209| Oberrieden Dorf|47.2767238569466| 8.577635356832| null| 8502209P|\n", "|8502209:0:1| Oberrieden Dorf|47.2768457506749| 8.577635356832| null| 8502209P|\n", "+-----------+--------------------+----------------+----------------+-------------+--------------+\n", "only showing top 20 rows" ] } ], "source": [ "# Zurich HB coordinates\n", "zurich_geo = (47.378177, 8.540192)\n", "\n", "#transforming Zurich HB coordinates in a spark dataframe column object\n", "zurich_geo_col = struct(lit(zurich_geo[0]), lit(zurich_geo[1]))\n", "\n", "#applying filter function based on distance\n", "stops_15km = stops.filter(great_circle_udf(zurich_geo_col, struct(stops.stop_lat, stops.stop_lon)) < 15)\n", "stops_15km.show()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "1883" ] } ], "source": [ "stops_15km.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 2) Merging stops that share a parent stop" ] }, 
{ "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+------------+--------------------+----------------+----------------+-------------+--------------+\n", "| stop_id| stop_name| stop_lat| stop_lon|location_type|parent_station|\n", "+------------+--------------------+----------------+----------------+-------------+--------------+\n", "| 8502186|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| null| 8502186P|\n", "| 8502186:0:1|Dietikon Stoffelbach|47.3934666445388|8.39894248049007| null| 8502186P|\n", "| 8502186:0:2|Dietikon Stoffelbach|47.3935274568464|8.39894248049007| null| 8502186P|\n", "| 8502187|Rudolfstetten Hof...|47.3646945560768|8.37709545277724| null| 8502187P|\n", "| 8502187:0:1|Rudolfstetten Hof...|47.3647554015789|8.37709545277724| null| 8502187P|\n", "| 8502187:0:2|Rudolfstetten Hof...|47.3648162470108|8.37709545277724| null| 8502187P|\n", "| 8502188| Zufikon Hammergut|47.3558347019549|8.35472740219955| null| 8502188P|\n", "| 8502188:0:1| Zufikon Hammergut|47.3558955576756|8.35472740219955| null| 8502188P|\n", "| 8502188:0:2| Zufikon Hammergut|47.3559564133261|8.35472740219955| null| 8502188P|\n", "| 8502208| Horgen Oberdorf|47.2587475534877|8.58979854578067| null| 8502208P|\n", "| 8502208:0:2| Horgen Oberdorf|47.2589304560815|8.58979854578067| null| 8502208P|\n", "| 8502208:0:3| Horgen Oberdorf|47.2588085210892|8.58979854578067| null| 8502208P|\n", "| 8502208:0:4| Horgen Oberdorf|47.2588694886204|8.58979854578067| null| 8502208P|\n", "| 8502209| Oberrieden Dorf|47.2767238569466| 8.577635356832| null| 8502209P|\n", "| 8502209:0:1| Oberrieden Dorf|47.2768457506749| 
8.577635356832| null| 8502209P|\n", "| 8502209:0:2| Oberrieden Dorf|47.2767848038458| 8.577635356832| null| 8502209P|\n", "| 8502220| Urdorf|47.3908820565997|8.43471339510869| null| 8502220P|\n", "| 8502220:0:1| Urdorf|47.3909428718897|8.43471339510869| null| 8502220P|\n", "| 8502220:0:2| Urdorf|47.3910036871096|8.43471339510869| null| 8502220P|\n", "| 8502221| Birmensdorf ZH|47.3574351840587|8.43754308825406| null| 8502221P|\n", "| 8502221:0:1| Birmensdorf ZH|47.3575568917382|8.43754308825406| null| 8502221P|\n", "| 8502221:0:2| Birmensdorf ZH|47.3574960379336|8.43754308825406| null| 8502221P|\n", "| 8502222| Bonstetten-Wettswil|47.3258973534906|8.46817563944679| null| 8502222P|\n", "| 8502222:0:2| Bonstetten-Wettswil| 47.325958243729|8.46817563944679| null| 8502222P|\n", "| 8502222:0:3| Bonstetten-Wettswil|47.3260191338971|8.46817563944679| null| 8502222P|\n", "| 8502223| Hedingen|47.2987820476816|8.44595131931459| null| 8502223P|\n", "| 8502223:0:1| Hedingen|47.2988429691695|8.44595131931459| null| 8502223P|\n", "| 8502223:0:2| Hedingen|47.2989038905872|8.44595131931459| null| 8502223P|\n", "| 8502224| Affoltern am Albis|47.2760656259617|8.44658014001356| null| 8502224P|\n", "| 8502224:0:1| Affoltern am Albis|47.2761875212063|8.44658014001356| null| 8502224P|\n", "| 8502224:0:2| Affoltern am Albis|47.2762484687233|8.44658014001356| null| 8502224P|\n", "| 8502224:0:3| Affoltern am Albis|47.2761265736191|8.44658014001356| null| 8502224P|\n", "| 8502229:0:1| Urdorf Weihermatt|47.3810351357388|8.43032961652157| null| 8502229P|\n", "| 8502229:0:2| Urdorf Weihermatt|47.3810959623905|8.43032961652157| null| 8502229P|\n", "| 8502273:0:1| Bremgarten|47.3519945640447| 8.3474779978557| null| 8502273P|\n", "| 8502273:0:2| Bremgarten|47.3519337038252| 8.3474779978557| null| 8502273P|\n", "| 8502276:0:1| Berikon-Widen|47.3622485087742|8.36679177646695| null| 8502276P|\n", "| 8502276:0:2| Berikon-Widen|47.3623093570976|8.36679177646695| null| 8502276P|\n", "| 
8502758:0:A|Hausen am Albis, ...|47.2448085174147| 8.5329801040522| null| 8502758P|\n", "| 8502758:0:B|Hausen am Albis, ...|47.2448695010648| 8.5329801040522| null| 8502758P|\n", "| 8502758:0:C|Hausen am Albis, ...|47.2449304846447| 8.5329801040522| null| 8502758P|\n", "| 8503000| Zürich HB|47.3781762039461|8.54019357578468| null| 8503000P|\n", "|8503000:0:10| Zürich HB|47.3794536181612|8.54019357578468| null| 8503000P|\n", "|8503000:0:11| Zürich HB|47.3795144466376|8.54019357578468| null| 8503000P|\n", "|8503000:0:12| Zürich HB|47.3786020121232|8.54019357578468| null| 8503000P|\n", "|8503000:0:13| Zürich HB|47.3785411825942|8.54019357578468| null| 8503000P|\n", "|8503000:0:14| Zürich HB|47.3783586935859|8.54019357578468| null| 8503000P|\n", "|8503000:0:15| Zürich HB|47.3784803529949|8.54019357578468| null| 8503000P|\n", "|8503000:0:16| Zürich HB|47.3784195233255|8.54019357578468| null| 8503000P|\n", "|8503000:0:17| Zürich HB| 47.379271132311|8.54019357578468| null| 8503000P|\n", "+------------+--------------------+----------------+----------------+-------------+--------------+\n", "only showing top 50 rows" ] } ], "source": [ "stops_15km.filter(stops_15km.parent_station.isNotNull()).show(50)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-----------+--------------------+----------------+----------------+-------------+--------------+\n", "| stop_id| stop_name| stop_lat| stop_lon|location_type|parent_station|\n", "+-----------+--------------------+----------------+----------------+-------------+--------------+\n", "| 8500926|Oetwil a.d.L., Sc...|47.4236270123012| 8.4031825286317| 
null| null|\n", "| 8502186P|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| 1| null|\n", "| 8502187P|Rudolfstetten Hof...|47.3646945560768|8.37709545277724| 1| null|\n", "| 8502188P| Zufikon Hammergut|47.3558347019549|8.35472740219955| 1| null|\n", "| 8502208P| Horgen Oberdorf|47.2587475534877|8.58979854578067| 1| null|\n", "| 8502209P| Oberrieden Dorf|47.2767238569466| 8.577635356832| 1| null|\n", "| 8502220P| Urdorf|47.3908820565997|8.43471339510869| 1| null|\n", "| 8502221P| Birmensdorf ZH|47.3574351840587|8.43754308825406| 1| null|\n", "| 8502222P| Bonstetten-Wettswil|47.3258973534906|8.46817563944679| 1| null|\n", "| 8502223P| Hedingen|47.2987820476816|8.44595131931459| 1| null|\n", "| 8502224P| Affoltern am Albis|47.2760656259617|8.44658014001356| 1| null|\n", "| 8502229P| Urdorf Weihermatt|47.3809743090169|8.43032961652157| 1| null|\n", "| 8502268| Zufikon Belvédère|47.3575812332404|8.35923694492646| null| null|\n", "|8502268:0:1| Zufikon Belvédère|47.3576420869468|8.35923694492646| null| null|\n", "| 8502270| Bergfrieden|47.3977111751049|8.39908621093555| null| null|\n", "|8502270:0:1| Bergfrieden|47.3977719825142|8.39908621093555| null| null|\n", "| 8502273P| Bremgarten|47.3518728435356| 8.3474779978557| 1| null|\n", "| 8502274| Zufikon|47.3525240449924|8.35470943589386| null| null|\n", "|8502274:0:1| Zufikon|47.3525849045311|8.35470943589386| null| null|\n", "| 8502275| Widen Heinrüti|47.3620598785256|8.35486214949218| null| null|\n", "|8502275:0:1| Widen Heinrüti|47.3621207270666|8.35486214949218| null| null|\n", "| 8502276P| Berikon-Widen|47.3621876603806|8.36679177646695| 1| null|\n", "| 8502277| Rudolfstetten|47.3700243558558|8.38180262486668| null| null|\n", "|8502277:0:1| Rudolfstetten| 47.37008519521|8.38180262486668| null| null|\n", "| 8502278| Reppischhof|47.3847211041004| 8.3963463493186| null| null|\n", "|8502278:0:1| Reppischhof|47.3847819264993| 8.3963463493186| null| null|\n", "| 8502495|Zürich 
Wollishofe...|47.3476976601166|8.53331248070737| null| null|\n", "| 8502508|Spreitenbach, Rai...|47.4154457211288|8.37718528430566| null| null|\n", "| 8502553|Unterlunkhofen, B...|47.3221585583935| 8.380473118246| null| null|\n", "| 8502559|Waldegg, Birmensd...|47.3683025730349|8.46346846735736| null| null|\n", "| 8502560| Berikon, Kirche|47.3510512227562|8.37141810018081| null| null|\n", "| 8502570| Rottenschwil, Hecht|47.3190589331876| 8.372091836644| null| null|\n", "| 8502572|Zürich, Goldbrunn...|47.3702920484894|8.51391785372053| null| null|\n", "| 8502574|Affoltern a. A., ...|47.2784669105587|8.45910265507593| null| null|\n", "| 8502575| Widen, Dorf|47.3675724714769|8.36359377405504| null| null|\n", "| 8502750| Bellikon, Post|47.3895076123306| 8.3433726970067| null| null|\n", "| 8502758P|Hausen am Albis, ...|47.2447475336943| 8.5329801040522| 1| null|\n", "| 8502762|Langnau a.A., Alb...|47.2761509526624|8.52069115096373| null| null|\n", "| 8502763|Hausen am Albis, ...|47.2629785384073|8.51724162027223| null| null|\n", "| 8502764|Hausen am Albis, ...|47.2590158104091| 8.5220386238901| null| null|\n", "| 8502771|Aeugst am Albis, ...|47.2678309520619|8.48534244452871| null| null|\n", "| 8502776|Gattikon, Obstgarten|47.2847255843239| 8.5511170896411| null| null|\n", "| 8502779| Ottenbach, Post|47.2816481400898|8.40452101840523| null| null|\n", "| 8502876|Aesch ZH, Gemeind...|47.3382079319594|8.43870191497073| null| null|\n", "| 8502879| Jonen, Post|47.2961806346557|8.39551091610425| null| null|\n", "| 8502883|Zwillikon, Gemein...|47.2873945890065|8.43218912915996| null| null|\n", "| 8502885|Bonstetten, Dorfp...|47.3150882242354|8.46778038072173| null| null|\n", "| 8502894|Oberwil-Lieli, Ob...|47.3371669407057|8.38639301596917| null| null|\n", "| 8502950|Birmensdorf ZH, Z...|47.3539359682156|8.43717477898752| null| null|\n", "| 8502953|Affoltern a. 
A., ...|47.2812215495339|8.45441344929217| null| null|\n", "+-----------+--------------------+----------------+----------------+-------------+--------------+\n", "only showing top 50 rows" ] } ], "source": [ "stops_15km.filter(stops_15km.parent_station.isNull()).show(50)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It is clear that parent stops were not properly assigned for all stops (e.g Zufikon Belvédère where there is a platform stop, but no parent stop). Thus, we create a new column `stop_id_general` that contains only the 7 first characters of `stop_id`" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-----------+--------------------+----------------+----------------+-------------+--------------+---------------+\n", "| stop_id| stop_name| stop_lat| stop_lon|location_type|parent_station|stop_id_general|\n", "+-----------+--------------------+----------------+----------------+-------------+--------------+---------------+\n", "| 8500926|Oetwil a.d.L., Sc...|47.4236270123012| 8.4031825286317| null| null| 8500926|\n", "| 8502186|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| null| 8502186P| 8502186|\n", "|8502186:0:1|Dietikon Stoffelbach|47.3934666445388|8.39894248049007| null| 8502186P| 8502186|\n", "|8502186:0:2|Dietikon Stoffelbach|47.3935274568464|8.39894248049007| null| 8502186P| 8502186|\n", "| 8502186P|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| 1| null| 8502186|\n", "| 8502187|Rudolfstetten Hof...|47.3646945560768|8.37709545277724| null| 8502187P| 8502187|\n", "|8502187:0:1|Rudolfstetten Hof...|47.3647554015789|8.37709545277724| null| 
8502187P| 8502187|\n", "|8502187:0:2|Rudolfstetten Hof...|47.3648162470108|8.37709545277724| null| 8502187P| 8502187|\n", "| 8502187P|Rudolfstetten Hof...|47.3646945560768|8.37709545277724| 1| null| 8502187|\n", "| 8502188| Zufikon Hammergut|47.3558347019549|8.35472740219955| null| 8502188P| 8502188|\n", "|8502188:0:1| Zufikon Hammergut|47.3558955576756|8.35472740219955| null| 8502188P| 8502188|\n", "|8502188:0:2| Zufikon Hammergut|47.3559564133261|8.35472740219955| null| 8502188P| 8502188|\n", "| 8502188P| Zufikon Hammergut|47.3558347019549|8.35472740219955| 1| null| 8502188|\n", "| 8502208| Horgen Oberdorf|47.2587475534877|8.58979854578067| null| 8502208P| 8502208|\n", "|8502208:0:2| Horgen Oberdorf|47.2589304560815|8.58979854578067| null| 8502208P| 8502208|\n", "|8502208:0:3| Horgen Oberdorf|47.2588085210892|8.58979854578067| null| 8502208P| 8502208|\n", "|8502208:0:4| Horgen Oberdorf|47.2588694886204|8.58979854578067| null| 8502208P| 8502208|\n", "| 8502208P| Horgen Oberdorf|47.2587475534877|8.58979854578067| 1| null| 8502208|\n", "| 8502209| Oberrieden Dorf|47.2767238569466| 8.577635356832| null| 8502209P| 8502209|\n", "|8502209:0:1| Oberrieden Dorf|47.2768457506749| 8.577635356832| null| 8502209P| 8502209|\n", "+-----------+--------------------+----------------+----------------+-------------+--------------+---------------+\n", "only showing top 20 rows" ] } ], "source": [ "stops_15km = stops_15km.withColumn('stop_id_general',col('stop_id').substr(1, 7))\n", "stops_15km.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Next, we filter stop_times with the 15km radius, and add the stop_id_general column" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": 
{}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-----------+---------------+--------------------+----------------+----------------+\n", "| stop_id|stop_id_general| stop_name| stop_lat| stop_lon|\n", "+-----------+---------------+--------------------+----------------+----------------+\n", "| 8500926| 8500926|Oetwil a.d.L., Sc...|47.4236270123012| 8.4031825286317|\n", "| 8502186| 8502186|Dietikon Stoffelbach|47.3934058321612|8.39894248049007|\n", "|8502186:0:1| 8502186|Dietikon Stoffelbach|47.3934666445388|8.39894248049007|\n", "|8502186:0:2| 8502186|Dietikon Stoffelbach|47.3935274568464|8.39894248049007|\n", "| 8502186P| 8502186|Dietikon Stoffelbach|47.3934058321612|8.39894248049007|\n", "| 8502187| 8502187|Rudolfstetten Hof...|47.3646945560768|8.37709545277724|\n", "|8502187:0:1| 8502187|Rudolfstetten Hof...|47.3647554015789|8.37709545277724|\n", "|8502187:0:2| 8502187|Rudolfstetten Hof...|47.3648162470108|8.37709545277724|\n", "| 8502187P| 8502187|Rudolfstetten Hof...|47.3646945560768|8.37709545277724|\n", "| 8502188| 8502188| Zufikon Hammergut|47.3558347019549|8.35472740219955|\n", "|8502188:0:1| 8502188| Zufikon Hammergut|47.3558955576756|8.35472740219955|\n", "|8502188:0:2| 8502188| Zufikon Hammergut|47.3559564133261|8.35472740219955|\n", "| 8502188P| 8502188| Zufikon Hammergut|47.3558347019549|8.35472740219955|\n", "| 8502208| 8502208| Horgen Oberdorf|47.2587475534877|8.58979854578067|\n", "|8502208:0:2| 8502208| Horgen Oberdorf|47.2589304560815|8.58979854578067|\n", "|8502208:0:3| 8502208| Horgen Oberdorf|47.2588085210892|8.58979854578067|\n", "|8502208:0:4| 8502208| Horgen Oberdorf|47.2588694886204|8.58979854578067|\n", "| 8502208P| 8502208| Horgen Oberdorf|47.2587475534877|8.58979854578067|\n", "| 8502209| 8502209| Oberrieden Dorf|47.2767238569466| 8.577635356832|\n", "|8502209:0:1| 8502209| Oberrieden Dorf|47.2768457506749| 8.577635356832|\n", 
"+-----------+---------------+--------------------+----------------+----------------+\n", "only showing top 20 rows" ] } ], "source": [ "stops_15km_for_join = stops_15km.select(stops_15km.stop_id, \n", " stops_15km.stop_id_general, \n", " stops.stop_name, \n", " stops.stop_lat, \n", " stops.stop_lon)\n", "stops_15km_for_join.show()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+------------+--------------+-----------+-------------+-----------+-------------+\n", "| trip_id|arrival_time|departure_time| stop_id|stop_sequence|pickup_type|drop_off_type|\n", "+--------------------+------------+--------------+-----------+-------------+-----------+-------------+\n", "|1.TA.1-1-B-j19-1.1.R| 04:20:00| 04:20:00|8500010:0:3| 1| 0| 0|\n", "|1.TA.1-1-B-j19-1.1.R| 04:24:00| 04:24:00|8500020:0:3| 2| 0| 0|\n", "|1.TA.1-1-B-j19-1.1.R| 04:28:00| 04:28:00|8500021:0:5| 3| 0| 0|\n", "|1.TA.1-1-B-j19-1.1.R| 04:30:00| 04:30:00|8517131:0:2| 4| 0| 0|\n", "|1.TA.1-1-B-j19-1.1.R| 04:32:00| 04:32:00|8500300:0:5| 5| 0| 0|\n", "|1.TA.1-1-B-j19-1.1.R| 04:35:00| 04:35:00|8500313:0:2| 6| 0| 0|\n", "|1.TA.1-1-B-j19-1.1.R| 04:37:00| 04:38:00|8500301:0:3| 7| 0| 0|\n", "|1.TA.1-1-B-j19-1.1.R| 04:40:00| 04:41:00|8500302:0:3| 8| 0| 0|\n", "|1.TA.1-1-B-j19-1.1.R| 04:45:00| 04:45:00|8500303:0:2| 9| 0| 0|\n", "|1.TA.1-1-B-j19-1.1.R| 04:48:00| 04:49:00|8500320:0:3| 10| 0| 0|\n", "|1.TA.1-1-B-j19-1.1.R| 04:52:00| 04:52:00|8500304:0:2| 11| 0| 0|\n", "|1.TA.1-1-B-j19-1.1.R| 04:56:00| 04:56:00|8500305:0:1| 12| 0| 0|\n", "|25.TA.1-1-B-j19-1...| 05:50:00| 05:50:00|8500010:0:3| 1| 0| 0|\n", 
"|25.TA.1-1-B-j19-1...| 05:54:00| 05:54:00|8500020:0:3| 2| 0| 0|\n", "|25.TA.1-1-B-j19-1...| 05:58:00| 05:58:00|8500021:0:5| 3| 0| 0|\n", "|25.TA.1-1-B-j19-1...| 06:00:00| 06:00:00|8517131:0:2| 4| 0| 0|\n", "|25.TA.1-1-B-j19-1...| 06:02:00| 06:02:00|8500300:0:5| 5| 0| 0|\n", "|25.TA.1-1-B-j19-1...| 06:05:00| 06:05:00|8500313:0:2| 6| 0| 0|\n", "|25.TA.1-1-B-j19-1...| 06:07:00| 06:08:00|8500301:0:3| 7| 0| 0|\n", "|25.TA.1-1-B-j19-1...| 06:10:00| 06:11:00|8500302:0:3| 8| 0| 0|\n", "+--------------------+------------+--------------+-----------+-------------+-----------+-------------+\n", "only showing top 20 rows" ] } ], "source": [ "stop_times = spark.read.csv(\"/data/sbb/timetables/csv/stop_times/2019/05/14/stop_times.txt\", header=True, sep = \",\")\n", "stop_times.show()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "11128930" ] } ], "source": [ "stop_times.count()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-----------+--------------------+------------+--------------+-------------+-----------+-------------+---------------+-------------------+----------------+----------------+\n", "| stop_id| trip_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|stop_id_general| stop_name| 
stop_lat| stop_lon|\n", "+-----------+--------------------+------------+--------------+-------------+-----------+-------------+---------------+-------------------+----------------+----------------+\n", "|8503202:0:5|61.TA.25-75-j19-1...| 15:12:00| 15:14:00| 5| 0| 0| 8503202| Thalwil|47.2962171893553|8.56475351565593|\n", "|8503202:0:5|99.TA.25-75-j19-1...| 19:14:00| 19:15:00| 5| 0| 0| 8503202| Thalwil|47.2962171893553|8.56475351565593|\n", "|8503202:0:5|137.TA.25-75-j19-...| 23:12:00| 23:14:00| 5| 0| 0| 8503202| Thalwil|47.2962171893553|8.56475351565593|\n", "|8503000:0:6|287.TA.25-75-j19-...| 19:35:00| 19:35:00| 1| 0| 0| 8503000| Zürich HB|47.3786628415821|8.54019357578468|\n", "|8503000:0:7|337.TA.25-75-j19-...| 22:35:00| 22:35:00| 1| 0| 0| 8503000| Zürich HB|47.3787236709708|8.54019357578468|\n", "| 8587860|8.TA.26-811-j19-1...| 23:38:00| 23:38:00| 9| 0| 0| 8587860| Uster, Strick|47.3564493415083|8.71275994188806|\n", "| 8588052|35.TA.26-811-j19-...| 06:48:00| 06:48:00| 4| 0| 0| 8588052| Uster, Gschwader|47.3582688760385|8.71277790819375|\n", "| 8588052|38.TA.26-811-j19-...| 18:33:00| 18:33:00| 4| 0| 0| 8588052| Uster, Gschwader|47.3582688760385|8.71277790819375|\n", "| 8573504|42.TA.26-811-j19-...| 06:30:00| 06:30:00| 1| 0| 0| 8573504| Uster, Bahnhof|47.3511851173852|8.71683829327853|\n", "| 8573504|98.TA.26-811-j19-...| 15:15:00| 15:15:00| 1| 0| 0| 8573504| Uster, Bahnhof|47.3511851173852|8.71683829327853|\n", "| 8503152|133.TA.26-811-j19...| 17:51:00| 17:51:00| 7| 0| 0| 8503152|Uster, Brandschenke|47.3611410570261|8.71073873249851|\n", "| 8588050|27.TA.26-812-j19-...| 20:05:00| 20:05:00| 7| 0| 0| 8588050| Uster, Bordacker|47.3519945640447|8.72933385888238|\n", "| 8588051|35.TA.26-812-j19-...| 09:30:00| 09:30:00| 2| 0| 0| 8588051| Uster, Dammstrasse|47.3534491024058|8.71567946656186|\n", "| 8588059|61.TA.26-812-j19-...| 06:04:00| 06:04:00| 6| 0| 0| 8588059| Uster, Weidli| 47.353996826126|8.73063641604453|\n", "| 8503567|66.TA.26-812-j19-...| 06:17:00| 
06:17:00| 3| 0| 0| 8503567| Uster, Spital|47.3538507670231|8.72349480953479|\n", "| 8503567|105.TA.26-812-j19...| 08:32:00| 08:32:00| 3| 0| 0| 8503567| Uster, Spital|47.3538507670231|8.72349480953479|\n", "| 8588056|116.TA.26-812-j19...| 20:07:00| 20:07:00| 9| 0| 0| 8588056| Uster, Talweg|47.3485132466833|8.73078014649001|\n", "| 8503872|166.TA.26-812-j19...| 12:18:00| 12:18:00| 4| 0| 0| 8503872| Uster, Reithalle|47.3562972036718|8.72678264347512|\n", "| 8588056|182.TA.26-812-j19...| 13:37:00| 13:37:00| 9| 0| 0| 8588056| Uster, Talweg|47.3485132466833|8.73078014649001|\n", "| 8573504|12.TA.26-813-j19-...| 17:57:00| 17:57:00| 8| 0| 0| 8573504| Uster, Bahnhof|47.3511851173852|8.71683829327853|\n", "+-----------+--------------------+------------+--------------+-------------+-----------+-------------+---------------+-------------------+----------------+----------------+\n", "only showing top 20 rows" ] } ], "source": [ "stop_times_15km = stop_times.join(stops_15km_for_join, how=\"inner\", on = \"stop_id\").dropDuplicates()\n", "stop_times_15km.show()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_15km.write.csv('data/lgpt_guys/stop_times_15km.csv', header=True, mode='overwrite')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 3) Keep only services that run each day of the week" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] 
}, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "2322109" ] } ], "source": [ "stop_times_15km = spark.read.csv('data/lgpt_guys/stop_times_15km.csv', header=True)\n", "stop_times_15km.count()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "calendar = spark.read.csv(\"/data/sbb/timetables/csv/calendar/2019/05/14/calendar.txt\", header=True, sep = \",\")" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+----------+------+-------+---------+--------+------+--------+------+----------+--------+\n", "|service_id|monday|tuesday|wednesday|thursday|friday|saturday|sunday|start_date|end_date|\n", "+----------+------+-------+---------+--------+------+--------+------+----------+--------+\n", "| TA+b0nx9| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b03bf| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b0008| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b0nxg| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b08k4| 1| 0| 0| 0| 0| 0| 0| 20181209|20191214|\n", "| TA+b06hs| 0| 0| 0| 0| 1| 0| 0| 20181209|20191214|\n", "| TA+b09de| 0| 0| 0| 0| 1| 0| 0| 20181209|20191214|\n", "| TA+b0nxn| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b05qx| 1| 1| 1| 0| 0| 0| 0| 20181209|20191214|\n", "| 
TA+b0nxa| 0| 0| 0| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b05k1| 1| 0| 0| 0| 0| 0| 0| 20181209|20191214|\n", "| TA+b01pq| 0| 0| 0| 0| 1| 0| 0| 20181209|20191214|\n", "| TA+b0nxb| 0| 0| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b04l2| 0| 1| 0| 0| 0| 0| 0| 20181209|20191214|\n", "| TA+b063g| 1| 0| 0| 0| 0| 0| 0| 20181209|20191214|\n", "| TA+b08xi| 1| 0| 0| 0| 0| 0| 0| 20181209|20191214|\n", "| TA+b0nxd| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b0nxe| 0| 0| 0| 0| 0| 1| 1| 20181209|20191214|\n", "| TA+b0nxf| 0| 0| 0| 0| 0| 1| 1| 20181209|20191214|\n", "| TA+b08zi| 0| 0| 0| 0| 0| 0| 1| 20181209|20191214|\n", "+----------+------+-------+---------+--------+------+--------+------+----------+--------+\n", "only showing top 20 rows" ] } ], "source": [ "calendar.show()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+----------+------+-------+---------+--------+------+--------+------+----------+--------+\n", "|service_id|monday|tuesday|wednesday|thursday|friday|saturday|sunday|start_date|end_date|\n", "+----------+------+-------+---------+--------+------+--------+------+----------+--------+\n", "| TA+b0nx9| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b03bf| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b0008| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b0nxg| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b0nxn| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b0nxd| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b0nxh| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b0nxi| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b0nxl| 1| 1| 1| 1| 1| 0| 0| 
20181209|20191214|\n", "| TA+b0f63| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b0f6a| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b0ap6| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b03c1| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b0nke| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b09su| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b00bo| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b0nxc| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b0nxq| 1| 1| 1| 1| 1| 1| 1| 20181209|20191214|\n", "| TA+b0nuo| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b0nxv| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "+----------+------+-------+---------+--------+------+--------+------+----------+--------+\n", "only showing top 20 rows" ] } ], "source": [ "calendar_business_days = calendar.filter((calendar.monday==1) & \\\n", " (calendar.tuesday==1) & \\\n", " (calendar.wednesday==1) & \\\n", " (calendar.thursday==1) & \\\n", " (calendar.friday==1))\n", "calendar_business_days.show()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-----------+----------+--------------------+------------------+---------------+------------+\n", "| route_id|service_id| trip_id| trip_headsign|trip_short_name|direction_id|\n", "+-----------+----------+--------------------+------------------+---------------+------------+\n", "|1-1-C-j19-1| TA+b0001|5.TA.1-1-C-j19-1.3.R|Zofingen, Altachen| 108| 1|\n", "|1-1-C-j19-1| TA+b0001|7.TA.1-1-C-j19-1.3.R|Zofingen, Altachen| 112| 1|\n", "|1-1-C-j19-1| TA+b0001|9.TA.1-1-C-j19-1.3.R|Zofingen, Altachen| 116| 1|\n", "|1-1-C-j19-1| 
TA+b0001|11.TA.1-1-C-j19-1...|Zofingen, Altachen| 120| 1|\n", "|1-1-C-j19-1| TA+b0001|13.TA.1-1-C-j19-1...|Zofingen, Altachen| 124| 1|\n", "|1-1-C-j19-1| TA+b0001|15.TA.1-1-C-j19-1...|Zofingen, Altachen| 128| 1|\n", "|1-1-C-j19-1| TA+b0001|17.TA.1-1-C-j19-1...|Zofingen, Altachen| 132| 1|\n", "|1-1-C-j19-1| TA+b0001|18.TA.1-1-C-j19-1...|Zofingen, Altachen| 134| 1|\n", "|1-1-C-j19-1| TA+b0001|19.TA.1-1-C-j19-1...|Zofingen, Altachen| 136| 1|\n", "|1-1-C-j19-1| TA+b0001|20.TA.1-1-C-j19-1...|Zofingen, Altachen| 138| 1|\n", "|1-1-C-j19-1| TA+b0001|21.TA.1-1-C-j19-1...|Zofingen, Altachen| 140| 1|\n", "|1-1-C-j19-1| TA+b0001|22.TA.1-1-C-j19-1...|Zofingen, Altachen| 142| 1|\n", "|1-1-C-j19-1| TA+b0001|23.TA.1-1-C-j19-1...|Zofingen, Altachen| 144| 1|\n", "|1-1-C-j19-1| TA+b0001|24.TA.1-1-C-j19-1...|Zofingen, Altachen| 146| 1|\n", "|1-1-C-j19-1| TA+b0001|25.TA.1-1-C-j19-1...|Zofingen, Altachen| 148| 1|\n", "|1-1-C-j19-1| TA+b0001|26.TA.1-1-C-j19-1...|Zofingen, Altachen| 150| 1|\n", "|1-1-C-j19-1| TA+b0001|27.TA.1-1-C-j19-1...|Zofingen, Altachen| 152| 1|\n", "|1-1-C-j19-1| TA+b0001|30.TA.1-1-C-j19-1...|Zofingen, Altachen| 156| 1|\n", "|1-1-C-j19-1| TA+b0001|37.TA.1-1-C-j19-1...|Zofingen, Altachen| 168| 1|\n", "|1-1-C-j19-1| TA+b0001|38.TA.1-1-C-j19-1...|Zofingen, Altachen| 172| 1|\n", "+-----------+----------+--------------------+------------------+---------------+------------+\n", "only showing top 20 rows" ] } ], "source": [ "trips = spark.read.csv(\"/data/sbb/timetables/csv/trips/2019/05/14/trips.txt\", header=True, sep = \",\")\n", "trips.show()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ 
"1017413" ] } ], "source": [ "trips.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Is there any useful information contained in `start_date` and `end_date` ?" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+----------+--------+\n", "|start_date|end_date|\n", "+----------+--------+\n", "| 20181209|20191214|\n", "+----------+--------+" ] } ], "source": [ "calendar_business_days.select(calendar_business_days.start_date, calendar_business_days.end_date).dropDuplicates().show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`start_date` and `end_date` will not provide us with useful information as their values are the same for all services." 
] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+----------+\n", "|service_id|\n", "+----------+\n", "| TA+b0nx9|\n", "| TA+b03bf|\n", "| TA+b0008|\n", "| TA+b0nxg|\n", "| TA+b0nxn|\n", "| TA+b0nxd|\n", "| TA+b0nxh|\n", "| TA+b0nxi|\n", "| TA+b0nxl|\n", "| TA+b0f63|\n", "| TA+b0f6a|\n", "| TA+b0ap6|\n", "| TA+b03c1|\n", "| TA+b0nke|\n", "| TA+b09su|\n", "| TA+b00bo|\n", "| TA+b0nxc|\n", "| TA+b0nxq|\n", "| TA+b0nuo|\n", "| TA+b0nxv|\n", "+----------+\n", "only showing top 20 rows" ] } ], "source": [ "calendar_business_days_for_join = calendar_business_days.select(calendar_business_days.service_id) \n", "calendar_business_days_for_join.show()" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+----------+------------+--------------------+--------------------+---------------+------------+\n", "|service_id| route_id| trip_id| trip_headsign|trip_short_name|direction_id|\n", "+----------+------------+--------------------+--------------------+---------------+------------+\n", "| TA+b0001| 1-1-C-j19-1|46.TA.1-1-C-j19-1...|Aarburg-Oftringen...| 113| 0|\n", "| TA+b0001| 1-1-C-j19-1|59.TA.1-1-C-j19-1...|Aarburg-Oftringen...| 139| 0|\n", "| TA+b0001| 1-340-j19-1|2.TA.1-340-j19-1.1.H| Wohlen AG, Bahnhof| 105| 0|\n", 
"| TA+b0001| 1-354-j19-1|36.TA.1-354-j19-1...|Kaiserstuhl AG, B...| 35435| 0|\n", "| TA+b0001| 1-354-j19-1|47.TA.1-354-j19-1...|Kaiserstuhl AG, B...| 35467| 0|\n", "| TA+b0001| 1-393-j19-1|70.TA.1-393-j19-1...|Othmarsingen, Bah...| 14060| 0|\n", "| TA+b0001| 1-508-j19-1|87.TA.1-508-j19-1...|Aarburg-Oftringen...| 8178| 1|\n", "| TA+b0001| 2-230-j19-1|28.TA.2-230-j19-1...| Trogen, Bahnhof| 23023| 0|\n", "| TA+b0001| 3-193-j19-1|221.TA.3-193-j19-...| Appenzell, Bahnhof| 1040| 1|\n", "| TA+b0001| 4-76-j19-1|54.TA.4-76-j19-1.2.H| Lausen, Furlen| 76049| 0|\n", "| TA+b0001| 6-101-j19-1|645.TA.6-101-j19-...| Beatenberg, Station| 10103| 0|\n", "| TA+b0001|6-11-A-j19-1|188.TA.6-11-A-j19...| Bern, Holligen| 11596| 1|\n", "| TA+b0001|6-11-B-j19-1|123.TA.6-11-B-j19...|Biel/Bienne, Bahn...| 11045| 1|\n", "| TA+b0001| 6-151-j19-1|54.TA.6-151-j19-1...| Brienz BE, Bahnhof| 236| 1|\n", "| TA+b0001| 6-21-j19-1|106.TA.6-21-j19-1...| Thun, Bahnhof| 21199| 1|\n", "| TA+b0001| 6-210-j19-1|23.TA.6-210-j19-1...| Frutigen, Bahnhof| 21025| 0|\n", "| TA+b0001| 6-27-j19-1|195.TA.6-27-j19-1...|Bern, Weyermannsh...| 17667| 1|\n", "| TA+b0001| 6-871-j19-1|43.TA.6-871-j19-1...| Waltwil| 71031| 0|\n", "| TA+b0001| 6-9-B-j19-1|28.TA.6-9-B-j19-1...| Bern| 9110| 1|\n", "| TA+b0001| 6-9-B-j19-1|43.TA.6-9-B-j19-1...| Bern| 9146| 1|\n", "+----------+------------+--------------------+--------------------+---------------+------------+\n", "only showing top 20 rows" ] } ], "source": [ "trips_business_week = trips.join(calendar_business_days_for_join, how=\"inner\", on = \"service_id\").dropDuplicates()\n", "trips_business_week.show()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, 
{ "name": "stdout", "output_type": "stream", "text": [ "528368" ] } ], "source": [ "trips_business_week.count()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+------------+--------------------+--------------------+---------------+------------+\n", "| route_id| trip_id| trip_headsign|trip_short_name|direction_id|\n", "+------------+--------------------+--------------------+---------------+------------+\n", "| 1-1-C-j19-1|46.TA.1-1-C-j19-1...|Aarburg-Oftringen...| 113| 0|\n", "| 1-1-C-j19-1|59.TA.1-1-C-j19-1...|Aarburg-Oftringen...| 139| 0|\n", "| 1-340-j19-1|2.TA.1-340-j19-1.1.H| Wohlen AG, Bahnhof| 105| 0|\n", "| 1-354-j19-1|36.TA.1-354-j19-1...|Kaiserstuhl AG, B...| 35435| 0|\n", "| 1-354-j19-1|47.TA.1-354-j19-1...|Kaiserstuhl AG, B...| 35467| 0|\n", "| 1-393-j19-1|70.TA.1-393-j19-1...|Othmarsingen, Bah...| 14060| 0|\n", "| 1-508-j19-1|87.TA.1-508-j19-1...|Aarburg-Oftringen...| 8178| 1|\n", "| 2-230-j19-1|28.TA.2-230-j19-1...| Trogen, Bahnhof| 23023| 0|\n", "| 3-193-j19-1|221.TA.3-193-j19-...| Appenzell, Bahnhof| 1040| 1|\n", "| 4-76-j19-1|54.TA.4-76-j19-1.2.H| Lausen, Furlen| 76049| 0|\n", "| 6-101-j19-1|645.TA.6-101-j19-...| Beatenberg, Station| 10103| 0|\n", "|6-11-A-j19-1|188.TA.6-11-A-j19...| Bern, Holligen| 11596| 1|\n", "|6-11-B-j19-1|123.TA.6-11-B-j19...|Biel/Bienne, Bahn...| 11045| 1|\n", "| 6-151-j19-1|54.TA.6-151-j19-1...| Brienz BE, Bahnhof| 236| 1|\n", "| 6-21-j19-1|106.TA.6-21-j19-1...| Thun, Bahnhof| 21199| 1|\n", "| 6-210-j19-1|23.TA.6-210-j19-1...| Frutigen, Bahnhof| 21025| 0|\n", "| 6-27-j19-1|195.TA.6-27-j19-1...|Bern, Weyermannsh...| 17667| 1|\n", "| 
6-871-j19-1|43.TA.6-871-j19-1...| Waltwil| 71031| 0|\n", "| 6-9-B-j19-1|28.TA.6-9-B-j19-1...| Bern| 9110| 1|\n", "| 6-9-B-j19-1|43.TA.6-9-B-j19-1...| Bern| 9146| 1|\n", "+------------+--------------------+--------------------+---------------+------------+\n", "only showing top 20 rows" ] } ], "source": [ "trips_business_week_for_join = trips_business_week.drop('service_id')\n", "trips_business_week_for_join.show()" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+-----------+------------+--------------+-------------+-----------+-------------+---------------+--------------------+----------------+----------------+------------+--------------------+---------------+------------+\n", "| trip_id| stop_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|stop_id_general| stop_name| stop_lat| stop_lon| route_id| trip_headsign|trip_short_name|direction_id|\n", "+--------------------+-----------+------------+--------------+-------------+-----------+-------------+---------------+--------------------+----------------+----------------+------------+--------------------+---------------+------------+\n", "|1005.TA.26-131-j1...| 8589111| 16:15:00| 16:15:00| 2| 0| 0| 8589111|Horgen, Gumelenst...| 47.260856991692|8.59230484542371|26-131-j19-1| Horgen, Aamüli| 636| 0|\n", "|1005.TA.26-131-j1...| 8588984| 16:21:00| 16:21:00| 7| 0| 0| 8588984| Horgen, Gehren| 47.252107761586|8.60141376240595|26-131-j19-1| Horgen, Aamüli| 636| 0|\n", + "|1005.TA.26-131-j1...|8503855:0:F| 16:14:00| 16:14:00| 1| 0| 0| 8503855| Horgen, 
Bahnhof|47.2618568116556|8.59697608490178|26-131-j19-1| Horgen, Aamüli| 636| 0|\n", "|1005.TA.26-131-j1...| 8588985| 16:20:00| 16:20:00| 6| 0| 0| 8588985| Horgen, Heubach| 47.254772314364|8.59791931595024|26-131-j19-1| Horgen, Aamüli| 636| 0|\n", "|1005.TA.26-131-j1...| 8573554| 16:18:00| 16:18:00| 4| 0| 0| 8573554|Horgen Oberdorf, ...|47.2586804890449|8.59024770342279|26-131-j19-1| Horgen, Aamüli| 636| 0|\n", "|1005.TA.26-131-j1...| 8573553| 16:16:00| 16:16:00| 3| 0| 0| 8573553| Horgen, Stocker|47.2615154118397|8.58892717995495|26-131-j19-1| Horgen, Aamüli| 636| 0|\n", "|1005.TA.26-131-j1...| 8573555| 16:19:00| 16:19:00| 5| 0| 0| 8573555| Horgen, Bergli|47.2576623184348| 8.5932121438608|26-131-j19-1| Horgen, Aamüli| 636| 0|\n", - "|1005.TA.26-131-j1...|8503855:0:F| 16:14:00| 16:14:00| 1| 0| 0| 8503855| Horgen, Bahnhof|47.2618568116556|8.59697608490178|26-131-j19-1| Horgen, Aamüli| 636| 0|\n", - "|103.TA.26-925-j19...| 8576080| 07:42:00| 07:42:00| 21| 0| 0| 8576080| Meilen, Bahnhof|47.2694401970586|8.64488323901054|26-925-j19-1| Meilen, Bahnhof| 571| 0|\n", "|103.TA.26-925-j19...| 8576082| 07:38:00| 07:38:00| 20| 0| 0| 8576082| Meilen, Beugen|47.2672701430669|8.65071330520529|26-925-j19-1| Meilen, Bahnhof| 571| 0|\n", + "|103.TA.26-925-j19...| 8576080| 07:42:00| 07:42:00| 21| 0| 0| 8576080| Meilen, Bahnhof|47.2694401970586|8.64488323901054|26-925-j19-1| Meilen, Bahnhof| 571| 0|\n", "|104.TA.26-733-j19...| 8587420| 07:40:00| 07:40:00| 6| 0| 0| 8587420| Kloten, Bahnhof| 47.448965141581|8.58388763121034|26-733-j19-1| Kloten, Graswinkel| 4723| 1|\n", "|104.TA.26-733-j19...| 8573211| 07:36:00| 07:36:00| 4| 0| 0| 8573211|Kloten, Zum Wilde...| 47.453545361717|8.58019555539209|26-733-j19-1| Kloten, Graswinkel| 4723| 1|\n", "|104.TA.26-733-j19...| 8588553| 07:34:00| 07:34:00| 3| 0| 0| 8588553|Zürich Flughafen,...|47.4524944976638|8.57205681891684|26-733-j19-1| Kloten, Graswinkel| 4723| 1|\n", "|104.TA.26-733-j19...|8573205:0:D| 07:33:00| 07:33:00| 1| 0| 0| 8573205|Zürich 
Flughafen,...|47.4506842895344|8.56372943623189|26-733-j19-1| Kloten, Graswinkel| 4723| 1|\n", "|104.TA.26-733-j19...| 8580433| 07:44:00| 07:44:00| 9| 0| 0| 8580433| Kloten, Graswinkel|47.4509394233112|8.59648201149545|26-733-j19-1| Kloten, Graswinkel| 4723| 1|\n", "|104.TA.26-733-j19...| 8580434| 07:41:00| 07:41:00| 7| 0| 0| 8580434|Kloten, Lindenstr...|47.4522454458795|8.58714851569215|26-733-j19-1| Kloten, Graswinkel| 4723| 1|\n", "|104.TA.26-733-j19...| 8590699| 07:37:00| 07:37:00| 5| 0| 0| 8590699| Kloten, Stadthaus| 47.450745035784|8.58126455058034|26-733-j19-1| Kloten, Graswinkel| 4723| 1|\n", "|104.TA.26-733-j19...| 8580301| 07:33:00| 07:33:00| 2| 0| 0| 8580301|Zürich Flughafen,...|47.4526524323306|8.56566081409302|26-733-j19-1| Kloten, Graswinkel| 4723| 1|\n", "|104.TA.26-733-j19...| 8576153| 07:42:00| 07:42:00| 8| 0| 0| 8576153| Kloten, Rankstrasse|47.4511884812805|8.59205331714412|26-733-j19-1| Kloten, Graswinkel| 4723| 1|\n", "|1087.TA.26-5-B-j1...| 8591058| 15:49:00| 15:49:00| 6| 0| 0| 8591058|Zürich Enge, Bahnhof|47.3641286895461|8.53156974905593|26-5-B-j19-1|Zürich, Kirche Fl...| 3194| 1|\n", "|1087.TA.26-5-B-j1...| 8591317| 15:51:00| 15:51:00| 7| 0| 0| 8591317|Zürich, Rentenans...|47.3633863608069|8.53503724605312|26-5-B-j19-1|Zürich, Kirche Fl...| 3194| 1|\n", "+--------------------+-----------+------------+--------------+-------------+-----------+-------------+---------------+--------------------+----------------+----------------+------------+--------------------+---------------+------------+\n", "only showing top 20 rows" ] } ], "source": [ "stop_times_15km_business_week = stop_times_15km.join(trips_business_week_for_join, how=\"inner\", on = \"trip_id\").dropDuplicates()\n", "stop_times_15km_business_week.show()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ 
"FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_15km_business_week.write.csv('data/lgpt_guys/stop_times_15km_business_week.csv', header=True, mode='overwrite')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 4) Keeping only departure times between a certain time of the day (7am, 8pm)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+-------+------------+--------------+-------------+-----------+-------------+---------------+--------------------+----------------+----------------+-------------+--------------------+---------------+------------+\n", "| trip_id|stop_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|stop_id_general| stop_name| stop_lat| stop_lon| route_id| trip_headsign|trip_short_name|direction_id|\n", "+--------------------+-------+------------+--------------+-------------+-----------+-------------+---------------+--------------------+----------------+----------------+-------------+--------------------+---------------+------------+\n", "|1.TA.26-925-j19-1...|8576080| 15:27:00| 15:27:00| 23| 0| 0| 8576080| Meilen, Bahnhof|47.2694401970586|8.64488323901054| 26-925-j19-1| Meilen, Bahnhof| 280| 0|\n", "|1.TA.26-925-j19-1...|8576082| 15:22:00| 15:22:00| 22| 0| 0| 8576082| Meilen, Beugen|47.2672701430669|8.65071330520529| 26-925-j19-1| Meilen, Bahnhof| 280| 0|\n", "|1014.TA.26-70-A-j...|8591304| 11:54:00| 11:54:00| 3| 0| 0| 8591304|Zürich, Post 
Woll...|47.3444717091534|8.53296213774651|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0|\n", "|1014.TA.26-70-A-j...|8591278| 12:05:00| 12:05:00| 13| 0| 0| 8591278|Zürich, Mittellei...|47.3231389520848|8.51428616298707|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0|\n", "|1014.TA.26-70-A-j...|8591106| 11:56:00| 11:56:00| 5| 0| 0| 8591106|Zürich, Butzenstr...|47.3414099167461|8.53031210765799|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0|\n", "|1014.TA.26-70-A-j...|8591270| 12:02:00| 12:02:00| 10| 0| 0| 8591270| Zürich, Marbachweg|47.3303482449491|8.51537312448101|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0|\n", "|1014.TA.26-70-A-j...|8591410| 11:57:00| 11:57:00| 6| 0| 0| 8591410|Zürich, Verenastr...|47.3408255385719|8.52538035674749|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0|\n", + "|1014.TA.26-70-A-j...|8591279| 11:55:00| 11:55:00| 4| 0| 0| 8591279| Zürich, Morgental|47.3439482343686|8.53014142775399|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0|\n", + "|1014.TA.26-70-A-j...|8591268| 11:58:00| 11:58:00| 7| 0| 0| 8591268| Zürich, Manegg|47.3369660452942|8.52034979115572|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0|\n", "|1014.TA.26-70-A-j...|8591370| 12:04:00| 12:04:00| 12| 0| 0| 8591370|Zürich, Sihlweids...|47.3264149182794|8.51466345540645|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0|\n", "|1014.TA.26-70-A-j...|8591154| 11:59:00| 11:59:00| 8| 0| 0| 8591154|Zürich, Frymannst...|47.3351336003511|8.51914604867483|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0|\n", "|1014.TA.26-70-A-j...|8591210| 12:03:00| 12:03:00| 11| 0| 0| 8591210| Zürich, Im Hüsli|47.3282354882425|8.51269614493396|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0|\n", "|1014.TA.26-70-A-j...|8591081| 11:53:00| 11:53:00| 2| 0| 0| 8591081|Zürich Wollishofe...|47.3470342259279| 8.5329172219823|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0|\n", "|1014.TA.26-70-A-j...|8591061| 12:00:00| 12:01:00| 9| 0| 0| 8591061|Zürich Leimbach, ...|47.3332523864039|8.51859807635144|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0|\n", 
"|1014.TA.26-70-A-j...|8502495| 11:51:00| 11:51:00| 1| 0| 0| 8502495|Zürich Wollishofe...|47.3476976601166|8.53331248070737|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0|\n", - "|1014.TA.26-70-A-j...|8591279| 11:55:00| 11:55:00| 4| 0| 0| 8591279| Zürich, Morgental|47.3439482343686|8.53014142775399|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0|\n", - "|1014.TA.26-70-A-j...|8591268| 11:58:00| 11:58:00| 7| 0| 0| 8591268| Zürich, Manegg|47.3369660452942|8.52034979115572|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0|\n", "|102.TA.26-765-j19...|8573211| 19:26:00| 19:26:00| 17| 0| 0| 8573211|Kloten, Zum Wilde...| 47.453545361717|8.58019555539209| 26-765-j19-1|Zürich Flughafen,...| 1648| 1|\n", "|102.TA.26-765-j19...|8503700| 19:12:00| 19:13:00| 7| 0| 0| 8503700|Bassersdorf, Bahnhof|47.4387159099396|8.62613539902836| 26-765-j19-1|Zürich Flughafen,...| 1648| 1|\n", "|102.TA.26-765-j19...|8590539| 19:03:00| 19:03:00| 1| 0| 0| 8590539|Dietlikon, Bahnho...|47.4219616816404|8.62080838939279| 26-765-j19-1|Zürich Flughafen,...| 1648| 1|\n", "|102.TA.26-765-j19...|8576153| 19:23:00| 19:23:00| 15| 0| 0| 8576153| Kloten, Rankstrasse|47.4511884812805|8.59205331714412| 26-765-j19-1|Zürich Flughafen,...| 1648| 1|\n", "|102.TA.26-765-j19...|8590503| 19:16:00| 19:16:00| 9| 0| 0| 8590503|Bassersdorf, Chlu...|47.4446275362315|8.62462622935083| 26-765-j19-1|Zürich Flughafen,...| 1648| 1|\n", "+--------------------+-------+------------+--------------+-------------+-----------+-------------+---------------+--------------------+----------------+----------------+-------------+--------------------+---------------+------------+\n", "only showing top 20 rows" ] } ], "source": [ "stop_times_15km_business_week = spark.read.csv('data/lgpt_guys/stop_times_15km_business_week.csv', header=True)\n", "stop_times_15km_business_week.show()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, 
"version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "398630" ] } ], "source": [ "stop_times_15km_business_week.count()" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+-------+------------+--------------+-------------+-----------+-------------+---------------+--------------------+----------------+----------------+-------------+--------------------+---------------+------------+--------------+\n", "| trip_id|stop_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|stop_id_general| stop_name| stop_lat| stop_lon| route_id| trip_headsign|trip_short_name|direction_id|departure_hour|\n", "+--------------------+-------+------------+--------------+-------------+-----------+-------------+---------------+--------------------+----------------+----------------+-------------+--------------------+---------------+------------+--------------+\n", "|1.TA.26-925-j19-1...|8576080| 15:27:00| 15:27:00| 23| 0| 0| 8576080| Meilen, Bahnhof|47.2694401970586|8.64488323901054| 26-925-j19-1| Meilen, Bahnhof| 280| 0| 15|\n", "|1.TA.26-925-j19-1...|8576082| 15:22:00| 15:22:00| 22| 0| 0| 8576082| Meilen, Beugen|47.2672701430669|8.65071330520529| 26-925-j19-1| Meilen, Bahnhof| 280| 0| 15|\n", "|1014.TA.26-70-A-j...|8591304| 11:54:00| 11:54:00| 3| 0| 0| 8591304|Zürich, Post Woll...|47.3444717091534|8.53296213774651|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 11|\n", 
"|1014.TA.26-70-A-j...|8591278| 12:05:00| 12:05:00| 13| 0| 0| 8591278|Zürich, Mittellei...|47.3231389520848|8.51428616298707|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 12|\n", "|1014.TA.26-70-A-j...|8591106| 11:56:00| 11:56:00| 5| 0| 0| 8591106|Zürich, Butzenstr...|47.3414099167461|8.53031210765799|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 11|\n", "|1014.TA.26-70-A-j...|8591270| 12:02:00| 12:02:00| 10| 0| 0| 8591270| Zürich, Marbachweg|47.3303482449491|8.51537312448101|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 12|\n", "|1014.TA.26-70-A-j...|8591410| 11:57:00| 11:57:00| 6| 0| 0| 8591410|Zürich, Verenastr...|47.3408255385719|8.52538035674749|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 11|\n", + "|1014.TA.26-70-A-j...|8591279| 11:55:00| 11:55:00| 4| 0| 0| 8591279| Zürich, Morgental|47.3439482343686|8.53014142775399|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 11|\n", + "|1014.TA.26-70-A-j...|8591268| 11:58:00| 11:58:00| 7| 0| 0| 8591268| Zürich, Manegg|47.3369660452942|8.52034979115572|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 11|\n", "|1014.TA.26-70-A-j...|8591370| 12:04:00| 12:04:00| 12| 0| 0| 8591370|Zürich, Sihlweids...|47.3264149182794|8.51466345540645|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 12|\n", "|1014.TA.26-70-A-j...|8591154| 11:59:00| 11:59:00| 8| 0| 0| 8591154|Zürich, Frymannst...|47.3351336003511|8.51914604867483|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 11|\n", "|1014.TA.26-70-A-j...|8591210| 12:03:00| 12:03:00| 11| 0| 0| 8591210| Zürich, Im Hüsli|47.3282354882425|8.51269614493396|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 12|\n", "|1014.TA.26-70-A-j...|8591081| 11:53:00| 11:53:00| 2| 0| 0| 8591081|Zürich Wollishofe...|47.3470342259279| 8.5329172219823|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 11|\n", "|1014.TA.26-70-A-j...|8591061| 12:00:00| 12:01:00| 9| 0| 0| 8591061|Zürich Leimbach, ...|47.3332523864039|8.51859807635144|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 12|\n", "|1014.TA.26-70-A-j...|8502495| 11:51:00| 11:51:00| 1| 
0| 0| 8502495|Zürich Wollishofe...|47.3476976601166|8.53331248070737|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 11|\n", - "|1014.TA.26-70-A-j...|8591279| 11:55:00| 11:55:00| 4| 0| 0| 8591279| Zürich, Morgental|47.3439482343686|8.53014142775399|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 11|\n", - "|1014.TA.26-70-A-j...|8591268| 11:58:00| 11:58:00| 7| 0| 0| 8591268| Zürich, Manegg|47.3369660452942|8.52034979115572|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 11|\n", "|102.TA.26-765-j19...|8573211| 19:26:00| 19:26:00| 17| 0| 0| 8573211|Kloten, Zum Wilde...| 47.453545361717|8.58019555539209| 26-765-j19-1|Zürich Flughafen,...| 1648| 1| 19|\n", "|102.TA.26-765-j19...|8503700| 19:12:00| 19:13:00| 7| 0| 0| 8503700|Bassersdorf, Bahnhof|47.4387159099396|8.62613539902836| 26-765-j19-1|Zürich Flughafen,...| 1648| 1| 19|\n", "|102.TA.26-765-j19...|8590539| 19:03:00| 19:03:00| 1| 0| 0| 8590539|Dietlikon, Bahnho...|47.4219616816404|8.62080838939279| 26-765-j19-1|Zürich Flughafen,...| 1648| 1| 19|\n", "|102.TA.26-765-j19...|8576153| 19:23:00| 19:23:00| 15| 0| 0| 8576153| Kloten, Rankstrasse|47.4511884812805|8.59205331714412| 26-765-j19-1|Zürich Flughafen,...| 1648| 1| 19|\n", "|102.TA.26-765-j19...|8590503| 19:16:00| 19:16:00| 9| 0| 0| 8590503|Bassersdorf, Chlu...|47.4446275362315|8.62462622935083| 26-765-j19-1|Zürich Flughafen,...| 1648| 1| 19|\n", "+--------------------+-------+------------+--------------+-------------+-----------+-------------+---------------+--------------------+----------------+----------------+-------------+--------------------+---------------+------------+--------------+\n", "only showing top 20 rows" ] } ], "source": [ "stop_times_15km_business_week = stop_times_15km_business_week.withColumn(\"departure_hour\", stop_times_15km_business_week.departure_time.substr(0, 2).cast('int'))\n", "stop_times_15km_business_week.show()" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { 
"application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+-------+------------+--------------+-------------+-----------+-------------+---------------+--------------------+----------------+----------------+-------------+--------------------+---------------+------------+--------------+\n", "| trip_id|stop_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|stop_id_general| stop_name| stop_lat| stop_lon| route_id| trip_headsign|trip_short_name|direction_id|departure_hour|\n", "+--------------------+-------+------------+--------------+-------------+-----------+-------------+---------------+--------------------+----------------+----------------+-------------+--------------------+---------------+------------+--------------+\n", "|1.TA.26-925-j19-1...|8576080| 15:27:00| 15:27:00| 23| 0| 0| 8576080| Meilen, Bahnhof|47.2694401970586|8.64488323901054| 26-925-j19-1| Meilen, Bahnhof| 280| 0| 15|\n", "|1.TA.26-925-j19-1...|8576082| 15:22:00| 15:22:00| 22| 0| 0| 8576082| Meilen, Beugen|47.2672701430669|8.65071330520529| 26-925-j19-1| Meilen, Bahnhof| 280| 0| 15|\n", "|1014.TA.26-70-A-j...|8591304| 11:54:00| 11:54:00| 3| 0| 0| 8591304|Zürich, Post Woll...|47.3444717091534|8.53296213774651|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 11|\n", "|1014.TA.26-70-A-j...|8591278| 12:05:00| 12:05:00| 13| 0| 0| 8591278|Zürich, Mittellei...|47.3231389520848|8.51428616298707|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 12|\n", "|1014.TA.26-70-A-j...|8591106| 11:56:00| 11:56:00| 5| 0| 0| 8591106|Zürich, Butzenstr...|47.3414099167461|8.53031210765799|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 11|\n", "|1014.TA.26-70-A-j...|8591270| 12:02:00| 12:02:00| 10| 0| 0| 8591270| 
Zürich, Marbachweg|47.3303482449491|8.51537312448101|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 12|\n", "|1014.TA.26-70-A-j...|8591410| 11:57:00| 11:57:00| 6| 0| 0| 8591410|Zürich, Verenastr...|47.3408255385719|8.52538035674749|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 11|\n", + "|1014.TA.26-70-A-j...|8591279| 11:55:00| 11:55:00| 4| 0| 0| 8591279| Zürich, Morgental|47.3439482343686|8.53014142775399|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 11|\n", + "|1014.TA.26-70-A-j...|8591268| 11:58:00| 11:58:00| 7| 0| 0| 8591268| Zürich, Manegg|47.3369660452942|8.52034979115572|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 11|\n", "|1014.TA.26-70-A-j...|8591370| 12:04:00| 12:04:00| 12| 0| 0| 8591370|Zürich, Sihlweids...|47.3264149182794|8.51466345540645|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 12|\n", "|1014.TA.26-70-A-j...|8591154| 11:59:00| 11:59:00| 8| 0| 0| 8591154|Zürich, Frymannst...|47.3351336003511|8.51914604867483|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 11|\n", "|1014.TA.26-70-A-j...|8591210| 12:03:00| 12:03:00| 11| 0| 0| 8591210| Zürich, Im Hüsli|47.3282354882425|8.51269614493396|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 12|\n", "|1014.TA.26-70-A-j...|8591081| 11:53:00| 11:53:00| 2| 0| 0| 8591081|Zürich Wollishofe...|47.3470342259279| 8.5329172219823|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 11|\n", "|1014.TA.26-70-A-j...|8591061| 12:00:00| 12:01:00| 9| 0| 0| 8591061|Zürich Leimbach, ...|47.3332523864039|8.51859807635144|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 12|\n", "|1014.TA.26-70-A-j...|8502495| 11:51:00| 11:51:00| 1| 0| 0| 8502495|Zürich Wollishofe...|47.3476976601166|8.53331248070737|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 11|\n", - "|1014.TA.26-70-A-j...|8591279| 11:55:00| 11:55:00| 4| 0| 0| 8591279| Zürich, Morgental|47.3439482343686|8.53014142775399|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 11|\n", - "|1014.TA.26-70-A-j...|8591268| 11:58:00| 11:58:00| 7| 0| 0| 8591268| Zürich, 
Manegg|47.3369660452942|8.52034979115572|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 11|\n", "|102.TA.26-765-j19...|8573211| 19:26:00| 19:26:00| 17| 0| 0| 8573211|Kloten, Zum Wilde...| 47.453545361717|8.58019555539209| 26-765-j19-1|Zürich Flughafen,...| 1648| 1| 19|\n", "|102.TA.26-765-j19...|8503700| 19:12:00| 19:13:00| 7| 0| 0| 8503700|Bassersdorf, Bahnhof|47.4387159099396|8.62613539902836| 26-765-j19-1|Zürich Flughafen,...| 1648| 1| 19|\n", "|102.TA.26-765-j19...|8590539| 19:03:00| 19:03:00| 1| 0| 0| 8590539|Dietlikon, Bahnho...|47.4219616816404|8.62080838939279| 26-765-j19-1|Zürich Flughafen,...| 1648| 1| 19|\n", "|102.TA.26-765-j19...|8576153| 19:23:00| 19:23:00| 15| 0| 0| 8576153| Kloten, Rankstrasse|47.4511884812805|8.59205331714412| 26-765-j19-1|Zürich Flughafen,...| 1648| 1| 19|\n", "|102.TA.26-765-j19...|8590503| 19:16:00| 19:16:00| 9| 0| 0| 8590503|Bassersdorf, Chlu...|47.4446275362315|8.62462622935083| 26-765-j19-1|Zürich Flughafen,...| 1648| 1| 19|\n", "+--------------------+-------+------------+--------------+-------------+-----------+-------------+---------------+--------------------+----------------+----------------+-------------+--------------------+---------------+------------+--------------+\n", "only showing top 20 rows" ] } ], "source": [ "departure_earliest = 7\n", "departure_latest = 19\n", "stop_times_15km_business_week_standard_hours = stop_times_15km_business_week.filter((stop_times_15km_business_week.departure_hour>=departure_earliest) & \\\n", " (stop_times_15km_business_week.departure_hour<= departure_latest))\n", "stop_times_15km_business_week_standard_hours.show()" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": 
"display_data" } ], "source": [ "# intermediate saving point\n", "stop_times_15km_business_week_standard_hours.write.csv('data/lgpt_guys/stop_times_15km_business_week_standard_hours.csv', header = True, mode=\"overwrite\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 5) Order stop_times as to reconstruct routes for RAPTOR" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Building a list of unique trips according to 1) the stop sequence and 2) the departure time sequence" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "304085" ] } ], "source": [ "# we start fresh from here, where stop_times is in fact stop_times_15km_business_week_standard_hours loaded from the server\n", "stop_times = spark.read.csv('data/lgpt_guys/stop_times_15km_business_week_standard_hours.csv', header = True)\n", "stop_times.count()" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+-------+------------+--------------+-------------+-----------+-------------+---------------+--------------------+----------------+----------------+-------------+--------------------+---------------+------------+--------------+\n", "| 
trip_id|stop_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|stop_id_general| stop_name| stop_lat| stop_lon| route_id| trip_headsign|trip_short_name|direction_id|departure_hour|\n", "+--------------------+-------+------------+--------------+-------------+-----------+-------------+---------------+--------------------+----------------+----------------+-------------+--------------------+---------------+------------+--------------+\n", "|1.TA.26-925-j19-1...|8576080| 15:27:00| 15:27:00| 23| 0| 0| 8576080| Meilen, Bahnhof|47.2694401970586|8.64488323901054| 26-925-j19-1| Meilen, Bahnhof| 280| 0| 15|\n", "|1.TA.26-925-j19-1...|8576082| 15:22:00| 15:22:00| 22| 0| 0| 8576082| Meilen, Beugen|47.2672701430669|8.65071330520529| 26-925-j19-1| Meilen, Bahnhof| 280| 0| 15|\n", "|1014.TA.26-70-A-j...|8591304| 11:54:00| 11:54:00| 3| 0| 0| 8591304|Zürich, Post Woll...|47.3444717091534|8.53296213774651|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 11|\n", "|1014.TA.26-70-A-j...|8591278| 12:05:00| 12:05:00| 13| 0| 0| 8591278|Zürich, Mittellei...|47.3231389520848|8.51428616298707|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 12|\n", "|1014.TA.26-70-A-j...|8591106| 11:56:00| 11:56:00| 5| 0| 0| 8591106|Zürich, Butzenstr...|47.3414099167461|8.53031210765799|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 11|\n", "|1014.TA.26-70-A-j...|8591270| 12:02:00| 12:02:00| 10| 0| 0| 8591270| Zürich, Marbachweg|47.3303482449491|8.51537312448101|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 12|\n", "|1014.TA.26-70-A-j...|8591410| 11:57:00| 11:57:00| 6| 0| 0| 8591410|Zürich, Verenastr...|47.3408255385719|8.52538035674749|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 11|\n", + "|1014.TA.26-70-A-j...|8591279| 11:55:00| 11:55:00| 4| 0| 0| 8591279| Zürich, Morgental|47.3439482343686|8.53014142775399|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 11|\n", + "|1014.TA.26-70-A-j...|8591268| 11:58:00| 11:58:00| 7| 0| 0| 8591268| Zürich, 
Manegg|47.3369660452942|8.52034979115572|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 11|\n", "|1014.TA.26-70-A-j...|8591370| 12:04:00| 12:04:00| 12| 0| 0| 8591370|Zürich, Sihlweids...|47.3264149182794|8.51466345540645|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 12|\n", "|1014.TA.26-70-A-j...|8591154| 11:59:00| 11:59:00| 8| 0| 0| 8591154|Zürich, Frymannst...|47.3351336003511|8.51914604867483|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 11|\n", "|1014.TA.26-70-A-j...|8591210| 12:03:00| 12:03:00| 11| 0| 0| 8591210| Zürich, Im Hüsli|47.3282354882425|8.51269614493396|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 12|\n", "|1014.TA.26-70-A-j...|8591081| 11:53:00| 11:53:00| 2| 0| 0| 8591081|Zürich Wollishofe...|47.3470342259279| 8.5329172219823|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 11|\n", "|1014.TA.26-70-A-j...|8591061| 12:00:00| 12:01:00| 9| 0| 0| 8591061|Zürich Leimbach, ...|47.3332523864039|8.51859807635144|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 12|\n", "|1014.TA.26-70-A-j...|8502495| 11:51:00| 11:51:00| 1| 0| 0| 8502495|Zürich Wollishofe...|47.3476976601166|8.53331248070737|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 11|\n", - "|1014.TA.26-70-A-j...|8591279| 11:55:00| 11:55:00| 4| 0| 0| 8591279| Zürich, Morgental|47.3439482343686|8.53014142775399|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 11|\n", - "|1014.TA.26-70-A-j...|8591268| 11:58:00| 11:58:00| 7| 0| 0| 8591268| Zürich, Manegg|47.3369660452942|8.52034979115572|26-70-A-j19-1|Zürich, Mittellei...| 3400| 0| 11|\n", "|102.TA.26-765-j19...|8573211| 19:26:00| 19:26:00| 17| 0| 0| 8573211|Kloten, Zum Wilde...| 47.453545361717|8.58019555539209| 26-765-j19-1|Zürich Flughafen,...| 1648| 1| 19|\n", "|102.TA.26-765-j19...|8503700| 19:12:00| 19:13:00| 7| 0| 0| 8503700|Bassersdorf, Bahnhof|47.4387159099396|8.62613539902836| 26-765-j19-1|Zürich Flughafen,...| 1648| 1| 19|\n", "|102.TA.26-765-j19...|8590539| 19:03:00| 19:03:00| 1| 0| 0| 8590539|Dietlikon, Bahnho...|47.4219616816404|8.62080838939279| 
26-765-j19-1|Zürich Flughafen,...| 1648| 1| 19|\n", "|102.TA.26-765-j19...|8576153| 19:23:00| 19:23:00| 15| 0| 0| 8576153| Kloten, Rankstrasse|47.4511884812805|8.59205331714412| 26-765-j19-1|Zürich Flughafen,...| 1648| 1| 19|\n", "|102.TA.26-765-j19...|8590503| 19:16:00| 19:16:00| 9| 0| 0| 8590503|Bassersdorf, Chlu...|47.4446275362315|8.62462622935083| 26-765-j19-1|Zürich Flughafen,...| 1648| 1| 19|\n", "+--------------------+-------+------------+--------------+-------------+-----------+-------------+---------------+--------------------+----------------+----------------+-------------+--------------------+---------------+------------+--------------+\n", "only showing top 20 rows" ] } ], "source": [ "stop_times.show()" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+-------+------------+--------------+-------------+-----------+-------------+---------------+--------------------+----------------+----------------+-----------+--------------------+---------------+------------+--------------+\n", "| trip_id|stop_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|stop_id_general| stop_name| stop_lat| stop_lon| route_id| trip_headsign|trip_short_name|direction_id|departure_hour|\n", "+--------------------+-------+------------+--------------+-------------+-----------+-------------+---------------+--------------------+----------------+----------------+-----------+--------------------+---------------+------------+--------------+\n", "|1.TA.1-231-j19-1.1.H|8572747| 09:37:00| 09:37:00| 1| 0| 0| 8572747|Bremgarten AG, 
Ba...|47.3516902622456|8.34617544069354|1-231-j19-1| Jonen, Post| 23127| 0| 9|\n", "|1.TA.1-231-j19-1.1.H|8582462| 09:38:00| 09:38:00| 3| 0| 0| 8582462|Bremgarten AG, Ze...|47.3475576701104|8.34819665008309|1-231-j19-1| Jonen, Post| 23127| 0| 9|\n", "|1.TA.1-231-j19-1.1.H|8572600| 09:39:00| 09:39:00| 4| 0| 0| 8572600| Zufikon, Emaus| 47.34464822855| 8.3519875405826|1-231-j19-1| Jonen, Post| 23127| 0| 9|\n", "|1.TA.1-231-j19-1.1.H|8572601| 09:39:00| 09:39:00| 5| 0| 0| 8572601| Zufikon, Algier|47.3417386266265|8.35463757067112|1-231-j19-1| Jonen, Post| 23127| 0| 9|\n", "|1.TA.1-231-j19-1.1.H|8502553| 09:43:00| 09:43:00| 6| 0| 0| 8502553|Unterlunkhofen, B...|47.3221585583935| 8.380473118246|1-231-j19-1| Jonen, Post| 23127| 0| 9|\n", "|1.TA.1-231-j19-1.1.H|8572602| 09:45:00| 09:45:00| 7| 0| 0| 8572602|Oberlunkhofen, Ke...|47.3133646488037| 8.3889172819179|1-231-j19-1| Jonen, Post| 23127| 0| 9|\n", "|1.TA.1-231-j19-1.1.H|8502955| 09:46:00| 09:47:00| 8| 0| 0| 8502955| Oberlunkhofen, Post|47.3133829202162|8.38868371994399|1-231-j19-1| Jonen, Post| 23127| 0| 9|\n", "|1.TA.1-231-j19-1.1.H|8573722| 09:48:00| 09:48:00| 9| 0| 0| 8573722|Oberlunkhofen, Ob...|47.3123840737352|8.39276207133446|1-231-j19-1| Jonen, Post| 23127| 0| 9|\n", "|1.TA.1-231-j19-1.1.H|8573721| 09:50:00| 09:50:00| 10| 0| 0| 8573721|Oberlunkhofen, Wa...|47.3134255534873|8.39881671635027|1-231-j19-1| Jonen, Post| 23127| 0| 9|\n", "|1.TA.1-231-j19-1.1.H|8503598| 09:53:00| 09:53:00| 11| 0| 0| 8503598| Arni AG, Dorf|47.3183951391194| 8.4197115298618|1-231-j19-1| Jonen, Post| 23127| 0| 9|\n", "|1.TA.1-231-j19-1.1.H|8573720| 09:55:00| 09:59:00| 12| 0| 0| 8573720| Arni AG, Stockacker|47.3200332946963|8.42388869593354|1-231-j19-1| Jonen, Post| 23127| 0| 9|\n", "|1.TA.1-231-j19-1.1.H|8503598| 10:00:00| 10:00:00| 13| 0| 0| 8503598| Arni AG, Dorf|47.3183951391194| 8.4197115298618|1-231-j19-1| Jonen, Post| 23127| 0| 10|\n", "|1.TA.1-231-j19-1.1.H|8573721| 10:02:00| 10:02:00| 14| 0| 0| 8573721|Oberlunkhofen, 
Wa...|47.3134255534873|8.39881671635027|1-231-j19-1| Jonen, Post| 23127| 0| 10|\n", "|1.TA.1-231-j19-1.1.H|8573722| 10:03:00| 10:03:00| 15| 0| 0| 8573722|Oberlunkhofen, Ob...|47.3123840737352|8.39276207133446|1-231-j19-1| Jonen, Post| 23127| 0| 10|\n", "|1.TA.1-231-j19-1.1.H|8573723| 10:04:00| 10:04:00| 16| 0| 0| 8573723|Oberlunkhofen, Do...|47.3113973897738|8.39072289563923|1-231-j19-1| Jonen, Post| 23127| 0| 10|\n", "|1.TA.1-231-j19-1.1.H|8583071| 10:05:00| 10:05:00| 17| 0| 0| 8583071| Jonen, Radmühle|47.3019681473342|8.39299563330837|1-231-j19-1| Jonen, Post| 23127| 0| 10|\n", "|1.TA.1-231-j19-1.1.H|8572603| 10:06:00| 10:06:00| 18| 0| 0| 8572603| Jonen, Käppeli|47.2994704564101|8.39385801598124|1-231-j19-1| Jonen, Post| 23127| 0| 10|\n", "|1.TA.1-231-j19-1.1.H|8502879| 10:07:00| 10:07:00| 19| 0| 0| 8502879| Jonen, Post|47.2961806346557|8.39551091610425|1-231-j19-1| Jonen, Post| 23127| 0| 10|\n", "| 1.TA.1-44-j19-1.1.R|8590275| 08:31:00| 08:31:00| 1| 0| 0| 8590275| Spreitenbach, IKEA|47.4200714067302| 8.3754784852656| 1-44-j19-1|Spreitenbach, Sho...| 2001| 1| 8|\n", "| 1.TA.1-44-j19-1.1.R|8591891| 08:34:00| 08:34:00| 2| 0| 0| 8591891|Spreitenbach, Alt...|47.4188375250837|8.36858840703544| 1-44-j19-1|Spreitenbach, Sho...| 2001| 1| 8|\n", "+--------------------+-------+------------+--------------+-------------+-----------+-------------+---------------+--------------------+----------------+----------------+-----------+--------------------+---------------+------------+--------------+\n", "only showing top 20 rows" ] } ], "source": [ "stop_times = stop_times.sort(stop_times.trip_id, stop_times.stop_sequence.cast('int'))\n", "stop_times.show()" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, 
"metadata": {}, "output_type": "display_data" } ], "source": [ "from pyspark.sql.window import Window\n", "w= (\n", " Window.partitionBy(\"trip_id\")\n", " .orderBy(stop_times.stop_sequence.cast('int'))\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This step is a bit technical. We aim at identifying trips that are identical, although they may bear a different `trip_id`. Indeed, we used data from services running every day of a standard business week. But we do not take days of the week into account, only departure and arrival **hours**. Therefore, we must find a way to identify and merge identical trips in terms of stops served and arrival and departure times. \n", "\n", "To do so, we use window functions on each trip to build a stop sequence and a list of departure times. When departure times are identical, arrival times are considered identical." ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-------------------------+---------------+--------------+-------------+--------------+------------------------------------------------------------------------------------------------------------------------------+\n", "|trip_id |stop_id_general|departure_time|stop_sequence|departure_hour|all_stops |\n", "+-------------------------+---------------+--------------+-------------+--------------+------------------------------------------------------------------------------------------------------------------------------+\n", "|1005.TA.26-131-j19-1.9.H |8503855 |16:14:00 |1 |16 |[8503855] |\n", "|1005.TA.26-131-j19-1.9.H |8589111 |16:15:00 |2 |16 |[8503855, 8589111] |\n", 
"|1005.TA.26-131-j19-1.9.H |8573553 |16:16:00 |3 |16 |[8503855, 8589111, 8573553] |\n", "|1005.TA.26-131-j19-1.9.H |8573554 |16:18:00 |4 |16 |[8503855, 8589111, 8573553, 8573554] |\n", "|1005.TA.26-131-j19-1.9.H |8573555 |16:19:00 |5 |16 |[8503855, 8589111, 8573553, 8573554, 8573555] |\n", "|1005.TA.26-131-j19-1.9.H |8588985 |16:20:00 |6 |16 |[8503855, 8589111, 8573553, 8573554, 8573555, 8588985] |\n", "|1005.TA.26-131-j19-1.9.H |8588984 |16:21:00 |7 |16 |[8503855, 8589111, 8573553, 8573554, 8573555, 8588985, 8588984] |\n", "|103.TA.26-925-j19-1.4.H |8576082 |07:38:00 |20 |7 |[8576082] |\n", "|103.TA.26-925-j19-1.4.H |8576080 |07:42:00 |21 |7 |[8576082, 8576080] |\n", "|104.TA.26-733-j19-1.2.R |8573205 |07:33:00 |1 |7 |[8573205] |\n", "|104.TA.26-733-j19-1.2.R |8580301 |07:33:00 |2 |7 |[8573205, 8580301] |\n", "|104.TA.26-733-j19-1.2.R |8588553 |07:34:00 |3 |7 |[8573205, 8580301, 8588553] |\n", "|104.TA.26-733-j19-1.2.R |8573211 |07:36:00 |4 |7 |[8573205, 8580301, 8588553, 8573211] |\n", "|104.TA.26-733-j19-1.2.R |8590699 |07:37:00 |5 |7 |[8573205, 8580301, 8588553, 8573211, 8590699] |\n", "|104.TA.26-733-j19-1.2.R |8587420 |07:40:00 |6 |7 |[8573205, 8580301, 8588553, 8573211, 8590699, 8587420] |\n", "|104.TA.26-733-j19-1.2.R |8580434 |07:41:00 |7 |7 |[8573205, 8580301, 8588553, 8573211, 8590699, 8587420, 8580434] |\n", "|104.TA.26-733-j19-1.2.R |8576153 |07:42:00 |8 |7 |[8573205, 8580301, 8588553, 8573211, 8590699, 8587420, 8580434, 8576153] |\n", "|104.TA.26-733-j19-1.2.R |8580433 |07:44:00 |9 |7 |[8573205, 8580301, 8588553, 8573211, 8590699, 8587420, 8580434, 8576153, 8580433] |\n", "|1087.TA.26-5-B-j19-1.23.R|8591245 |15:42:00 |1 |15 |[8591245] |\n", "|1087.TA.26-5-B-j19-1.23.R|8591329 |15:43:00 |2 |15 |[8591245, 8591329] |\n", "|1087.TA.26-5-B-j19-1.23.R|8591366 |15:45:00 |3 |15 |[8591245, 8591329, 8591366] |\n", "|1087.TA.26-5-B-j19-1.23.R|8591415 |15:46:00 |4 |15 |[8591245, 8591329, 8591366, 8591415] |\n", "|1087.TA.26-5-B-j19-1.23.R|8591059 |15:47:00 |5 |15 
|[8591245, 8591329, 8591366, 8591415, 8591059] |\n", "|1087.TA.26-5-B-j19-1.23.R|8591058 |15:49:00 |6 |15 |[8591245, 8591329, 8591366, 8591415, 8591059, 8591058] |\n", "|1087.TA.26-5-B-j19-1.23.R|8591317 |15:51:00 |7 |15 |[8591245, 8591329, 8591366, 8591415, 8591059, 8591058, 8591317] |\n", "|1087.TA.26-5-B-j19-1.23.R|8591105 |15:53:00 |8 |15 |[8591245, 8591329, 8591366, 8591415, 8591059, 8591058, 8591317, 8591105] |\n", "|1087.TA.26-5-B-j19-1.23.R|8576193 |15:55:00 |9 |15 |[8591245, 8591329, 8591366, 8591415, 8591059, 8591058, 8591317, 8591105, 8576193] |\n", "|1087.TA.26-5-B-j19-1.23.R|8591239 |15:57:00 |10 |15 |[8591245, 8591329, 8591366, 8591415, 8591059, 8591058, 8591317, 8591105, 8576193, 8591239] |\n", "|1087.TA.26-5-B-j19-1.23.R|8591220 |15:58:00 |11 |15 |[8591245, 8591329, 8591366, 8591415, 8591059, 8591058, 8591317, 8591105, 8576193, 8591239, 8591220] |\n", "|1087.TA.26-5-B-j19-1.23.R|8591303 |16:00:00 |12 |16 |[8591245, 8591329, 8591366, 8591415, 8591059, 8591058, 8591317, 8591105, 8576193, 8591239, 8591220, 8591303] |\n", "|1087.TA.26-5-B-j19-1.23.R|8591412 |16:02:00 |13 |16 |[8591245, 8591329, 8591366, 8591415, 8591059, 8591058, 8591317, 8591105, 8576193, 8591239, 8591220, 8591303, 8591412] |\n", "|1087.TA.26-5-B-j19-1.23.R|8591230 |16:04:00 |14 |16 |[8591245, 8591329, 8591366, 8591415, 8591059, 8591058, 8591317, 8591105, 8576193, 8591239, 8591220, 8591303, 8591412, 8591230]|\n", "|109.TA.1-1-E-j19-1.12.R |8578679 |11:04:00 |1 |11 |[8578679] |\n", "|109.TA.1-1-E-j19-1.12.R |8590314 |11:05:00 |2 |11 |[8578679, 8590314] |\n", "|109.TA.1-1-E-j19-1.12.R |8590317 |11:06:00 |3 |11 |[8578679, 8590314, 8590317] |\n", "|109.TA.79-24-j19-1.1.R |8503500 |16:50:00 |1 |16 |[8503500] |\n", "|109.TA.79-24-j19-1.1.R |8503499 |16:53:00 |2 |16 |[8503500, 8503499] |\n", "|1099.TA.26-142-j19-1.2.R |8590815 |17:31:00 |1 |17 |[8590815] |\n", "|1099.TA.26-142-j19-1.2.R |8590817 |17:32:00 |2 |17 |[8590815, 8590817] |\n", "|1099.TA.26-142-j19-1.2.R |8590812 |17:33:00 |3 |17 
|[8590815, 8590817, 8590812] |\n", "|1099.TA.26-142-j19-1.2.R |8590825 |17:35:00 |4 |17 |[8590815, 8590817, 8590812, 8590825] |\n", "|1099.TA.26-142-j19-1.2.R |8590830 |17:36:00 |5 |17 |[8590815, 8590817, 8590812, 8590825, 8590830] |\n", "|1099.TA.26-142-j19-1.2.R |8590818 |17:37:00 |6 |17 |[8590815, 8590817, 8590812, 8590825, 8590830, 8590818] |\n", "|1099.TA.26-142-j19-1.2.R |8573167 |17:40:00 |7 |17 |[8590815, 8590817, 8590812, 8590825, 8590830, 8590818, 8573167] |\n", "|11.TA.1-444-j19-1.1.H |8572747 |18:35:00 |2 |18 |[8572747] |\n", "|11.TA.1-444-j19-1.1.H |8580847 |18:36:00 |3 |18 |[8572747, 8580847] |\n", "|11.TA.1-444-j19-1.1.H |8581346 |18:41:00 |4 |18 |[8572747, 8580847, 8581346] |\n", "|11.TA.1-444-j19-1.1.H |8502894 |18:42:00 |5 |18 |[8572747, 8580847, 8581346, 8502894] |\n", "|11.TA.1-444-j19-1.1.H |8502979 |18:43:00 |6 |18 |[8572747, 8580847, 8581346, 8502894, 8502979] |\n", "|11.TA.1-444-j19-1.1.H |8572596 |18:44:00 |7 |18 |[8572747, 8580847, 8581346, 8502894, 8502979, 8572596] |\n", "|11.TA.1-444-j19-1.1.H |8591365 |18:59:00 |8 |18 |[8572747, 8580847, 8581346, 8502894, 8502979, 8572596, 8591365] |\n", "|11.TA.1-444-j19-1.1.H |8591366 |19:01:00 |9 |19 |[8572747, 8580847, 8581346, 8502894, 8502979, 8572596, 8591365, 8591366] |\n", "|11.TA.1-444-j19-1.1.H |8591059 |19:03:00 |10 |19 |[8572747, 8580847, 8581346, 8502894, 8502979, 8572596, 8591365, 8591366, 8591059] |\n", "|111.TA.79-736-j19-1.5.H |8591031 |18:39:00 |1 |18 |[8591031] |\n", "|111.TA.79-736-j19-1.5.H |8588553 |18:41:00 |2 |18 |[8591031, 8588553] |\n", "|111.TA.79-736-j19-1.5.H |8580301 |18:42:00 |3 |18 |[8591031, 8588553, 8580301] |\n", "|111.TA.79-736-j19-1.5.H |8573205 |18:44:00 |4 |18 |[8591031, 8588553, 8580301, 8573205] |\n", "|111.TA.79-736-j19-1.5.H |8573213 |18:45:00 |5 |18 |[8591031, 8588553, 8580301, 8573205, 8573213] |\n", "|111.TA.79-736-j19-1.5.H |8587799 |18:46:00 |6 |18 |[8591031, 8588553, 8580301, 8573205, 8573213, 8587799] |\n", "|111.TA.79-736-j19-1.5.H |8591032 |18:49:00 
|7 |18 |[8591031, 8588553, 8580301, 8573205, 8573213, 8587799, 8591032] |\n", "|111.TA.79-736-j19-1.5.H |8593523 |18:51:00 |8 |18 |[8591031, 8588553, 8580301, 8573205, 8573213, 8587799, 8591032, 8593523] |\n", "|1139.TA.26-156-j19-1.4.R |8573167 |11:57:00 |1 |11 |[8573167] |\n", "|1139.TA.26-156-j19-1.4.R |8590824 |11:58:00 |2 |11 |[8573167, 8590824] |\n", "|1139.TA.26-156-j19-1.4.R |8590822 |11:59:00 |3 |11 |[8573167, 8590824, 8590822] |\n", "|1139.TA.26-156-j19-1.4.R |8590811 |12:00:00 |4 |12 |[8573167, 8590824, 8590822, 8590811] |\n", "|1139.TA.26-156-j19-1.4.R |8590826 |12:01:00 |5 |12 |[8573167, 8590824, 8590822, 8590811, 8590826] |\n", "|1139.TA.26-156-j19-1.4.R |8595406 |12:02:00 |6 |12 |[8573167, 8590824, 8590822, 8590811, 8590826, 8595406] |\n", "|1139.TA.26-156-j19-1.4.R |8590828 |12:03:00 |7 |12 |[8573167, 8590824, 8590822, 8590811, 8590826, 8595406, 8590828] |\n", "|1139.TA.26-156-j19-1.4.R |8590780 |12:04:00 |8 |12 |[8573167, 8590824, 8590822, 8590811, 8590826, 8595406, 8590828, 8590780] |\n", "|1139.TA.26-156-j19-1.4.R |8590779 |12:06:00 |9 |12 |[8573167, 8590824, 8590822, 8590811, 8590826, 8595406, 8590828, 8590780, 8590779] |\n", "|1139.TA.26-156-j19-1.4.R |8590775 |12:06:00 |10 |12 |[8573167, 8590824, 8590822, 8590811, 8590826, 8595406, 8590828, 8590780, 8590779, 8590775] |\n", "|1139.TA.26-156-j19-1.4.R |8590777 |12:07:00 |11 |12 |[8573167, 8590824, 8590822, 8590811, 8590826, 8595406, 8590828, 8590780, 8590779, 8590775, 8590777] |\n", "|1139.TA.26-156-j19-1.4.R |8590482 |12:09:00 |12 |12 |[8573167, 8590824, 8590822, 8590811, 8590826, 8595406, 8590828, 8590780, 8590779, 8590775, 8590777, 8590482] |\n", "|1139.TA.26-156-j19-1.4.R |8590464 |12:11:00 |13 |12 |[8573167, 8590824, 8590822, 8590811, 8590826, 8595406, 8590828, 8590780, 8590779, 8590775, 8590777, 8590482, 8590464] |\n", "|1141.TA.26-5-B-j19-1.23.R|8591245 |10:35:00 |1 |10 |[8591245] |\n", "|1141.TA.26-5-B-j19-1.23.R|8591329 |10:36:00 |2 |10 |[8591245, 8591329] |\n", 
"|1141.TA.26-5-B-j19-1.23.R|8591366 |10:37:00 |3 |10 |[8591245, 8591329, 8591366] |\n", "|1141.TA.26-5-B-j19-1.23.R|8591415 |10:39:00 |4 |10 |[8591245, 8591329, 8591366, 8591415] |\n", "|1141.TA.26-5-B-j19-1.23.R|8591059 |10:40:00 |5 |10 |[8591245, 8591329, 8591366, 8591415, 8591059] |\n", "|1141.TA.26-5-B-j19-1.23.R|8591058 |10:41:00 |6 |10 |[8591245, 8591329, 8591366, 8591415, 8591059, 8591058] |\n", "|1141.TA.26-5-B-j19-1.23.R|8591317 |10:43:00 |7 |10 |[8591245, 8591329, 8591366, 8591415, 8591059, 8591058, 8591317] |\n", "|1141.TA.26-5-B-j19-1.23.R|8591105 |10:45:00 |8 |10 |[8591245, 8591329, 8591366, 8591415, 8591059, 8591058, 8591317, 8591105] |\n", "|1141.TA.26-5-B-j19-1.23.R|8576193 |10:48:00 |9 |10 |[8591245, 8591329, 8591366, 8591415, 8591059, 8591058, 8591317, 8591105, 8576193] |\n", "|1141.TA.26-5-B-j19-1.23.R|8591239 |10:50:00 |10 |10 |[8591245, 8591329, 8591366, 8591415, 8591059, 8591058, 8591317, 8591105, 8576193, 8591239] |\n", "|1141.TA.26-5-B-j19-1.23.R|8591220 |10:51:00 |11 |10 |[8591245, 8591329, 8591366, 8591415, 8591059, 8591058, 8591317, 8591105, 8576193, 8591239, 8591220] |\n", "|1141.TA.26-5-B-j19-1.23.R|8591303 |10:53:00 |12 |10 |[8591245, 8591329, 8591366, 8591415, 8591059, 8591058, 8591317, 8591105, 8576193, 8591239, 8591220, 8591303] |\n", "|1141.TA.26-5-B-j19-1.23.R|8591412 |10:55:00 |13 |10 |[8591245, 8591329, 8591366, 8591415, 8591059, 8591058, 8591317, 8591105, 8576193, 8591239, 8591220, 8591303, 8591412] |\n", "|1141.TA.26-5-B-j19-1.23.R|8591230 |10:56:00 |14 |10 |[8591245, 8591329, 8591366, 8591415, 8591059, 8591058, 8591317, 8591105, 8576193, 8591239, 8591220, 8591303, 8591412, 8591230]|\n", "|1158.TA.26-69-j19-1.4.H |8591122 |14:30:00 |1 |14 |[8591122] |\n", "|1158.TA.26-69-j19-1.4.H |8591201 |14:31:00 |2 |14 |[8591122, 8591201] |\n", "|1158.TA.26-69-j19-1.4.H |8591213 |14:32:00 |3 |14 |[8591122, 8591201, 8591213] |\n", "|1158.TA.26-69-j19-1.4.H |8591416 |14:33:00 |4 |14 |[8591122, 8591201, 8591213, 8591416] |\n", 
"|1158.TA.26-69-j19-1.4.H |8591302 |14:34:00 |5 |14 |[8591122, 8591201, 8591213, 8591416, 8591302] |\n", "|1158.TA.26-69-j19-1.4.H |8591419 |14:35:00 |6 |14 |[8591122, 8591201, 8591213, 8591416, 8591302, 8591419] |\n", "|1158.TA.26-69-j19-1.4.H |8591425 |14:36:00 |7 |14 |[8591122, 8591201, 8591213, 8591416, 8591302, 8591419, 8591425] |\n", "|1158.TA.26-69-j19-1.4.H |8591101 |14:38:00 |8 |14 |[8591122, 8591201, 8591213, 8591416, 8591302, 8591419, 8591425, 8591101] |\n", "|1158.TA.26-69-j19-1.4.H |8591276 |14:41:00 |9 |14 |[8591122, 8591201, 8591213, 8591416, 8591302, 8591419, 8591425, 8591101, 8591276] |\n", "|1164.TA.26-5-B-j19-1.23.R|8591245 |19:58:00 |1 |19 |[8591245] |\n", "|1164.TA.26-5-B-j19-1.23.R|8591329 |19:59:00 |2 |19 |[8591245, 8591329] |\n", "|1186.TA.26-69-j19-1.4.H |8591122 |11:00:00 |1 |11 |[8591122] |\n", "+-------------------------+---------------+--------------+-------------+--------------+------------------------------------------------------------------------------------------------------------------------------+\n", "only showing top 100 rows" ] } ], "source": [ "# code from https://stackoverflow.com/questions/56763946/concat-multiple-string-rows-for-each-unique-id-by-a-particular-order\n", "from pyspark.sql import functions as F\n", "stop_times.withColumn(\"all_stops\",F.collect_list(\"stop_id_general\").over(w))\\\n", ".withColumn(\"all_departures\",F.collect_list(\"departure_time\").over(w))\\\n", ".select(F.col('trip_id'), F.col('stop_id_general'), F.col('departure_time'), \n", " F.col('stop_sequence'), F.col('departure_hour'), F.col('all_stops'))\\\n", ".show(100, 0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We successfully built incremental lists of departure times and stop_id sequences. We now need to select for the longest list for each `trip_id`. The `groupBy` line below does exactly that." 
] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-------------------------+------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+\n", "|trip_id |all_stops |all_departures |all_arrivals |\n", "+-------------------------+------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+\n", "|1005.TA.26-131-j19-1.9.H |[8503855, 8589111, 8573553, 8573554, 8573555, 8588985, 8588984] |[16:14:00, 16:15:00, 16:16:00, 16:18:00, 16:19:00, 16:20:00, 16:21:00] |[16:14:00, 16:15:00, 16:16:00, 16:18:00, 16:19:00, 16:20:00, 16:21:00] |\n", "|103.TA.26-925-j19-1.4.H |[8576082, 8576080] |[07:38:00, 07:42:00] |[07:38:00, 07:42:00] |\n", "|104.TA.26-733-j19-1.2.R |[8573205, 8580301, 8588553, 8573211, 8590699, 8587420, 8580434, 8576153, 8580433] |[07:33:00, 07:33:00, 07:34:00, 07:36:00, 07:37:00, 07:40:00, 07:41:00, 07:42:00, 07:44:00] |[07:33:00, 07:33:00, 07:34:00, 07:36:00, 07:37:00, 07:40:00, 07:41:00, 07:42:00, 
07:44:00] |\n", "|1087.TA.26-5-B-j19-1.23.R|[8591245, 8591329, 8591366, 8591415, 8591059, 8591058, 8591317, 8591105, 8576193, 8591239, 8591220, 8591303, 8591412, 8591230]|[15:42:00, 15:43:00, 15:45:00, 15:46:00, 15:47:00, 15:49:00, 15:51:00, 15:53:00, 15:55:00, 15:57:00, 15:58:00, 16:00:00, 16:02:00, 16:04:00]|[15:42:00, 15:43:00, 15:45:00, 15:46:00, 15:47:00, 15:49:00, 15:51:00, 15:53:00, 15:55:00, 15:57:00, 15:58:00, 16:00:00, 16:02:00, 16:04:00]|\n", "|109.TA.1-1-E-j19-1.12.R |[8578679, 8590314, 8590317] |[11:04:00, 11:05:00, 11:06:00] |[11:04:00, 11:05:00, 11:06:00] |\n", "|109.TA.79-24-j19-1.1.R |[8503500, 8503499] |[16:50:00, 16:53:00] |[16:50:00, 16:53:00] |\n", "|1099.TA.26-142-j19-1.2.R |[8590815, 8590817, 8590812, 8590825, 8590830, 8590818, 8573167] |[17:31:00, 17:32:00, 17:33:00, 17:35:00, 17:36:00, 17:37:00, 17:40:00] |[17:31:00, 17:32:00, 17:33:00, 17:35:00, 17:36:00, 17:37:00, 17:40:00] |\n", "|11.TA.1-444-j19-1.1.H |[8572747, 8580847, 8581346, 8502894, 8502979, 8572596, 8591365, 8591366, 8591059] |[18:35:00, 18:36:00, 18:41:00, 18:42:00, 18:43:00, 18:44:00, 18:59:00, 19:01:00, 19:03:00] |[18:35:00, 18:36:00, 18:41:00, 18:42:00, 18:43:00, 18:44:00, 18:59:00, 19:01:00, 19:03:00] |\n", "|111.TA.79-736-j19-1.5.H |[8591031, 8588553, 8580301, 8573205, 8573213, 8587799, 8591032, 8593523] |[18:39:00, 18:41:00, 18:42:00, 18:44:00, 18:45:00, 18:46:00, 18:49:00, 18:51:00] |[18:39:00, 18:41:00, 18:42:00, 18:43:00, 18:45:00, 18:46:00, 18:49:00, 18:51:00] |\n", "|1139.TA.26-156-j19-1.4.R |[8573167, 8590824, 8590822, 8590811, 8590826, 8595406, 8590828, 8590780, 8590779, 8590775, 8590777, 8590482, 8590464] |[11:57:00, 11:58:00, 11:59:00, 12:00:00, 12:01:00, 12:02:00, 12:03:00, 12:04:00, 12:06:00, 12:06:00, 12:07:00, 12:09:00, 12:11:00] |[11:57:00, 11:58:00, 11:59:00, 12:00:00, 12:01:00, 12:02:00, 12:03:00, 12:04:00, 12:06:00, 12:06:00, 12:07:00, 12:09:00, 12:11:00] |\n", 
"+-------------------------+------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+\n", "only showing top 10 rows" ] } ], "source": [ "trips_with_duplicates= stop_times.withColumn(\"all_stops\",F.collect_list(\"stop_id_general\").over(w))\\\n", ".withColumn(\"all_departures\",F.collect_list(\"departure_time\").over(w))\\\n", ".withColumn(\"all_arrivals\",F.collect_list(\"arrival_time\").over(w))\\\n", ".groupBy(\"trip_id\")\\\n", ".agg(F.max(\"all_stops\").alias(\"all_stops\"), F.max(\"all_departures\").alias(\"all_departures\"), F.max(\"all_arrivals\").alias(\"all_arrivals\"))\\\n", "\n", "trips_with_duplicates.show(10, 0)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "25127" ] } ], "source": [ "trips_with_duplicates.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Are there many trips with a single stop ?" 
] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+---------+--------------+------------+----------+\n", "| trip_id|all_stops|all_departures|all_arrivals|stop_count|\n", "+--------------------+---------+--------------+------------+----------+\n", "|366.TA.11-3-j19-1...|[8503000]| [15:23:00]| [15:23:00]| 1|\n", "|457.TA.26-24-j19-...|[8502208]| [16:45:00]| [16:45:00]| 1|\n", "|99.TA.1-321-j19-1...|[8502750]| [16:07:00]| [16:07:00]| 1|\n", "|31.TA.80-158-Y-j1...|[8503000]| [14:38:00]| [14:38:00]| 1|\n", "|423.TA.1-36-j19-1...|[8503000]| [09:36:00]| [09:36:00]| 1|\n", "|808.TA.26-24-j19-...|[8502208]| [17:15:00]| [17:15:00]| 1|\n", "|1.TA.20-E03-j19-1...|[8596126]| [19:15:00]| [19:10:00]| 1|\n", "|103.TA.1-321-j19-...|[8502750]| [18:07:00]| [18:07:00]| 1|\n", "|123.TA.1-321-j19-...|[8502750]| [12:07:00]| [12:07:00]| 1|\n", "|141.TA.20-2-j19-1...|[8503000]| [08:10:00]| [08:10:00]| 1|\n", "|17.TA.17-4-j19-1.5.H|[8503000]| [12:37:00]| [12:37:00]| 1|\n", "|17.TA.80-158-Y-j1...|[8503000]| [10:38:00]| [10:38:00]| 1|\n", "|346.TA.1-37-j19-1...|[8503000]| [11:08:00]| [11:08:00]| 1|\n", "|399.TA.11-3-j19-1...|[8503000]| [11:37:00]| [11:37:00]| 1|\n", "| 4.TA.17-4-j19-1.4.H|[8503000]| [18:37:00]| [18:37:00]| 1|\n", "|450.TA.1-16-j19-1...|[8503000]| [15:54:00]| [15:54:00]| 1|\n", "|611.TA.26-8-A-j19...|[8503204]| [07:00:00]| [06:59:00]| 1|\n", "|178.TA.20-2-j19-1...|[8503000]| [19:18:00]| [19:18:00]| 1|\n", "|410.TA.26-24-j19-...|[8502208]| [10:45:00]| [10:45:00]| 1|\n", "|1.TA.57-2-Y-j19-1...|[8503000]| [09:34:00]| [09:34:00]| 1|\n", 
"+--------------------+---------+--------------+------------+----------+\n", "only showing top 20 rows" ] } ], "source": [ "from pyspark.sql.types import IntegerType\n", "slen = udf(lambda s: len(s), IntegerType())\n", "\n", "trips_with_duplicates.withColumn(\"stop_count\", slen(trips_with_duplicates.all_stops)).filter(F.col('stop_count')==1).show()" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "714" ] } ], "source": [ "trips_with_duplicates.withColumn(\"stop_count\", slen(trips_with_duplicates.all_stops)).filter(F.col('stop_count')==1).count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "How many trips share exactly the same departure **and** arrival times at all stops ?" 
] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+--------------------+--------------------+--------------------+\n", "| trip_id| all_stops| all_departures| all_arrivals|\n", "+--------------------+--------------------+--------------------+--------------------+\n", "|382.TA.11-3-j19-1...| [8503000]| [15:53:00]| [15:53:00]|\n", "|87.TA.6-8-j19-1.64.R| [8503016, 8503000]|[08:46:00, 09:02:00]|[08:44:00, 08:55:00]|\n", "|32.TA.79-10-B-j19...|[8503054, 8503053...|[17:57:00, 17:59:...|[17:57:00, 17:59:...|\n", "|448.TA.26-LAF-j19...| [8503082, 8503081]|[19:20:00, 19:25:00]|[19:20:00, 19:25:00]|\n", "|294.TA.26-5-A-j19...|[8503125, 8503003...|[17:21:00, 17:32:...|[17:20:00, 17:32:...|\n", "|996.TA.26-12-j19-...|[8503147, 8503003...|[14:34:00, 14:39:...|[14:33:00, 14:38:...|\n", "|10.TA.30-170-Y-j1...|[8503202, 8502209...|[10:30:00, 10:35:...|[10:30:00, 10:35:...|\n", "|43.TA.26-2-j19-1....|[8503204, 8503202...|[09:24:00, 09:29:...|[09:23:00, 09:28:...|\n", "|580.TA.26-8-A-j19...|[8503204, 8503203...|[18:00:00, 18:02:...|[17:59:00, 18:02:...|\n", "|585.TA.26-24-j19-...|[8503305, 8503307...|[17:53:00, 17:57:...|[17:52:00, 17:57:...|\n", "|246.TA.26-9-A-j19...|[8503313, 8503312...|[10:02:00, 10:05:...|[10:02:00, 10:05:...|\n", "|158.TA.26-9-A-j19...|[8503313, 8503312...|[15:32:00, 15:35:...|[15:32:00, 15:35:...|\n", "|137.TA.26-15-j19-...|[8503316, 8503315...|[10:11:00, 10:14:...|[10:11:00, 10:14:...|\n", "|293.TA.26-640-j19...|[8503382, 8594339...|[08:26:00, 08:27:...|[08:26:00, 08:27:...|\n", "|278.TA.79-24-j19-...| [8503499, 8503500]|[12:25:00, 12:28:00]|[12:25:00, 
12:28:00]|\n", "|127.TA.1-17-A-j19...|[8503508, 8517376...|[14:33:00, 14:34:...|[14:33:00, 14:34:...|\n", "|1859.TA.26-9-B-j1...|[8503610, 8580912...|[08:29:00, 08:30:...|[08:29:00, 08:30:...|\n", "|476.TA.26-768-j19...|[8573205, 8573213...|[14:37:00, 14:38:...|[14:37:00, 14:38:...|\n", "|40.TA.26-733-j19-...|[8573205, 8580301...|[14:33:00, 14:33:...|[14:33:00, 14:33:...|\n", "|120.TA.26-731-j19...|[8573205, 8580301...|[12:03:00, 12:03:...|[12:03:00, 12:03:...|\n", "+--------------------+--------------------+--------------------+--------------------+\n", "only showing top 20 rows" ] } ], "source": [ "trips_with_duplicates.dropDuplicates(['all_stops', 'all_departures', 'all_arrivals']).show()" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "20086" ] } ], "source": [ "trips_with_duplicates.dropDuplicates(['all_stops', 'all_departures', 'all_arrivals']).count()" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "20041" ] } ], "source": [ "trips_with_duplicates.dropDuplicates(['all_stops', 'all_departures']).count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "There seem to be a fraction of trips (less 0.2%) that share the exact same departure times at all stops, but not the same arrival times 
at all stops. To be on the safe side, we define identical trips based on the sequence of stops and the sequence of departure times.\n", "\n", "All in all, we remove ~5000 duplicated trips from all trips." ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "trips_unique = trips_with_duplicates.dropDuplicates(['all_stops', 'all_departures'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Lastly, we remove trips that only serve a single stop (most likely due to the pruning of stops ouside the 15km radius of Zürich HB)" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# removing trips with a single stop only:\n", "trips_unique = trips_unique.withColumn(\"stop_count\", slen(trips_with_duplicates.all_stops)).filter(F.col('stop_count')>1)" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+--------------------+--------------------+--------------------+----------+\n", "| trip_id| 
all_stops| all_departures| all_arrivals|stop_count|\n", "+--------------------+--------------------+--------------------+--------------------+----------+\n", "|168.TA.1-17-A-j19...|[8502273, 8517377...|[17:51:00, 17:52:...|[17:51:00, 17:52:...| 7|\n", "|15.TA.80-53-Y-j19...| [8503000, 8503202]|[09:12:00, 09:21:00]|[09:12:00, 09:21:00]| 2|\n", "|80.TA.16-5-j19-1....|[8503016, 8503006...|[19:09:00, 19:15:...|[19:07:00, 19:13:...| 3|\n", "|136.TA.26-10-B-j1...|[8503057, 8503056...|[12:54:00, 12:59:...|[12:54:00, 12:59:...| 8|\n", "|73.TA.26-4-B-j19-...|[8503088, 8503090...|[11:38:00, 11:39:...|[11:38:00, 11:39:...| 12|\n", "|55.TA.26-7-A-j19-...|[8503104, 8503003...|[14:03:00, 14:15:...|[14:02:00, 14:14:...| 11|\n", "|551.TA.26-11-j19-...|[8503147, 8503003...|[18:20:00, 18:25:...|[18:20:00, 18:25:...| 8|\n", "|216.TA.26-24-j19-...|[8503204, 8503203...|[19:15:00, 19:17:...|[19:15:00, 19:17:...| 12|\n", "|154.TA.26-9-A-j19...|[8503313, 8503312...|[13:32:00, 13:35:...|[13:32:00, 13:35:...| 13|\n", "|460.TA.79-24-j19-...| [8503499, 8503500]|[16:25:00, 16:28:00]|[16:25:00, 16:28:00]| 2|\n", "|159.TA.79-24-j19-...| [8503500, 8503499]|[12:40:00, 12:43:00]|[12:40:00, 12:43:00]| 2|\n", "|99.TA.79-24-j19-1...| [8503500, 8503499]|[17:40:00, 17:43:00]|[17:40:00, 17:43:00]| 2|\n", "|80.TA.26-36-j19-1...|[8503508, 8503001...|[11:38:00, 11:44:...|[11:36:00, 11:43:...| 5|\n", "|123.TA.1-350-j19-...|[8503610, 8573711...|[07:41:00, 07:43:...|[07:41:00, 07:43:...| 19|\n", "|4.TA.26-769-j19-1...|[8503700, 8576161...|[17:35:00, 17:37:...|[17:35:00, 17:37:...| 10|\n", "|281.TA.26-660-j19...|[8503700, 8588316...|[09:15:00, 09:16:...|[09:15:00, 09:16:...| 14|\n", - "|269.TA.26-660-j19...|[8503700, 8588316...|[09:45:00, 09:46:...|[09:45:00, 09:46:...| 14|\n", + "|279.TA.26-660-j19...|[8503700, 8588316...|[09:45:00, 09:46:...|[09:45:00, 09:46:...| 14|\n", "|983.TA.26-136-j19...|[8503855, 8594182...|[07:50:00, 07:51:...|[07:50:00, 07:51:...| 6|\n", "|7.TA.90-71-Y-j19-...|[8530645, 
8530646...|[14:07:00, 14:15:...|[14:07:00, 14:11:...| 3|\n", "|138.TA.1-350-j19-...|[8572560, 8572599...|[17:49:00, 17:50:...|[17:49:00, 17:50:...| 20|\n", "+--------------------+--------------------+--------------------+--------------------+----------+\n", "only showing top 20 rows" ] } ], "source": [ "trips_unique.show()" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "19614" ] } ], "source": [ "trips_unique.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 6) building routes\n", "\n", "- 6) building routes based on unique trips\n", " - order unique_trips by stop_sequence, earliest departure time\n", " - each window with the same stop_sequence gets a unique routeID\n", " \n", "- 7) generate a RAPTOR compatible stop_times\n", " - filter with unique_trips\n", " - sort by routeID, earliest departure time\n", " \n", "We start by getting the first departure time for each unique trip, to be able to order them by route and first departure time for RAPTOR's `stopTimes` data structure." 
] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+--------------------+--------------------+--------------------+----------+--------------------+\n", "| trip_id| all_stops| all_departures| all_arrivals|stop_count|departure_first_stop|\n", "+--------------------+--------------------+--------------------+--------------------+----------+--------------------+\n", "|168.TA.1-17-A-j19...|[8502273, 8517377...|[17:51:00, 17:52:...|[17:51:00, 17:52:...| 7| 17:51:00|\n", "|15.TA.80-53-Y-j19...| [8503000, 8503202]|[09:12:00, 09:21:00]|[09:12:00, 09:21:00]| 2| 09:12:00|\n", "|80.TA.16-5-j19-1....|[8503016, 8503006...|[19:09:00, 19:15:...|[19:07:00, 19:13:...| 3| 19:09:00|\n", "|136.TA.26-10-B-j1...|[8503057, 8503056...|[12:54:00, 12:59:...|[12:54:00, 12:59:...| 8| 12:54:00|\n", "|73.TA.26-4-B-j19-...|[8503088, 8503090...|[11:38:00, 11:39:...|[11:38:00, 11:39:...| 12| 11:38:00|\n", "|55.TA.26-7-A-j19-...|[8503104, 8503003...|[14:03:00, 14:15:...|[14:02:00, 14:14:...| 11| 14:03:00|\n", "|551.TA.26-11-j19-...|[8503147, 8503003...|[18:20:00, 18:25:...|[18:20:00, 18:25:...| 8| 18:20:00|\n", "|216.TA.26-24-j19-...|[8503204, 8503203...|[19:15:00, 19:17:...|[19:15:00, 19:17:...| 12| 19:15:00|\n", "|154.TA.26-9-A-j19...|[8503313, 8503312...|[13:32:00, 13:35:...|[13:32:00, 13:35:...| 13| 13:32:00|\n", "|460.TA.79-24-j19-...| [8503499, 8503500]|[16:25:00, 16:28:00]|[16:25:00, 16:28:00]| 2| 16:25:00|\n", "|159.TA.79-24-j19-...| [8503500, 8503499]|[12:40:00, 12:43:00]|[12:40:00, 12:43:00]| 2| 12:40:00|\n", "|99.TA.79-24-j19-1...| [8503500, 8503499]|[17:40:00, 17:43:00]|[17:40:00, 
17:43:00]| 2| 17:40:00|\n", "|80.TA.26-36-j19-1...|[8503508, 8503001...|[11:38:00, 11:44:...|[11:36:00, 11:43:...| 5| 11:38:00|\n", "|123.TA.1-350-j19-...|[8503610, 8573711...|[07:41:00, 07:43:...|[07:41:00, 07:43:...| 19| 07:41:00|\n", "|4.TA.26-769-j19-1...|[8503700, 8576161...|[17:35:00, 17:37:...|[17:35:00, 17:37:...| 10| 17:35:00|\n", "|281.TA.26-660-j19...|[8503700, 8588316...|[09:15:00, 09:16:...|[09:15:00, 09:16:...| 14| 09:15:00|\n", "|269.TA.26-660-j19...|[8503700, 8588316...|[09:45:00, 09:46:...|[09:45:00, 09:46:...| 14| 09:45:00|\n", - "|983.TA.26-136-j19...|[8503855, 8594182...|[07:50:00, 07:51:...|[07:50:00, 07:51:...| 6| 07:50:00|\n", + "|982.TA.26-136-j19...|[8503855, 8594182...|[07:50:00, 07:51:...|[07:50:00, 07:51:...| 6| 07:50:00|\n", "|7.TA.90-71-Y-j19-...|[8530645, 8530646...|[14:07:00, 14:15:...|[14:07:00, 14:11:...| 3| 14:07:00|\n", "|138.TA.1-350-j19-...|[8572560, 8572599...|[17:49:00, 17:50:...|[17:49:00, 17:50:...| 20| 17:49:00|\n", "+--------------------+--------------------+--------------------+--------------------+----------+--------------------+\n", "only showing top 20 rows" ] } ], "source": [ "# code from https://stackoverflow.com/questions/52975567/get-first-n-elements-from-dataframe-arraytype-column-in-pyspark\n", "\n", "trips_unique = trips_unique.withColumn('departure_first_stop', F.col(\"all_departures\")[0])\n", "trips_unique.show()" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+--------------------+--------------------+--------------------+----------+--------------------+\n", "| trip_id| all_stops| all_departures| 
all_arrivals|stop_count|departure_first_stop|\n", "+--------------------+--------------------+--------------------+--------------------+----------+--------------------+\n", "|203.TA.1-17-A-j19...|[8502187, 8502277...|[07:01:00, 07:02:...|[07:01:00, 07:02:...| 7| 07:01:00|\n", "|4.TA.30-57-Y-j19-...|[8502208, 8502209...|[07:18:00, 07:23:...|[07:18:00, 07:23:...| 3| 07:18:00|\n", "|5.TA.30-57-Y-j19-...|[8502208, 8502209...|[07:48:00, 07:53:...|[07:48:00, 07:53:...| 3| 07:48:00|\n", "|6.TA.30-57-Y-j19-...|[8502208, 8502209...|[08:18:00, 08:23:...|[08:18:00, 08:23:...| 3| 08:18:00|\n", "|7.TA.30-57-Y-j19-...|[8502208, 8502209...|[08:48:00, 08:53:...|[08:48:00, 08:53:...| 3| 08:48:00|\n", "|8.TA.30-57-Y-j19-...|[8502208, 8502209...|[09:18:00, 09:23:...|[09:18:00, 09:23:...| 3| 09:18:00|\n", "|9.TA.30-57-Y-j19-...|[8502208, 8502209...|[09:48:00, 09:53:...|[09:48:00, 09:53:...| 3| 09:48:00|\n", "|10.TA.30-57-Y-j19...|[8502208, 8502209...|[10:18:00, 10:23:...|[10:18:00, 10:23:...| 3| 10:18:00|\n", "|11.TA.30-57-Y-j19...|[8502208, 8502209...|[10:48:00, 10:53:...|[10:48:00, 10:53:...| 3| 10:48:00|\n", "|12.TA.30-57-Y-j19...|[8502208, 8502209...|[11:18:00, 11:23:...|[11:18:00, 11:23:...| 3| 11:18:00|\n", "|13.TA.30-57-Y-j19...|[8502208, 8502209...|[11:48:00, 11:53:...|[11:48:00, 11:53:...| 3| 11:48:00|\n", "|14.TA.30-57-Y-j19...|[8502208, 8502209...|[12:18:00, 12:23:...|[12:18:00, 12:23:...| 3| 12:18:00|\n", "|15.TA.30-57-Y-j19...|[8502208, 8502209...|[12:48:00, 12:53:...|[12:48:00, 12:53:...| 3| 12:48:00|\n", "|16.TA.30-57-Y-j19...|[8502208, 8502209...|[13:18:00, 13:23:...|[13:18:00, 13:23:...| 3| 13:18:00|\n", "|17.TA.30-57-Y-j19...|[8502208, 8502209...|[13:48:00, 13:53:...|[13:48:00, 13:53:...| 3| 13:48:00|\n", "|18.TA.30-57-Y-j19...|[8502208, 8502209...|[14:18:00, 14:23:...|[14:18:00, 14:23:...| 3| 14:18:00|\n", "|19.TA.30-57-Y-j19...|[8502208, 8502209...|[14:48:00, 14:53:...|[14:48:00, 14:53:...| 3| 14:48:00|\n", "|20.TA.30-57-Y-j19...|[8502208, 8502209...|[15:18:00, 
15:23:...|[15:18:00, 15:23:...| 3| 15:18:00|\n", "|21.TA.30-57-Y-j19...|[8502208, 8502209...|[15:48:00, 15:53:...|[15:48:00, 15:53:...| 3| 15:48:00|\n", "|22.TA.30-57-Y-j19...|[8502208, 8502209...|[16:18:00, 16:23:...|[16:18:00, 16:23:...| 3| 16:18:00|\n", "+--------------------+--------------------+--------------------+--------------------+----------+--------------------+\n", "only showing top 20 rows" ] } ], "source": [ "#ordering by stop_sequence (arbitrary order) and departure at the first stop (ascending)\n", "trips_unique = trips_unique.sort(trips_unique.all_stops, trips_unique.departure_first_stop)\n", "trips_unique.show()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In RAPTOR, routes are defined as collections of unique trips serving the same stop sequences at different times. Therefore, there is one route per sequence of stops, i.e unique entry in column `all_stops`. However, there is no specific rule to order routes depending on the stops they serve. We simply subset unique sequences of stops and index them from 0 to n-1 routes." 
] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+\n", "| all_stops|\n", "+--------------------+\n", - "| [8573205, 8588553]|\n", - "|[8576240, 8591353...|\n", "|[8591049, 8591128...|\n", "|[8591057, 8591402...|\n", - "|[8591061, 8591270...|\n", "| [8591281, 8591046]|\n", "|[8591825, 8590504...|\n", + "| [8573205, 8588553]|\n", + "|[8576240, 8591353...|\n", + "|[8591061, 8591270...|\n", "|[8575921, 8575920...|\n", "|[8591035, 8591134...|\n", "|[8595129, 8590543...|\n", + "|[8576127, 8576139...|\n", "|[8503010, 8503011...|\n", "| [8575927, 8594339]|\n", - "|[8576127, 8576139...|\n", "|[8591031, 8588553...|\n", - "|[8502208, 8502209...|\n", "|[8503674, 8503659...|\n", "|[8576171, 8576172...|\n", "|[8576276, 8576277...|\n", "|[8590805, 8590794...|\n", "|[8591110, 8591306...|\n", + "|[8502208, 8502209...|\n", "+--------------------+\n", "only showing top 20 rows" ] } ], "source": [ "routes = trips_unique.select(trips_unique.all_stops).distinct()\n", "routes.show()" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#building an index from 0 to n_routes\n", "# code from https://stackoverflow.com/questions/39057766/spark-equivelant-of-zipwithindex-in-dataframe\n", "from pyspark.sql.types import StructType, StructField, LongType\n", 
"def dfZipWithIndex (df, offset=0, colName=\"rowId\"):\n", " '''\n", " Enumerates dataframe rows is native order, like rdd.ZipWithIndex(), but on a dataframe \n", " and preserves a schema\n", "\n", " :param df: source dataframe\n", " :param offset: adjustment to zipWithIndex()'s index\n", " :param colName: name of the index column\n", " '''\n", "\n", " new_schema = StructType(\n", " [StructField(colName,LongType(),True)] # new added field in front\n", " + df.schema.fields # previous schema\n", " )\n", "\n", " zipped_rdd = df.rdd.zipWithIndex()\n", "\n", " new_rdd = zipped_rdd.map(lambda args: ([args[1] + offset] + list(args[0])))\n", "\n", " return spark.createDataFrame(new_rdd, new_schema)" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+---------+--------------------+\n", "|route_int| all_stops|\n", "+---------+--------------------+\n", - "| 0| [8573205, 8588553]|\n", - "| 1|[8576240, 8591353...|\n", - "| 2|[8591049, 8591128...|\n", - "| 3|[8591057, 8591402...|\n", - "| 4|[8591061, 8591270...|\n", - "| 5| [8591281, 8591046]|\n", - "| 6|[8591825, 8590504...|\n", + "| 0|[8576240, 8591353...|\n", + "| 1|[8591049, 8591128...|\n", + "| 2|[8591057, 8591402...|\n", + "| 3| [8591281, 8591046]|\n", + "| 4|[8591825, 8590504...|\n", + "| 5| [8573205, 8588553]|\n", + "| 6|[8591061, 8591270...|\n", "| 7|[8575921, 8575920...|\n", - "| 8|[8591035, 8591134...|\n", - "| 9|[8595129, 8590543...|\n", + "| 8|[8595129, 8590543...|\n", + "| 9|[8591035, 8591134...|\n", "| 10|[8503010, 8503011...|\n", - "| 11| [8575927, 8594339]|\n", - "| 12|[8576127, 8576139...|\n", - "| 13|[8591031, 8588553...|\n", + "| 
11|[8591031, 8588553...|\n", + "| 12| [8575927, 8594339]|\n", + "| 13|[8576127, 8576139...|\n", "| 14|[8502208, 8502209...|\n", "| 15|[8503674, 8503659...|\n", "| 16|[8576171, 8576172...|\n", - "| 17|[8576276, 8576277...|\n", - "| 18|[8590805, 8590794...|\n", - "| 19|[8591110, 8591306...|\n", + "| 17|[8590805, 8590794...|\n", + "| 18|[8591110, 8591306...|\n", + "| 19|[8576276, 8576277...|\n", "+---------+--------------------+\n", "only showing top 20 rows" ] } ], "source": [ "routes_indexed = dfZipWithIndex(routes, 0, 'route_int')\n", "routes_indexed.show()" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+--------------------+--------------------+--------------------+----------+--------------------+---------+\n", "| all_stops| trip_id| all_departures| all_arrivals|stop_count|departure_first_stop|route_int|\n", "+--------------------+--------------------+--------------------+--------------------+----------+--------------------+---------+\n", - "|[8503064, 8503065...|483.TA.26-18-j19-...|[10:41:00, 10:45:...|[10:41:00, 10:45:...| 14| 10:41:00| 41|\n", - "|[8591355, 8591354...|1107.TA.26-75-A-j...|[15:59:00, 16:00:...|[15:59:00, 16:00:...| 21| 15:59:00| 80|\n", - "|[8595899, 8591206...|110.TA.26-40-j19-...|[08:23:00, 08:23:...|[08:23:00, 08:23:...| 8| 08:23:00| 87|\n", - "|[8591276, 8591101...|613.TA.26-69-j19-...|[18:20:00, 18:23:...|[18:20:00, 18:23:...| 9| 18:20:00| 105|\n", - "|[8591401, 8503610...|1230.TA.26-80-j19...|[16:03:00, 16:04:...|[16:03:00, 16:04:...| 29| 16:03:00| 110|\n", - "|[8591401, 8503610...|1316.TA.26-80-j19...|[13:22:00, 13:23:...|[13:22:00, 
13:23:...| 29| 13:22:00| 110|\n", - "|[8591401, 8503610...|1358.TA.26-80-j19...|[08:30:00, 08:31:...|[08:30:00, 08:31:...| 29| 08:30:00| 110|\n", - "|[8502208, 8502209...|677.TA.26-24-j19-...|[15:15:00, 15:17:...|[15:15:00, 15:17:...| 14| 15:15:00| 111|\n", - "|[8503150, 8576140...|107.TA.26-726-j19...|[19:02:00, 19:02:...|[19:02:00, 19:02:...| 11| 19:02:00| 122|\n", + "|[8591825, 8590504...|117.TA.26-703-j19...|[18:04:00, 18:05:...|[18:04:00, 18:05:...| 9| 18:04:00| 4|\n", + "|[8591355, 8591354...|1232.TA.26-75-A-j...|[17:06:00, 17:07:...|[17:06:00, 17:07:...| 21| 17:06:00| 78|\n", + "|[8591355, 8591354...|1135.TA.26-75-A-j...|[12:59:00, 13:00:...|[12:59:00, 13:00:...| 21| 12:59:00| 78|\n", + "|[8591401, 8503610...|1269.TA.26-80-j19...|[19:22:00, 19:23:...|[19:22:00, 19:23:...| 29| 19:22:00| 108|\n", "|[8580449, 8591063...|1892.TA.26-781-j1...|[15:17:00, 15:18:...|[15:17:00, 15:18:...| 12| 15:17:00| 136|\n", - "|[8590269, 8590276...|270.TA.26-303-j19...|[09:06:00, 09:07:...|[09:06:00, 09:07:...| 23| 09:06:00| 146|\n", - "|[8590269, 8590276...|276.TA.26-303-j19...|[08:06:00, 08:07:...|[08:06:00, 08:07:...| 23| 08:06:00| 146|\n", "| [8503081, 8503082]|1373.TA.26-LAF-j1...|[17:20:00, 17:25:00]|[17:20:00, 17:25:00]| 2| 17:20:00| 212|\n", - "|[8591067, 8587349...|1647.TA.26-17-j19...|[16:03:00, 16:06:...|[16:03:00, 16:06:...| 17| 16:03:00| 221|\n", - "|[8591276, 8591101...|270.TA.26-83-j19-...|[19:35:00, 19:38:...|[19:35:00, 19:38:...| 16| 19:35:00| 231|\n", - "|[8591276, 8591101...|225.TA.26-83-j19-...|[18:20:00, 18:22:...|[18:20:00, 18:22:...| 16| 18:20:00| 231|\n", - "|[8591122, 8591249...|31.TA.26-37-A-j19...|[11:02:00, 11:03:...|[11:02:00, 11:03:...| 6| 11:02:00| 265|\n", - "|[8591233, 8591107...|212.TA.26-704-j19...|[18:38:00, 18:42:...|[18:38:00, 18:42:...| 15| 18:38:00| 266|\n", + "|[8590878, 8576280...|15.TA.26-453-j19-...|[18:53:00, 18:53:...|[18:53:00, 18:53:...| 6| 18:53:00| 268|\n", + "|[8590716, 8590714...|364.TA.26-743-j19...|[15:30:00, 
15:30:...|[15:30:00, 15:30:...| 17| 15:30:00| 278|\n", + "|[8590804, 8590805...|730.TA.26-31-j19-...|[12:32:00, 12:33:...|[12:32:00, 12:33:...| 9| 12:32:00| 281|\n", "| [8596126, 8573205]|4.TA.6-E02-j19-1.4.R|[14:50:00, 15:10:00]|[14:45:00, 15:10:00]| 2| 14:50:00| 283|\n", - "|[8580449, 8591063...|645.TA.26-768-j19...|[07:41:00, 07:42:...|[07:41:00, 07:42:...| 13| 07:41:00| 297|\n", + "|[8591439, 8591106...|1316.TA.26-7-B-j1...|[18:20:00, 18:21:...|[18:20:00, 18:21:...| 31| 18:20:00| 321|\n", + "|[8503104, 8503101...|6.TA.26-20-j19-1.2.H|[07:46:00, 07:53:...|[07:45:00, 07:52:...| 5| 07:46:00| 375|\n", + "|[8591136, 8591435...|2056.TA.26-13-j19...|[19:47:00, 19:48:...|[19:47:00, 19:48:...| 10| 19:47:00| 396|\n", + "|[8591122, 8591201...|1133.TA.26-69-j19...|[17:37:00, 17:38:...|[17:37:00, 17:38:...| 9| 17:37:00| 400|\n", + "|[8503059, 8530813...|427.TA.26-18-j19-...|[14:33:00, 14:34:...|[14:33:00, 14:34:...| 13| 14:33:00| 432|\n", + "|[8590637, 8590636...|92.TA.26-727-j19-...|[14:17:00, 14:18:...|[14:17:00, 14:18:...| 9| 14:17:00| 434|\n", + "|[8590901, 8590903...|637.TA.26-759-j19...|[09:08:00, 09:09:...|[09:08:00, 09:09:...| 28| 09:08:00| 460|\n", + "|[8591190, 8591390...|285.TA.26-9-B-j19...|[09:54:00, 09:55:...|[09:54:00, 09:55:...| 32| 09:54:00| 551|\n", + "|[8591354, 8591124...|673.TA.26-768-j19...|[07:00:00, 07:02:...|[07:00:00, 07:02:...| 9| 07:00:00| 586|\n", + "|[8591057, 8591896...|1201.TA.26-4-j19-...|[12:31:00, 12:32:...|[12:31:00, 12:32:...| 26| 12:31:00| 608|\n", "+--------------------+--------------------+--------------------+--------------------+----------+--------------------+---------+\n", "only showing top 20 rows" ] } ], "source": [ "trips_unique = trips_unique.join(routes_indexed, how='inner', on='all_stops').dropDuplicates()\n", "trips_unique.show()" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, 
"text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+--------------------+--------------------+--------------------+----------+--------------------+---------+\n", "| all_stops| trip_id| all_departures| all_arrivals|stop_count|departure_first_stop|route_int|\n", "+--------------------+--------------------+--------------------+--------------------+----------+--------------------+---------+\n", - "|8503064 8503065 8...|483.TA.26-18-j19-...|10:41:00 10:45:00...|10:41:00 10:45:00...| 14| 10:41:00| 41|\n", - "|8591355 8591354 8...|1107.TA.26-75-A-j...|15:59:00 16:00:00...|15:59:00 16:00:00...| 21| 15:59:00| 80|\n", - "|8595899 8591206 8...|110.TA.26-40-j19-...|08:23:00 08:23:00...|08:23:00 08:23:00...| 8| 08:23:00| 87|\n", - "|8591276 8591101 8...|613.TA.26-69-j19-...|18:20:00 18:23:00...|18:20:00 18:23:00...| 9| 18:20:00| 105|\n", - "|8591401 8503610 8...|1230.TA.26-80-j19...|16:03:00 16:04:00...|16:03:00 16:04:00...| 29| 16:03:00| 110|\n", - "|8591401 8503610 8...|1316.TA.26-80-j19...|13:22:00 13:23:00...|13:22:00 13:23:00...| 29| 13:22:00| 110|\n", - "|8591401 8503610 8...|1358.TA.26-80-j19...|08:30:00 08:31:00...|08:30:00 08:31:00...| 29| 08:30:00| 110|\n", - "|8502208 8502209 8...|677.TA.26-24-j19-...|15:15:00 15:17:00...|15:15:00 15:17:00...| 14| 15:15:00| 111|\n", - "|8503150 8576140 8...|107.TA.26-726-j19...|19:02:00 19:02:00...|19:02:00 19:02:00...| 11| 19:02:00| 122|\n", + "|8591825 8590504 8...|117.TA.26-703-j19...|18:04:00 18:05:00...|18:04:00 18:05:00...| 9| 18:04:00| 4|\n", + "|8591355 8591354 8...|1232.TA.26-75-A-j...|17:06:00 17:07:00...|17:06:00 17:07:00...| 21| 17:06:00| 78|\n", + "|8591355 8591354 8...|1135.TA.26-75-A-j...|12:59:00 13:00:00...|12:59:00 13:00:00...| 21| 12:59:00| 78|\n", + "|8591276 8591101 8...|570.TA.26-69-j19-...|15:21:00 
15:23:00...|15:21:00 15:23:00...| 9| 15:21:00| 104|\n", + "|8573504 8581548 8...|247.TA.26-813-j19...|19:15:00 19:15:00...|19:15:00 19:15:00...| 8| 19:15:00| 112|\n", + "|8590785 8590722 8...|14.TA.79-18-A-j19...|11:41:00 11:43:00...|11:41:00 11:43:00...| 5| 11:41:00| 117|\n", "|8580449 8591063 8...|1892.TA.26-781-j1...|15:17:00 15:18:00...|15:17:00 15:18:00...| 12| 15:17:00| 136|\n", - "|8590269 8590276 8...|270.TA.26-303-j19...|09:06:00 09:07:00...|09:06:00 09:07:00...| 23| 09:06:00| 146|\n", - "|8590269 8590276 8...|276.TA.26-303-j19...|08:06:00 08:07:00...|08:06:00 08:07:00...| 23| 08:06:00| 146|\n", - "| 8503081 8503082|1373.TA.26-LAF-j1...| 17:20:00 17:25:00| 17:20:00 17:25:00| 2| 17:20:00| 212|\n", - "|8591067 8587349 8...|1647.TA.26-17-j19...|16:03:00 16:06:00...|16:03:00 16:06:00...| 17| 16:03:00| 221|\n", - "|8591276 8591101 8...|270.TA.26-83-j19-...|19:35:00 19:38:00...|19:35:00 19:38:00...| 16| 19:35:00| 231|\n", - "|8591276 8591101 8...|225.TA.26-83-j19-...|18:20:00 18:22:00...|18:20:00 18:22:00...| 16| 18:20:00| 231|\n", - "|8591122 8591249 8...|31.TA.26-37-A-j19...|11:02:00 11:03:00...|11:02:00 11:03:00...| 6| 11:02:00| 265|\n", - "|8591233 8591107 8...|212.TA.26-704-j19...|18:38:00 18:42:00...|18:38:00 18:42:00...| 15| 18:38:00| 266|\n", - "| 8596126 8573205|4.TA.6-E02-j19-1.4.R| 14:50:00 15:10:00| 14:45:00 15:10:00| 2| 14:50:00| 283|\n", - "|8580449 8591063 8...|645.TA.26-768-j19...|07:41:00 07:42:00...|07:41:00 07:42:00...| 13| 07:41:00| 297|\n", + "|8503016 8503006 8...|496.TA.26-24-j19-...|07:04:00 07:09:00...|07:02:00 07:08:00...| 12| 07:04:00| 198|\n", + "|8591067 8587349 8...|1673.TA.26-17-j19...|15:11:00 15:13:00...|15:11:00 15:13:00...| 17| 15:11:00| 218|\n", + "|8580433 8580438 8...|170.TA.26-733-j19...|18:30:00 18:31:00...|18:30:00 18:31:00...| 10| 18:30:00| 239|\n", + "|8591116 8591260 8...|190.TA.26-35-B-j1...|19:27:00 19:28:00...|19:27:00 19:28:00...| 8| 19:27:00| 290|\n", + "|8591439 8591106 8...|1316.TA.26-7-B-j1...|18:20:00 
18:21:00...|18:20:00 18:21:00...| 31| 18:20:00| 321|\n", + "|8591136 8591435 8...|2056.TA.26-13-j19...|19:47:00 19:48:00...|19:47:00 19:48:00...| 10| 19:47:00| 396|\n", + "|8503059 8530813 8...|427.TA.26-18-j19-...|14:33:00 14:34:00...|14:33:00 14:34:00...| 13| 14:33:00| 432|\n", + "|8590637 8590636 8...|92.TA.26-727-j19-...|14:17:00 14:18:00...|14:17:00 14:18:00...| 9| 14:17:00| 434|\n", + "|8590318 8591051 8...|436.TA.26-2-A-j19...|09:58:00 09:59:00...|09:58:00 09:59:00...| 25| 09:58:00| 578|\n", + "|8591341 8502572 8...|605.TA.26-67-j19-...|17:21:00 17:22:00...|17:21:00 17:22:00...| 14| 17:21:00| 579|\n", + "|8591341 8502572 8...|43.TA.26-67-j19-1...|09:50:00 09:52:00...|09:50:00 09:52:00...| 14| 09:50:00| 579|\n", + "|8591341 8502572 8...|75.TA.26-67-j19-1...|09:28:00 09:29:00...|09:28:00 09:29:00...| 14| 09:28:00| 579|\n", + "|8591057 8591896 8...|1201.TA.26-4-j19-...|12:31:00 12:32:00...|12:31:00 12:32:00...| 26| 12:31:00| 608|\n", "+--------------------+--------------------+--------------------+--------------------+----------+--------------------+---------+\n", "only showing top 20 rows" ] } ], "source": [ "# converting arrays to strings to be able to store the data as csv\n", "trips_unique_string_lists = trips_unique.withColumn(\"all_stops\", F.concat_ws(\" \", \"all_stops\"))\\\n", ".withColumn(\"all_departures\", F.concat_ws(\" \", \"all_departures\"))\\\n", ".withColumn(\"all_arrivals\", F.concat_ws(\" \", \"all_arrivals\"))\n", "trips_unique_string_lists.show()" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "trips_unique_string_lists.write.csv('data/lgpt_guys/trips_unique_string_lists.csv', header = True, 
mode=\"overwrite\")" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+--------------------+--------------------+--------------------+----------+--------------------+---------+\n", "| all_stops| trip_id| all_departures| all_arrivals|stop_count|departure_first_stop|route_int|\n", "+--------------------+--------------------+--------------------+--------------------+----------+--------------------+---------+\n", - "|8590898 8590900 8...|130.TA.26-772-j19...|08:45:00 08:46:00...|08:45:00 08:46:00...| 12| 08:45:00| 47|\n", - "|8575977 8575976 8...|275.TA.26-845-j19...|14:08:00 14:09:00...|14:08:00 14:09:00...| 4| 14:08:00| 69|\n", - "|8581546 8581543 8...|157.TA.26-813-j19...|12:37:00 12:37:00...|12:37:00 12:37:00...| 8| 12:37:00| 71|\n", - "|8573504 8588051 8...|37.TA.26-811-j19-...|11:30:00 11:31:00...|11:30:00 11:31:00...| 11| 11:30:00| 75|\n", - "|8576127 8503154 8...|18.TA.26-726-j19-...|12:50:00 12:52:00...|12:50:00 12:52:00...| 10| 12:50:00| 77|\n", - "|8576127 8503154 8...|30.TA.26-726-j19-...|10:20:00 10:22:00...|10:20:00 10:22:00...| 10| 10:20:00| 77|\n", - "|8591355 8591354 8...|1111.TA.26-75-A-j...|15:29:00 15:30:00...|15:29:00 15:30:00...| 21| 15:29:00| 80|\n", - "|8591276 8591101 8...|607.TA.26-69-j19-...|14:06:00 14:08:00...|14:06:00 14:08:00...| 9| 14:06:00| 105|\n", - "|8591401 8503610 8...|1325.TA.26-80-j19...|17:15:00 17:17:00...|17:15:00 17:16:00...| 29| 17:15:00| 110|\n", - "|8591401 8503610 8...|1351.TA.26-80-j19...|16:35:00 16:37:00...|16:35:00 16:36:00...| 29| 16:35:00| 110|\n", - "|8591028 8591023 8...|37.TA.26-91-j19-1...|17:24:00 
17:25:00...|17:24:00 17:25:00...| 13| 17:24:00| 116|\n", - "|8591123 8591174 8...|17.TA.26-E-j19-1.4.R|11:04:00 11:07:00...|11:04:00 11:07:00...| 3| 11:04:00| 118|\n", - "|8591123 8591174 8...|63.TA.26-E-j19-1.4.R|09:34:00 09:37:00...|09:34:00 09:37:00...| 3| 09:34:00| 118|\n", - "|8503061 8590601 8...|51.TA.79-18-A-j19...|12:20:00 12:21:00...|12:20:00 12:21:00...| 5| 12:20:00| 121|\n", - "|8530812 8591094 8...|199.TA.26-77-j19-...|16:24:00 16:26:00...|16:24:00 16:26:00...| 7| 16:24:00| 124|\n", - "|8591365 8591329 8...|742.TA.26-89-j19-...|17:03:00 17:04:00...|17:03:00 17:04:00...| 27| 17:03:00| 135|\n", - "|8591365 8591329 8...|603.TA.26-89-j19-...|10:35:00 10:37:00...|10:35:00 10:37:00...| 27| 10:35:00| 135|\n", - "|8591065 8590566 8...|62.TA.26-743-j19-...|11:09:00 11:10:00...|11:09:00 11:10:00...| 18| 11:09:00| 147|\n", - "|8591349 8591403 8...|408.TA.26-61-j19-...|12:09:00 12:10:00...|12:09:00 12:10:00...| 19| 12:09:00| 150|\n", - "|8591349 8591403 8...|415.TA.26-61-j19-...|07:18:00 07:19:00...|07:18:00 07:19:00...| 19| 07:18:00| 150|\n", + "|8590464 8590463 8...|603.TA.26-185-j19...|17:51:00 17:53:00...|17:51:00 17:53:00...| 14| 17:51:00| 21|\n", + "|8503305 8503306 8...|641.TA.26-8-A-j19...|09:19:00 09:23:00...|09:18:00 09:23:00...| 13| 09:19:00| 43|\n", + "|8503057 8503056 8...|139.TA.26-10-B-j1...|13:54:00 13:59:00...|13:54:00 13:59:00...| 8| 13:54:00| 83|\n", + "|8591401 8503610 8...|1310.TA.26-80-j19...|17:51:00 17:52:00...|17:51:00 17:52:00...| 29| 17:51:00| 108|\n", + "|8591401 8503610 8...|1341.TA.26-80-j19...|16:51:00 16:52:00...|16:51:00 16:52:00...| 29| 16:51:00| 108|\n", + "|8503097 8503089 8...|19.TA.26-4-B-j19-...|11:10:00 11:11:00...|11:10:00 11:11:00...| 12| 11:10:00| 129|\n", + "|8590269 8590276 8...|280.TA.26-303-j19...|07:51:00 07:52:00...|07:51:00 07:52:00...| 23| 07:51:00| 144|\n", + "|8591065 8590566 8...|126.TA.26-751-j19...|15:59:00 15:59:00...|15:59:00 15:59:00...| 10| 15:59:00| 146|\n", + "|8591349 8591403 
8...|421.TA.26-61-j19-...|08:42:00 08:43:00...|08:42:00 08:43:00...| 19| 08:42:00| 148|\n", + "|8591067 8587349 8...|1514.TA.26-17-j19...|14:11:00 14:13:00...|14:11:00 14:13:00...| 17| 14:11:00| 221|\n", + "|8591067 8587349 8...|1546.TA.26-17-j19...|12:33:00 12:36:00...|12:33:00 12:36:00...| 17| 12:33:00| 221|\n", + "|8580433 8580438 8...|195.TA.26-733-j19...|14:45:00 14:46:00...|14:45:00 14:46:00...| 10| 14:45:00| 238|\n", + "|8580433 8580438 8...|247.TA.26-733-j19...|07:15:00 07:16:00...|07:15:00 07:16:00...| 10| 07:15:00| 238|\n", + "|8590647 8587998 8...|42.TA.26-973-j19-...|15:08:00 15:09:00...|15:08:00 15:09:00...| 14| 15:08:00| 274|\n", + "|8591054 8576262 8...|113.TA.26-37-A-j1...|08:48:00 08:50:00...|08:48:00 08:50:00...| 6| 08:48:00| 275|\n", + "|8580449 8591063 8...|1101.TA.26-768-j1...|16:08:00 16:09:00...|16:08:00 16:09:00...| 13| 16:08:00| 297|\n", + "|8591439 8591106 8...|1320.TA.26-7-B-j1...|17:50:00 17:51:00...|17:50:00 17:51:00...| 31| 17:50:00| 320|\n", + "|8591439 8591106 8...|1338.TA.26-7-B-j1...|08:30:00 08:31:00...|08:30:00 08:31:00...| 31| 08:30:00| 320|\n", + "|8573504 8588051 8...|10.TA.26-812-j19-...|16:45:00 16:45:00...|16:45:00 16:45:00...| 12| 16:45:00| 331|\n", + "|8591230 8591087 8...|197.TA.26-751-j19...|16:35:00 16:36:00...|16:35:00 16:36:00...| 9| 16:35:00| 405|\n", "+--------------------+--------------------+--------------------+--------------------+----------+--------------------+---------+\n", "only showing top 20 rows" ] } ], "source": [ "# we prepare an inner join on trips from trips_unique with stopTimes.\n", "\n", "trips_unique_string_lists = spark.read.csv('data/lgpt_guys/trips_unique_string_lists.csv', header = True)\n", "trips_unique_string_lists.show()" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', 
description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+--------------------+---------+----------+\n", "| trip_id|departure_first_stop|route_int|stop_count|\n", "+--------------------+--------------------+---------+----------+\n", - "|130.TA.26-772-j19...| 08:45:00| 47| 12|\n", - "|275.TA.26-845-j19...| 14:08:00| 69| 4|\n", - "|157.TA.26-813-j19...| 12:37:00| 71| 8|\n", - "|37.TA.26-811-j19-...| 11:30:00| 75| 11|\n", - "|18.TA.26-726-j19-...| 12:50:00| 77| 10|\n", - "|30.TA.26-726-j19-...| 10:20:00| 77| 10|\n", - "|1111.TA.26-75-A-j...| 15:29:00| 80| 21|\n", - "|607.TA.26-69-j19-...| 14:06:00| 105| 9|\n", - "|1325.TA.26-80-j19...| 17:15:00| 110| 29|\n", - "|1351.TA.26-80-j19...| 16:35:00| 110| 29|\n", - "|37.TA.26-91-j19-1...| 17:24:00| 116| 13|\n", - "|17.TA.26-E-j19-1.4.R| 11:04:00| 118| 3|\n", - "|63.TA.26-E-j19-1.4.R| 09:34:00| 118| 3|\n", - "|51.TA.79-18-A-j19...| 12:20:00| 121| 5|\n", - "|199.TA.26-77-j19-...| 16:24:00| 124| 7|\n", - "|742.TA.26-89-j19-...| 17:03:00| 135| 27|\n", - "|603.TA.26-89-j19-...| 10:35:00| 135| 27|\n", - "|62.TA.26-743-j19-...| 11:09:00| 147| 18|\n", - "|408.TA.26-61-j19-...| 12:09:00| 150| 19|\n", - "|415.TA.26-61-j19-...| 07:18:00| 150| 19|\n", + "|603.TA.26-185-j19...| 17:51:00| 21| 14|\n", + "|641.TA.26-8-A-j19...| 09:19:00| 43| 13|\n", + "|139.TA.26-10-B-j1...| 13:54:00| 83| 8|\n", + "|1310.TA.26-80-j19...| 17:51:00| 108| 29|\n", + "|1341.TA.26-80-j19...| 16:51:00| 108| 29|\n", + "|19.TA.26-4-B-j19-...| 11:10:00| 129| 12|\n", + "|280.TA.26-303-j19...| 07:51:00| 144| 23|\n", + "|126.TA.26-751-j19...| 15:59:00| 146| 10|\n", + "|421.TA.26-61-j19-...| 08:42:00| 148| 19|\n", + "|1514.TA.26-17-j19...| 14:11:00| 221| 17|\n", + "|1546.TA.26-17-j19...| 12:33:00| 221| 17|\n", + "|195.TA.26-733-j19...| 14:45:00| 238| 10|\n", + "|247.TA.26-733-j19...| 07:15:00| 238| 10|\n", + 
"|42.TA.26-973-j19-...| 15:08:00| 274| 14|\n", + "|113.TA.26-37-A-j1...| 08:48:00| 275| 6|\n", + "|1101.TA.26-768-j1...| 16:08:00| 297| 13|\n", + "|1320.TA.26-7-B-j1...| 17:50:00| 320| 31|\n", + "|1338.TA.26-7-B-j1...| 08:30:00| 320| 31|\n", + "|10.TA.26-812-j19-...| 16:45:00| 331| 12|\n", + "|197.TA.26-751-j19...| 16:35:00| 405| 9|\n", "+--------------------+--------------------+---------+----------+\n", "only showing top 20 rows" ] } ], "source": [ "trips_unique_for_join = trips_unique_string_lists.select(trips_unique_string_lists.trip_id, \\\n", " trips_unique_string_lists.departure_first_stop, \\\n", " trips_unique_string_lists.route_int, \\\n", " trips_unique_string_lists.stop_count)\n", "trips_unique_for_join.show()" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "260459" ] } ], "source": [ "stop_times = stop_times.join(trips_unique_for_join, how='inner', on='trip_id')\n", "stop_times.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "At this step, we sort `stop_times` in the same order as RAPTOR's `stopTimes` data structure, that is:\n", - "- By route\n", - "- By trip (starting with the one that leaves the first stop of the route at the earliest departure time)\n", - "- By stop in the sequence of stops defining the route\n" + "Note that this csv does not carry an index that would allow it to be sorted quickly after loading."
] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+--------------------+-------+------------+--------------+-------------+-----------+-------------+---------------+--------------------+----------------+----------------+-------------+--------------------+---------------+------------+--------------+--------------------+---------+----------+\n", - "| trip_id|stop_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|stop_id_general| stop_name| stop_lat| stop_lon| route_id| trip_headsign|trip_short_name|direction_id|departure_hour|departure_first_stop|route_int|stop_count|\n", - "+--------------------+-------+------------+--------------+-------------+-----------+-------------+---------------+--------------------+----------------+----------------+-------------+--------------------+---------------+------------+--------------+--------------------+---------+----------+\n", - "|1672.TA.26-10-j19...|8573205| 07:00:00| 07:01:00| 27| 0| 0| 8573205|Zürich Flughafen,...|47.4504413038344|8.56372943623189| 26-10-j19-1|Zürich Flughafen,...| 4096| 1| 7| 07:01:00| 0| 2|\n", - "|1672.TA.26-10-j19...|8588553| 07:02:00| 07:02:00| 28| 0| 0| 8588553|Zürich Flughafen,...|47.4524944976638|8.57205681891684| 26-10-j19-1|Zürich Flughafen,...| 4096| 1| 7| 07:01:00| 0| 2|\n", - "|2064.TA.26-13-j19...|8576240| 07:00:00| 07:00:00| 5| 0| 0| 8576240|Zürich, Meierhofp...|47.4020100860391|8.49937412926861| 26-13-j19-1| Zürich, Albisgütli| 1831| 0| 7| 07:00:00| 1| 26|\n", - "|2064.TA.26-13-j19...|8591353| 07:01:00| 07:01:00| 6| 0| 0| 8591353| Zürich, 
Schwert|47.3997299435837|8.50461130737576| 26-13-j19-1| Zürich, Albisgütli| 1831| 0| 7| 07:00:00| 1| 26|\n", - "|2064.TA.26-13-j19...|8591039| 07:02:00| 07:02:00| 7| 0| 0| 8591039| Zürich, Alte Trotte|47.3977659017765|8.50725235431143| 26-13-j19-1| Zürich, Albisgütli| 1831| 0| 7| 07:00:00| 1| 26|\n", - "|2064.TA.26-13-j19...|8591121| 07:03:00| 07:03:00| 8| 0| 0| 8591121|Zürich, Eschergutweg|47.3962700189648|8.51204037477646| 26-13-j19-1| Zürich, Albisgütli| 1831| 0| 7| 07:00:00| 1| 26|\n", - "|2064.TA.26-13-j19...|8591417| 07:05:00| 07:05:00| 9| 0| 0| 8591417| Zürich, Waidfussweg|47.3954977376399|8.51840044698891| 26-13-j19-1| Zürich, Albisgütli| 1831| 0| 7| 07:00:00| 1| 26|\n", - "|2064.TA.26-13-j19...|8591437| 07:06:00| 07:06:00| 10| 0| 0| 8591437|Zürich, Wipkinger...|47.3925909395293|8.52357474302616| 26-13-j19-1| Zürich, Albisgütli| 1831| 0| 7| 07:00:00| 1| 26|\n", - "|2064.TA.26-13-j19...|8580522| 07:08:00| 07:08:00| 11| 0| 0| 8580522|Zürich, Escher-Wy...|47.3907969150758| 8.5223979500038| 26-13-j19-1| Zürich, Albisgütli| 1831| 0| 7| 07:00:00| 1| 26|\n", - "|2064.TA.26-13-j19...|8591110| 07:09:00| 07:09:00| 12| 0| 0| 8591110| Zürich, Dammweg|47.3884919601296|8.52639545301869| 26-13-j19-1| Zürich, Albisgütli| 1831| 0| 7| 07:00:00| 1| 26|\n", - "|2064.TA.26-13-j19...|8591306| 07:10:00| 07:10:00| 13| 0| 0| 8591306|Zürich, Quellenst...|47.3867403702341|8.52874903906341| 26-13-j19-1| Zürich, Albisgütli| 1831| 0| 7| 07:00:00| 1| 26|\n", - "|2064.TA.26-13-j19...|8591257| 07:11:00| 07:11:00| 14| 0| 0| 8591257| Zürich, Limmatplatz|47.3845994590919|8.53162364797299| 26-13-j19-1| Zürich, Albisgütli| 1831| 0| 7| 07:00:00| 1| 26|\n", - "|2064.TA.26-13-j19...|8591282| 07:12:00| 07:12:00| 15| 0| 0| 8591282|Zürich, Museum fü...|47.3821239221899|8.53493843137185| 26-13-j19-1| Zürich, Albisgütli| 1831| 0| 7| 07:00:00| 1| 26|\n", - "|2064.TA.26-13-j19...|8591368| 07:14:00| 07:14:00| 16| 0| 0| 8591368| Zürich, Sihlquai/HB|47.3798733332196|8.53760642776606| 26-13-j19-1| Zürich, 
Albisgütli| 1831| 0| 7| 07:00:00| 1| 26|\n", - "|2064.TA.26-13-j19...|8587349| 07:16:00| 07:16:00| 17| 0| 0| 8587349|Zürich, Bahnhofqu...|47.3775618175159|8.54173867807358| 26-13-j19-1| Zürich, Albisgütli| 1831| 0| 7| 07:00:00| 1| 26|\n", - "|2064.TA.26-13-j19...|8591067| 07:18:00| 07:18:00| 18| 0| 0| 8591067|Zürich, Bahnhofst...|47.3765581015114|8.53994204750509| 26-13-j19-1| Zürich, Albisgütli| 1831| 0| 7| 07:00:00| 1| 26|\n", - "|2064.TA.26-13-j19...|8591316| 07:20:00| 07:20:00| 19| 0| 0| 8591316| Zürich, Rennweg|47.3730662375955|8.53845982728609| 26-13-j19-1| Zürich, Albisgütli| 1831| 0| 7| 07:00:00| 1| 26|\n", - "|2064.TA.26-13-j19...|8591299| 07:22:00| 07:22:00| 20| 0| 0| 8591299| Zürich, Paradeplatz|47.3693672863583|8.53876525448273| 26-13-j19-1| Zürich, Albisgütli| 1831| 0| 7| 07:00:00| 1| 26|\n", - "|2064.TA.26-13-j19...|8591384| 07:23:00| 07:23:00| 21| 0| 0| 8591384|Zürich, Stockerst...|47.3677002399791|8.53501029659459| 26-13-j19-1| Zürich, Albisgütli| 1831| 0| 7| 07:00:00| 1| 26|\n", - "|2064.TA.26-13-j19...|8591404| 07:24:00| 07:24:00| 22| 0| 0| 8591404|Zürich, Tunnelstr...|47.3661426599847|8.53253094641008| 26-13-j19-1| Zürich, Albisgütli| 1831| 0| 7| 07:00:00| 1| 26|\n", - "|2064.TA.26-13-j19...|8591059| 07:25:00| 07:25:00| 23| 0| 0| 8591059|Zürich Enge, Bahn...|47.3645546111557|8.53045583810347| 26-13-j19-1| Zürich, Albisgütli| 1831| 0| 7| 07:00:00| 1| 26|\n", - "|2064.TA.26-13-j19...|8591415| 07:27:00| 07:27:00| 24| 0| 0| 8591415|Zürich, Waffenpla...|47.3614818138862|8.52574866601403| 26-13-j19-1| Zürich, Albisgütli| 1831| 0| 7| 07:00:00| 1| 26|\n", - "|2064.TA.26-13-j19...|8591366| 07:28:00| 07:28:00| 25| 0| 0| 8591366|Zürich, Sihlcity ...|47.3600640074787|8.52303575385561| 26-13-j19-1| Zürich, Albisgütli| 1831| 0| 7| 07:00:00| 1| 26|\n", - "|2064.TA.26-13-j19...|8591329| 07:29:00| 07:29:00| 26| 0| 0| 8591329|Zürich, Saalsport...|47.3578611597087|8.52040369007277| 26-13-j19-1| Zürich, Albisgütli| 1831| 0| 7| 07:00:00| 1| 26|\n", - 
"|2064.TA.26-13-j19...|8591245| 07:30:00| 07:30:00| 27| 0| 0| 8591245| Zürich, Laubegg|47.3587313564196|8.51708890667391| 26-13-j19-1| Zürich, Albisgütli| 1831| 0| 7| 07:00:00| 1| 26|\n", - "|2064.TA.26-13-j19...|8591405| 07:32:00| 07:32:00| 28| 0| 0| 8591405| Zürich, Uetlihof|47.3567353594536|8.51396276948474| 26-13-j19-1| Zürich, Albisgütli| 1831| 0| 7| 07:00:00| 1| 26|\n", - "|2064.TA.26-13-j19...|8591385| 07:33:00| 07:33:00| 29| 0| 0| 8591385|Zürich, Strassenv...|47.3530717783138|8.51171698127413| 26-13-j19-1| Zürich, Albisgütli| 1831| 0| 7| 07:00:00| 1| 26|\n", - "|2064.TA.26-13-j19...|8591034| 07:34:00| 07:34:00| 30| 0| 0| 8591034| Zürich, Albisgütli|47.3519945640447| 8.5077104951064| 26-13-j19-1| Zürich, Albisgütli| 1831| 0| 7| 07:00:00| 1| 26|\n", - "|791.TA.26-11-A-j1...|8591049| 19:49:00| 19:49:00| 1| 0| 0| 8591049| Zürich, Auzelg|47.4166918393693| 8.568113214819|26-11-A-j19-1| Zürich, Rehalp| 363| 0| 19| 19:49:00| 2| 8|\n", - "|791.TA.26-11-A-j1...|8591128| 19:51:00| 19:51:00| 2| 0| 0| 8591128|Zürich, Fernsehst...|47.4181749855684|8.56174415945371|26-11-A-j19-1| Zürich, Rehalp| 363| 0| 19| 19:49:00| 2| 8|\n", - "|791.TA.26-11-A-j1...|8591830| 19:52:00| 19:52:00| 3| 0| 0| 8591830|Glattpark, Glattpark|47.4199559214972|8.55716275150406|26-11-A-j19-1| Zürich, Rehalp| 363| 0| 19| 19:49:00| 2| 8|\n", - "|791.TA.26-11-A-j1...|8591294| 19:53:00| 19:53:00| 4| 0| 0| 8591294| Zürich, Oerlikerhus|47.4175853791724| 8.5542072942189|26-11-A-j19-1| Zürich, Rehalp| 363| 0| 19| 19:49:00| 2| 8|\n", - "|791.TA.26-11-A-j1...|8591256| 19:54:00| 19:54:00| 5| 0| 0| 8591256|Zürich, Leutschen...|47.4146433269471|8.55130573585079|26-11-A-j19-1| Zürich, Rehalp| 363| 0| 19| 19:49:00| 2| 8|\n", - "|791.TA.26-11-A-j1...|8591273| 19:55:00| 19:55:00| 6| 0| 0| 8591273|Zürich, Messe/Hal...|47.4106919651348|8.55068589830466|26-11-A-j19-1| Zürich, Rehalp| 363| 0| 19| 19:49:00| 2| 8|\n", - "|791.TA.26-11-A-j1...|8591382| 19:57:00| 19:57:00| 7| 0| 0| 8591382|Zürich, Sternen 
O...|47.4100718783688|8.54623025449481|26-11-A-j19-1| Zürich, Rehalp| 363| 0| 19| 19:49:00| 2| 8|\n", - "|791.TA.26-11-A-j1...|8580449| 19:59:00| 19:59:00| 8| 0| 0| 8580449|Zürich Oerlikon, ...| 47.411494419524|8.54479295004002|26-11-A-j19-1| Zürich, Rehalp| 363| 0| 19| 19:49:00| 2| 8|\n", - "|159.TA.26-304-j19...|8591057| 19:39:00| 19:39:00| 1| 0| 0| 8591057|Zürich Altstetten...| 47.392067942097|8.48990588617267| 26-304-j19-1| Dietikon, Bahnhof| 5481| 1| 19| 19:39:00| 3| 17|\n", - "|159.TA.26-304-j19...|8591402| 19:41:00| 19:41:00| 2| 0| 0| 8591402| Zürich, Tüffenwies|47.3979787271809|8.49434356367684| 26-304-j19-1| Dietikon, Bahnhof| 5481| 1| 19| 19:39:00| 3| 17|\n", - "|159.TA.26-304-j19...|8591434| 19:41:00| 19:41:00| 3| 0| 0| 8591434| Zürich, Winzerhalde|47.4000582901792| 8.4945681424979| 26-304-j19-1| Dietikon, Bahnhof| 5481| 1| 19| 19:39:00| 3| 17|\n", - "|159.TA.26-304-j19...|8591197| 19:42:00| 19:42:00| 4| 0| 0| 8591197|Zürich, Hohenklin...|47.4013473348052|8.49021131336931| 26-304-j19-1| Dietikon, Bahnhof| 5481| 1| 19| 19:39:00| 3| 17|\n", - "|159.TA.26-304-j19...|8591436| 19:43:00| 19:43:00| 5| 0| 0| 8591436|Zürich, Winzerstr...| 47.403372044054| 8.486123978826| 26-304-j19-1| Dietikon, Bahnhof| 5481| 1| 19| 19:39:00| 3| 17|\n", - "|159.TA.26-304-j19...|8591136| 19:46:00| 19:46:00| 6| 0| 0| 8591136| Zürich, Frankental|47.4057006674825|8.48137189097235| 26-304-j19-1| Dietikon, Bahnhof| 5481| 1| 19| 19:39:00| 3| 17|\n", - "|159.TA.26-304-j19...|8590725| 19:47:00| 19:47:00| 7| 0| 0| 8590725|Oberengstringen, ...|47.4055243523393|8.47408655401713| 26-304-j19-1| Dietikon, Bahnhof| 5481| 1| 19| 19:39:00| 3| 17|\n", - "|159.TA.26-304-j19...|8590726| 19:48:00| 19:48:00| 8| 0| 0| 8590726|Oberengstringen, ...| 47.407342193939|8.46795106062573| 26-304-j19-1| Dietikon, Bahnhof| 5481| 1| 19| 19:39:00| 3| 17|\n", - "|159.TA.26-304-j19...|8590728| 19:49:00| 19:49:00| 9| 0| 0| 8590728|Oberengstringen, ...|47.4091295756792|8.46260608468448| 26-304-j19-1| Dietikon, Bahnhof| 
5481| 1| 19| 19:39:00| 3| 17|\n", - "|159.TA.26-304-j19...|8590727| 19:50:00| 19:50:00| 10| 0| 0| 8590727|Oberengstringen, ...|47.4104852703573|8.45874332896223| 26-304-j19-1| Dietikon, Bahnhof| 5481| 1| 19| 19:39:00| 3| 17|\n", - "|159.TA.26-304-j19...|8590833| 19:51:00| 19:51:00| 11| 0| 0| 8590833|Unterengstringen,...|47.4122360710415|8.45316479104707| 26-304-j19-1| Dietikon, Bahnhof| 5481| 1| 19| 19:39:00| 3| 17|\n", - "|159.TA.26-304-j19...|8594732| 19:53:00| 19:53:00| 12| 0| 0| 8594732|Unterengstringen,...|47.4134944230824|8.44931101847766| 26-304-j19-1| Dietikon, Bahnhof| 5481| 1| 19| 19:39:00| 3| 17|\n", - "|159.TA.26-304-j19...|8590831| 19:53:00| 19:53:00| 13| 0| 0| 8590831|Unterengstringen,...| 47.414977659342|8.44603216769017| 26-304-j19-1| Dietikon, Bahnhof| 5481| 1| 19| 19:39:00| 3| 17|\n", - "|159.TA.26-304-j19...|8590911| 19:55:00| 19:55:00| 14| 0| 0| 8590911|Weiningen ZH, Aus...|47.4176826342903|8.43953734818508| 26-304-j19-1| Dietikon, Bahnhof| 5481| 1| 19| 19:39:00| 3| 17|\n", - "|159.TA.26-304-j19...|8590913| 19:56:00| 19:56:00| 15| 0| 0| 8590913|Weiningen ZH, Lin...|47.4195547602987|8.43394084396424| 26-304-j19-1| Dietikon, Bahnhof| 5481| 1| 19| 19:39:00| 3| 17|\n", - "|159.TA.26-304-j19...|8590914| 19:57:00| 19:57:00| 16| 0| 0| 8590914|Weiningen ZH, Sch...|47.4183512583635|8.42866773324572| 26-304-j19-1| Dietikon, Bahnhof| 5481| 1| 19| 19:39:00| 3| 17|\n", - "|159.TA.26-304-j19...|8590617| 19:59:00| 19:59:00| 17| 0| 0| 8590617| Geroldswil, Welbrig|47.4180716529658|8.41906474285715| 26-304-j19-1| Dietikon, Bahnhof| 5481| 1| 19| 19:39:00| 3| 17|\n", - "|966.TA.26-70-A-j1...|8591061| 07:00:00| 07:00:00| 9| 0| 0| 8591061|Zürich Leimbach, ...|47.3332523864039|8.51859807635144|26-70-A-j19-1|Zürich, Mittellei...| 3928| 0| 7| 07:00:00| 4| 5|\n", - "|966.TA.26-70-A-j1...|8591270| 07:02:00| 07:02:00| 10| 0| 0| 8591270| Zürich, Marbachweg|47.3303482449491|8.51537312448101|26-70-A-j19-1|Zürich, Mittellei...| 3928| 0| 7| 07:00:00| 4| 5|\n", - 
"|966.TA.26-70-A-j1...|8591210| 07:03:00| 07:03:00| 11| 0| 0| 8591210| Zürich, Im Hüsli|47.3282354882425|8.51269614493396|26-70-A-j19-1|Zürich, Mittellei...| 3928| 0| 7| 07:00:00| 4| 5|\n", - "|966.TA.26-70-A-j1...|8591370| 07:03:00| 07:03:00| 12| 0| 0| 8591370|Zürich, Sihlweids...|47.3264149182794|8.51466345540645|26-70-A-j19-1|Zürich, Mittellei...| 3928| 0| 7| 07:00:00| 4| 5|\n", - "|966.TA.26-70-A-j1...|8591278| 07:04:00| 07:04:00| 13| 0| 0| 8591278|Zürich, Mittellei...|47.3231389520848|8.51428616298707|26-70-A-j19-1|Zürich, Mittellei...| 3928| 0| 7| 07:00:00| 4| 5|\n", - "|269.TA.26-61-j19-...|8591281| 19:57:00| 19:57:00| 1| 0| 0| 8591281| Zürich, Mühlacker|47.4256326325821|8.49799970688372| 26-61-j19-1|Zürich, Schwamend...| 2076| 0| 19| 19:57:00| 5| 2|\n", - "|269.TA.26-61-j19-...|8591046| 19:58:00| 19:58:00| 2| 0| 0| 8591046| Zürich, Aspholz| 47.425085652811|8.50058685490234| 26-61-j19-1|Zürich, Schwamend...| 2076| 0| 19| 19:57:00| 5| 2|\n", - "|179.TA.26-703-j19...|8591825| 07:10:00| 07:10:00| 1| 0| 0| 8591825| Benglen, Bodenacher|47.3611288870976|8.63861299832652| 26-703-j19-1| Zürich, Klusplatz| 9385| 1| 7| 07:10:00| 6| 9|\n", - "|179.TA.26-703-j19...|8590504| 07:11:00| 07:11:00| 2| 0| 0| 8590504|Benglen, Gerlisbr...|47.3610862923255|8.63360938219328| 26-703-j19-1| Zürich, Klusplatz| 9385| 1| 7| 07:10:00| 6| 9|\n", - "|179.TA.26-703-j19...|8596005| 07:14:00| 07:14:00| 3| 0| 0| 8596005|Binz bei Maur, Tw...|47.3608915729295| 8.623476385787| 26-703-j19-1| Zürich, Klusplatz| 9385| 1| 7| 07:10:00| 6| 9|\n", - "|179.TA.26-703-j19...|8591832| 07:14:00| 07:14:00| 4| 0| 0| 8591832|Pfaffhausen, Müseren|47.3626987847054|8.61754750491098| 26-703-j19-1| Zürich, Klusplatz| 9385| 1| 7| 07:10:00| 6| 9|\n", - "|179.TA.26-703-j19...|8591147| 07:16:00| 07:16:00| 5| 0| 0| 8591147|Zürich, Friedhof ...|47.3613418604422|8.60282411740221| 26-703-j19-1| Zürich, Klusplatz| 9385| 1| 7| 07:10:00| 6| 9|\n", - "|179.TA.26-703-j19...|8591162| 07:17:00| 07:17:00| 6| 0| 0| 8591162|Zürich, 
Glockenacker|47.3609767627537|8.59930272148798| 26-703-j19-1| Zürich, Klusplatz| 9385| 1| 7| 07:10:00| 6| 9|\n", - "|179.TA.26-703-j19...|8591261| 07:18:00| 07:18:00| 7| 0| 0| 8591261|Zürich, Loorenstr...|47.3598631991991|8.59452368417579| 26-703-j19-1| Zürich, Klusplatz| 9385| 1| 7| 07:10:00| 6| 9|\n", - "|179.TA.26-703-j19...|8591107| 07:19:00| 07:19:00| 8| 0| 0| 8591107|Zürich, Carl-Spit...|47.3583236436636|8.58659156021591| 26-703-j19-1| Zürich, Klusplatz| 9385| 1| 7| 07:10:00| 6| 9|\n", - "|179.TA.26-703-j19...|8591233| 07:25:00| 07:25:00| 9| 0| 0| 8591233| Zürich, Klusplatz|47.3640374201824|8.56649624730736| 26-703-j19-1| Zürich, Klusplatz| 9385| 1| 7| 07:10:00| 6| 9|\n", - "|171.TA.26-703-j19...|8591825| 07:12:00| 07:12:00| 1| 0| 0| 8591825| Benglen, Bodenacher|47.3611288870976|8.63861299832652| 26-703-j19-1| Zürich, Klusplatz| 9346| 1| 7| 07:12:00| 6| 9|\n", - "|171.TA.26-703-j19...|8590504| 07:13:00| 07:13:00| 2| 0| 0| 8590504|Benglen, Gerlisbr...|47.3610862923255|8.63360938219328| 26-703-j19-1| Zürich, Klusplatz| 9346| 1| 7| 07:12:00| 6| 9|\n", - "|171.TA.26-703-j19...|8596005| 07:16:00| 07:16:00| 3| 0| 0| 8596005|Binz bei Maur, Tw...|47.3608915729295| 8.623476385787| 26-703-j19-1| Zürich, Klusplatz| 9346| 1| 7| 07:12:00| 6| 9|\n", - "|171.TA.26-703-j19...|8591832| 07:16:00| 07:16:00| 4| 0| 0| 8591832|Pfaffhausen, Müseren|47.3626987847054|8.61754750491098| 26-703-j19-1| Zürich, Klusplatz| 9346| 1| 7| 07:12:00| 6| 9|\n", - "|171.TA.26-703-j19...|8591147| 07:18:00| 07:18:00| 5| 0| 0| 8591147|Zürich, Friedhof ...|47.3613418604422|8.60282411740221| 26-703-j19-1| Zürich, Klusplatz| 9346| 1| 7| 07:12:00| 6| 9|\n", - "|171.TA.26-703-j19...|8591162| 07:19:00| 07:19:00| 6| 0| 0| 8591162|Zürich, Glockenacker|47.3609767627537|8.59930272148798| 26-703-j19-1| Zürich, Klusplatz| 9346| 1| 7| 07:12:00| 6| 9|\n", - "|171.TA.26-703-j19...|8591261| 07:20:00| 07:20:00| 7| 0| 0| 8591261|Zürich, Loorenstr...|47.3598631991991|8.59452368417579| 26-703-j19-1| Zürich, Klusplatz| 
9346| 1| 7| 07:12:00| 6| 9|\n", - "|171.TA.26-703-j19...|8591107| 07:21:00| 07:21:00| 8| 0| 0| 8591107|Zürich, Carl-Spit...|47.3583236436636|8.58659156021591| 26-703-j19-1| Zürich, Klusplatz| 9346| 1| 7| 07:12:00| 6| 9|\n", - "|171.TA.26-703-j19...|8591233| 07:27:00| 07:27:00| 9| 0| 0| 8591233| Zürich, Klusplatz|47.3640374201824|8.56649624730736| 26-703-j19-1| Zürich, Klusplatz| 9346| 1| 7| 07:12:00| 6| 9|\n", - "|155.TA.26-703-j19...|8591825| 07:25:00| 07:25:00| 1| 0| 0| 8591825| Benglen, Bodenacher|47.3611288870976|8.63861299832652| 26-703-j19-1| Zürich, Klusplatz| 9267| 1| 7| 07:25:00| 6| 9|\n", - "|155.TA.26-703-j19...|8590504| 07:26:00| 07:26:00| 2| 0| 0| 8590504|Benglen, Gerlisbr...|47.3610862923255|8.63360938219328| 26-703-j19-1| Zürich, Klusplatz| 9267| 1| 7| 07:25:00| 6| 9|\n", - "|155.TA.26-703-j19...|8596005| 07:29:00| 07:29:00| 3| 0| 0| 8596005|Binz bei Maur, Tw...|47.3608915729295| 8.623476385787| 26-703-j19-1| Zürich, Klusplatz| 9267| 1| 7| 07:25:00| 6| 9|\n", - "|155.TA.26-703-j19...|8591832| 07:29:00| 07:29:00| 4| 0| 0| 8591832|Pfaffhausen, Müseren|47.3626987847054|8.61754750491098| 26-703-j19-1| Zürich, Klusplatz| 9267| 1| 7| 07:25:00| 6| 9|\n", - "|155.TA.26-703-j19...|8591147| 07:31:00| 07:31:00| 5| 0| 0| 8591147|Zürich, Friedhof ...|47.3613418604422|8.60282411740221| 26-703-j19-1| Zürich, Klusplatz| 9267| 1| 7| 07:25:00| 6| 9|\n", - "|155.TA.26-703-j19...|8591162| 07:32:00| 07:32:00| 6| 0| 0| 8591162|Zürich, Glockenacker|47.3609767627537|8.59930272148798| 26-703-j19-1| Zürich, Klusplatz| 9267| 1| 7| 07:25:00| 6| 9|\n", - "|155.TA.26-703-j19...|8591261| 07:33:00| 07:33:00| 7| 0| 0| 8591261|Zürich, Loorenstr...|47.3598631991991|8.59452368417579| 26-703-j19-1| Zürich, Klusplatz| 9267| 1| 7| 07:25:00| 6| 9|\n", - "|155.TA.26-703-j19...|8591107| 07:34:00| 07:34:00| 8| 0| 0| 8591107|Zürich, Carl-Spit...|47.3583236436636|8.58659156021591| 26-703-j19-1| Zürich, Klusplatz| 9267| 1| 7| 07:25:00| 6| 9|\n", - "|155.TA.26-703-j19...|8591233| 07:40:00| 
07:40:00| 9| 0| 0| 8591233| Zürich, Klusplatz|47.3640374201824|8.56649624730736| 26-703-j19-1| Zürich, Klusplatz| 9267| 1| 7| 07:25:00| 6| 9|\n", - "|144.TA.26-703-j19...|8591825| 07:27:00| 07:27:00| 1| 0| 0| 8591825| Benglen, Bodenacher|47.3611288870976|8.63861299832652| 26-703-j19-1| Zürich, Klusplatz| 9231| 1| 7| 07:27:00| 6| 9|\n", - "|144.TA.26-703-j19...|8590504| 07:28:00| 07:28:00| 2| 0| 0| 8590504|Benglen, Gerlisbr...|47.3610862923255|8.63360938219328| 26-703-j19-1| Zürich, Klusplatz| 9231| 1| 7| 07:27:00| 6| 9|\n", - "|144.TA.26-703-j19...|8596005| 07:31:00| 07:31:00| 3| 0| 0| 8596005|Binz bei Maur, Tw...|47.3608915729295| 8.623476385787| 26-703-j19-1| Zürich, Klusplatz| 9231| 1| 7| 07:27:00| 6| 9|\n", - "|144.TA.26-703-j19...|8591832| 07:31:00| 07:31:00| 4| 0| 0| 8591832|Pfaffhausen, Müseren|47.3626987847054|8.61754750491098| 26-703-j19-1| Zürich, Klusplatz| 9231| 1| 7| 07:27:00| 6| 9|\n", - "|144.TA.26-703-j19...|8591147| 07:33:00| 07:33:00| 5| 0| 0| 8591147|Zürich, Friedhof ...|47.3613418604422|8.60282411740221| 26-703-j19-1| Zürich, Klusplatz| 9231| 1| 7| 07:27:00| 6| 9|\n", - "|144.TA.26-703-j19...|8591162| 07:34:00| 07:34:00| 6| 0| 0| 8591162|Zürich, Glockenacker|47.3609767627537|8.59930272148798| 26-703-j19-1| Zürich, Klusplatz| 9231| 1| 7| 07:27:00| 6| 9|\n", - "|144.TA.26-703-j19...|8591261| 07:35:00| 07:35:00| 7| 0| 0| 8591261|Zürich, Loorenstr...|47.3598631991991|8.59452368417579| 26-703-j19-1| Zürich, Klusplatz| 9231| 1| 7| 07:27:00| 6| 9|\n", - "|144.TA.26-703-j19...|8591107| 07:36:00| 07:36:00| 8| 0| 0| 8591107|Zürich, Carl-Spit...|47.3583236436636|8.58659156021591| 26-703-j19-1| Zürich, Klusplatz| 9231| 1| 7| 07:27:00| 6| 9|\n", - "|144.TA.26-703-j19...|8591233| 07:42:00| 07:42:00| 9| 0| 0| 8591233| Zürich, Klusplatz|47.3640374201824|8.56649624730736| 26-703-j19-1| Zürich, Klusplatz| 9231| 1| 7| 07:27:00| 6| 9|\n", - "|120.TA.26-703-j19...|8591825| 07:40:00| 07:40:00| 1| 0| 0| 8591825| Benglen, Bodenacher|47.3611288870976|8.63861299832652| 
26-703-j19-1| Zürich, Klusplatz| 9159| 1| 7| 07:40:00| 6| 9|\n", - "|120.TA.26-703-j19...|8590504| 07:41:00| 07:41:00| 2| 0| 0| 8590504|Benglen, Gerlisbr...|47.3610862923255|8.63360938219328| 26-703-j19-1| Zürich, Klusplatz| 9159| 1| 7| 07:40:00| 6| 9|\n", - "|120.TA.26-703-j19...|8596005| 07:44:00| 07:44:00| 3| 0| 0| 8596005|Binz bei Maur, Tw...|47.3608915729295| 8.623476385787| 26-703-j19-1| Zürich, Klusplatz| 9159| 1| 7| 07:40:00| 6| 9|\n", - "|120.TA.26-703-j19...|8591832| 07:44:00| 07:44:00| 4| 0| 0| 8591832|Pfaffhausen, Müseren|47.3626987847054|8.61754750491098| 26-703-j19-1| Zürich, Klusplatz| 9159| 1| 7| 07:40:00| 6| 9|\n", - "+--------------------+-------+------------+--------------+-------------+-----------+-------------+---------------+--------------------+----------------+----------------+-------------+--------------------+---------------+------------+--------------+--------------------+---------+----------+\n", - "only showing top 100 rows" - ] - } - ], - "source": [ - "stop_times=stop_times.sort(stop_times.route_int.cast('int'), \n", - " stop_times.departure_first_stop, \n", - " stop_times.trip_id, \n", - " stop_times.stop_sequence.cast('int'))\n", - "stop_times.show(100)" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times.write.csv('data/lgpt_guys/stop_times_with_route_int.csv', header=True, mode='overwrite')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Generating an index from 0 to n_stops-1 for stops:\n", "\n", "In RAPTOR, stops are indexed (in an arbitrary order) from 0 to the number of stops minus one. We generate this index below." 
] }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times = spark.read.csv('data/lgpt_guys/stop_times_with_route_int.csv', header=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "How many unique routes do we find ?" ] }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 64, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "1461" ] } ], "source": [ "stop_times.select(stop_times.route_int).dropDuplicates().count()" ] }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 65, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------+---------------+\n", "|stop_int|stop_id_general|\n", "+--------+---------------+\n", - "| 0| 8503088|\n", + "| 0| 8503376|\n", "| 1| 8502508|\n", - "| 2| 8591190|\n", - "| 3| 8591284|\n", - "| 4| 8503376|\n", - "| 5| 8503078|\n", - "| 6| 8590819|\n", + "| 2| 8503088|\n", + "| 3| 8589111|\n", + "| 4| 8591284|\n", + "| 5| 8591190|\n", + "| 6| 8503078|\n", "| 7| 
8587967|\n", - "| 8| 8589111|\n", + "| 8| 8590819|\n", "| 9| 8591362|\n", "| 10| 8591149|\n", "| 11| 8591315|\n", "| 12| 8588312|\n", "| 13| 8590541|\n", "| 14| 8590804|\n", - "| 15| 8591165|\n", + "| 15| 8591085|\n", "| 16| 8590273|\n", - "| 17| 8591080|\n", - "| 18| 8591271|\n", - "| 19| 8591053|\n", + "| 17| 8591271|\n", + "| 18| 8591165|\n", + "| 19| 8591080|\n", "+--------+---------------+\n", "only showing top 20 rows" ] } ], "source": [ "stops_general_indexed = dfZipWithIndex(stop_times.select(stop_times.stop_id_general).dropDuplicates(),\n", " 0,\n", " 'stop_int')\n", "stops_general_indexed.show()" ] }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 66, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stops_general_indexed.write.csv('data/lgpt_guys/stops_general_indexed.csv', header=True, mode='overwrite')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Next, we add this index to `stop_times` and drop columns we won't be using anymore:\n", "- `pickup_type`\n", "- `drop_off_type`\n", "- `departure_hour`\n", "\n", "Note that spark does not maintain order after joins, therefore we will need to reorder stop_times after all the processing is done." 
] }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 67, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stops_general_indexed = spark.read.csv('data/lgpt_guys/stops_general_indexed.csv', header=True)" ] }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 68, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "+---------------+--------------------+------------+------------+--------------+-------------+--------------------+----------------+----------------+------------+----------------+---------------+------------+--------------------+---------+----------+--------+\n", - "|stop_id_general| trip_id| stop_id|arrival_time|departure_time|stop_sequence| stop_name| stop_lat| stop_lon| route_id| trip_headsign|trip_short_name|direction_id|departure_first_stop|route_int|stop_count|stop_int|\n", - "+---------------+--------------------+------------+------------+--------------+-------------+--------------------+----------------+----------------+------------+----------------+---------------+------------+--------------------+---------+----------+--------+\n", - "| 8503086|127.TA.26-4-B-j19...| 8503086| 08:14:00| 08:14:00| 5| Zürich Brunau| 47.352122370277|8.52623375626752|26-4-B-j19-1|Langnau-Gattikon| 12473| 0| 08:08:00| 1316| 10| 1373|\n", - "| 8503093|127.TA.26-4-B-j19...| 8503093| 08:17:00| 08:17:00| 6| Zürich 
Manegg|47.3383783847121|8.51967605469254|26-4-B-j19-1|Langnau-Gattikon| 12473| 0| 08:08:00| 1316| 10| 510|\n", - "| 8503094|127.TA.26-4-B-j19...| 8503094| 08:18:00| 08:18:00| 7| Zürich Leimbach|47.3346039449498| 8.5196041894698|26-4-B-j19-1|Langnau-Gattikon| 12473| 0| 08:08:00| 1316| 10| 533|\n", - "| 8503095|127.TA.26-4-B-j19...| 8503095| 08:21:00| 08:21:00| 8| Sood-Oberleimbach|47.3195704752201|8.52143675264966|26-4-B-j19-1|Langnau-Gattikon| 12473| 0| 08:08:00| 1316| 10| 1343|\n", - "| 8503096|127.TA.26-4-B-j19...| 8503096| 08:23:00| 08:24:00| 9| Adliswil|47.3123048960724|8.52416763111376|26-4-B-j19-1|Langnau-Gattikon| 12473| 0| 08:08:00| 1316| 10| 239|\n", - "| 8503097|127.TA.26-4-B-j19...| 8503097| 08:30:00| 08:30:00| 10| Langnau-Gattikon|47.2868766420081|8.54402039889557|26-4-B-j19-1|Langnau-Gattikon| 12473| 0| 08:08:00| 1316| 10| 669|\n", - "| 8503088|128.TA.26-4-B-j19...|8503088:0:21| 08:28:00| 08:28:00| 1| Zürich HB SZU|47.3775557344462|8.53916949636064|26-4-B-j19-1|Langnau-Gattikon| 12477| 0| 08:28:00| 1316| 10| 0|\n", - "| 8503090|128.TA.26-4-B-j19...| 8503090| 08:29:00| 08:29:00| 2| Zürich Selnau|47.3729384820921|8.53203687300374|26-4-B-j19-1|Langnau-Gattikon| 12477| 0| 08:28:00| 1316| 10| 48|\n", - "| 8503091|128.TA.26-4-B-j19...| 8503091| 08:31:00| 08:31:00| 3| Zürich Giesshübel|47.3624553927874|8.52184997768041|26-4-B-j19-1|Langnau-Gattikon| 12477| 0| 08:28:00| 1316| 10| 114|\n", - "| 8503087|128.TA.26-4-B-j19...| 8503087| 08:33:00| 08:33:00| 4|Zürich Saalsporth...| 47.357404757095|8.52214642172421|26-4-B-j19-1|Langnau-Gattikon| 12477| 0| 08:28:00| 1316| 10| 262|\n", - "| 8503086|128.TA.26-4-B-j19...| 8503086| 08:34:00| 08:34:00| 5| Zürich Brunau| 47.352122370277|8.52623375626752|26-4-B-j19-1|Langnau-Gattikon| 12477| 0| 08:28:00| 1316| 10| 1373|\n", - "| 8503093|128.TA.26-4-B-j19...| 8503093| 08:37:00| 08:37:00| 6| Zürich Manegg|47.3383783847121|8.51967605469254|26-4-B-j19-1|Langnau-Gattikon| 12477| 0| 08:28:00| 1316| 10| 510|\n", - "| 
8503094|128.TA.26-4-B-j19...| 8503094| 08:38:00| 08:38:00| 7| Zürich Leimbach|47.3346039449498| 8.5196041894698|26-4-B-j19-1|Langnau-Gattikon| 12477| 0| 08:28:00| 1316| 10| 533|\n", - "| 8503095|128.TA.26-4-B-j19...| 8503095| 08:41:00| 08:41:00| 8| Sood-Oberleimbach|47.3195704752201|8.52143675264966|26-4-B-j19-1|Langnau-Gattikon| 12477| 0| 08:28:00| 1316| 10| 1343|\n", - "| 8503096|128.TA.26-4-B-j19...| 8503096| 08:43:00| 08:44:00| 9| Adliswil|47.3123048960724|8.52416763111376|26-4-B-j19-1|Langnau-Gattikon| 12477| 0| 08:28:00| 1316| 10| 239|\n", - "| 8503097|128.TA.26-4-B-j19...| 8503097| 08:50:00| 08:50:00| 10| Langnau-Gattikon|47.2868766420081|8.54402039889557|26-4-B-j19-1|Langnau-Gattikon| 12477| 0| 08:28:00| 1316| 10| 669|\n", - "| 8503855|294.TA.26-134-j19...| 8503855:0:B| 07:04:00| 07:04:00| 1| Horgen, Bahnhof|47.2619787396278|8.59697608490178|26-134-j19-1|Horgen, Risi/Dow| 2960| 0| 07:04:00| 1317| 6| 49|\n", - "| 8577912|294.TA.26-134-j19...| 8577912| 07:06:00| 07:06:00| 2|Horgen, untere Mühle|47.2591865186515|8.59809897900709|26-134-j19-1|Horgen, Risi/Dow| 2960| 0| 07:04:00| 1317| 6| 247|\n", - "| 8590663|294.TA.26-134-j19...| 8590663| 07:07:00| 07:07:00| 3| Horgen, Wannenthal|47.2565465760621|8.60232106084304|26-134-j19-1|Horgen, Risi/Dow| 2960| 0| 07:04:00| 1317| 6| 1334|\n", - "| 8590661|294.TA.26-134-j19...| 8590661| 07:08:00| 07:08:00| 4| Horgen, Teufenbach| 47.256253918433|8.60507888876567|26-134-j19-1|Horgen, Risi/Dow| 2960| 0| 07:04:00| 1317| 6| 356|\n", - "+---------------+--------------------+------------+------------+--------------+-------------+--------------------+----------------+----------------+------------+----------------+---------------+------------+--------------------+---------+----------+--------+\n", + 
"+---------------+--------------------+-------+------------+--------------+-------------+--------------------+----------------+----------------+------------+--------------------+---------------+------------+--------------------+---------+----------+--------+\n", + "|stop_id_general| trip_id|stop_id|arrival_time|departure_time|stop_sequence| stop_name| stop_lat| stop_lon| route_id| trip_headsign|trip_short_name|direction_id|departure_first_stop|route_int|stop_count|stop_int|\n", + "+---------------+--------------------+-------+------------+--------------+-------------+--------------------+----------------+----------------+------------+--------------------+---------------+------------+--------------------+---------+----------+--------+\n", + "| 8590679|610.TA.26-185-j19...|8590679| 10:56:00| 10:56:00| 5|Kilchberg ZH, Spital|47.3217079365566|8.53537860586113|26-185-j19-1|Zürich Wollishofe...| 9270| 1| 10:51:00| 21| 14| 65|\n", + "| 8590468|610.TA.26-185-j19...|8590468| 10:58:00| 10:58:00| 6| Adliswil, Eichenweg|47.3200028461711|8.53368977312675|26-185-j19-1|Zürich Wollishofe...| 9270| 1| 10:51:00| 21| 14| 1171|\n", + "| 8590477|610.TA.26-185-j19...|8590477| 10:59:00| 10:59:00| 7| Adliswil, Moos|47.3254528526241|8.53104872619107|26-185-j19-1|Zürich Wollishofe...| 9270| 1| 10:51:00| 21| 14| 15|\n", + "| 8591388|610.TA.26-185-j19...|8591388| 11:00:00| 11:00:00| 8| Zürich, Sunnau|47.3270847004784|8.52961142173628|26-185-j19-1|Zürich Wollishofe...| 9270| 1| 10:51:00| 21| 14| 1234|\n", + "| 8591111|610.TA.26-185-j19...|8591111| 11:02:00| 11:02:00| 9|Zürich, Dangelstr...|47.3345491527085|8.52997973100282|26-185-j19-1|Zürich Wollishofe...| 9270| 1| 10:51:00| 21| 14| 433|\n", + "| 8591439|610.TA.26-185-j19...|8591439| 11:03:00| 11:03:00| 10| Zürich, Wollishofen|47.3384392605619|8.53015939405967|26-185-j19-1|Zürich Wollishofe...| 9270| 1| 10:51:00| 21| 14| 552|\n", + "| 8591106|610.TA.26-185-j19...|8591106| 11:03:00| 11:03:00| 11|Zürich, 
Butzenstr...|47.3414099167461|8.53031210765799|26-185-j19-1|Zürich Wollishofe...| 9270| 1| 10:51:00| 21| 14| 1037|\n", + "| 8591279|610.TA.26-185-j19...|8591279| 11:05:00| 11:05:00| 12| Zürich, Morgental|47.3439482343686|8.53014142775399|26-185-j19-1|Zürich Wollishofe...| 9270| 1| 10:51:00| 21| 14| 1349|\n", + "| 8591304|610.TA.26-185-j19...|8591304| 11:06:00| 11:06:00| 13|Zürich, Post Woll...|47.3444717091534|8.53296213774651|26-185-j19-1|Zürich Wollishofe...| 9270| 1| 10:51:00| 21| 14| 982|\n", + "| 8502495|610.TA.26-185-j19...|8502495| 11:07:00| 11:07:00| 14|Zürich Wollishofe...|47.3476976601166|8.53331248070737|26-185-j19-1|Zürich Wollishofe...| 9270| 1| 10:51:00| 21| 14| 116|\n", + "| 8591036|610.TA.26-3-A-j19...|8591036| 08:18:00| 08:18:00| 1| Zürich, Albisrieden|47.3743863596743|8.48478548905248|26-3-A-j19-1| Zürich, Klusplatz| 1059| 0| 08:18:00| 1282| 21| 1356|\n", + "| 8591126|610.TA.26-3-A-j19...|8591126| 08:20:00| 08:20:00| 2|Zürich, Fellenber...|47.3757186152716|8.48841468280083|26-3-A-j19-1| Zürich, Klusplatz| 1059| 0| 08:18:00| 1282| 21| 744|\n", + "| 8591363|610.TA.26-3-A-j19...|8591363| 08:21:00| 08:21:00| 3| Zürich, Siemens|47.3785837632719|8.49449627727516|26-3-A-j19-1| Zürich, Klusplatz| 1059| 0| 08:18:00| 1282| 21| 264|\n", + "| 8591203|610.TA.26-3-A-j19...|8591203| 08:23:00| 08:23:00| 4| Zürich, Hubertus|47.3768744261929|8.49947294394988|26-3-A-j19-1| Zürich, Klusplatz| 1059| 0| 08:18:00| 1282| 21| 569|\n", + "| 8591236|610.TA.26-3-A-j19...|8591236| 08:24:00| 08:24:00| 5|Zürich, Krematori...|47.3778051397306|8.50787219185756|26-3-A-j19-1| Zürich, Klusplatz| 1059| 0| 08:18:00| 1282| 21| 368|\n", + "| 8591038|610.TA.26-3-A-j19...|8591038| 08:25:00| 08:25:00| 6|Zürich, Albisried...|47.3782127019246|8.51039645780629|26-3-A-j19-1| Zürich, Klusplatz| 1059| 0| 08:18:00| 1282| 21| 1190|\n", + "| 8591448|610.TA.26-3-A-j19...|8591448| 08:26:00| 08:26:00| 7|Zürich, Zypressen...|47.3766919314159|8.51376514012221|26-3-A-j19-1| Zürich, Klusplatz| 1059| 0| 
08:18:00| 1282| 21| 773|\n", + "| 8591259|610.TA.26-3-A-j19...|8591259| 08:28:00| 08:28:00| 8| Zürich, Lochergut|47.3753475337612|8.51791535673542|26-3-A-j19-1| Zürich, Klusplatz| 1059| 0| 08:18:00| 1282| 21| 653|\n", + "| 8591218|610.TA.26-3-A-j19...|8591218| 08:29:00| 08:29:00| 9|Zürich,Kalkbreite...|47.3745992794953|8.52100556131322|26-3-A-j19-1| Zürich, Klusplatz| 1059| 0| 08:18:00| 1282| 21| 247|\n", + "| 8591079|610.TA.26-3-A-j19...|8591079| 08:30:00| 08:30:00| 10|Zürich, Bezirksge...|47.3741673555329|8.52556001980434|26-3-A-j19-1| Zürich, Klusplatz| 1059| 0| 08:18:00| 1282| 21| 1062|\n", + "+---------------+--------------------+-------+------------+--------------+-------------+--------------------+----------------+----------------+------------+--------------------+---------------+------------+--------------------+---------+----------+--------+\n", "only showing top 20 rows" ] } ], "source": [ "stop_times = stop_times.join(stops_general_indexed, how='inner', on='stop_id_general')\\\n", ".drop('pickup_type', 'drop_off_type', 'departure_hour')\n", "stop_times.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Adding transport types to stop_times from routes.txt" ] }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 69, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-----------+---------+----------------+---------------+----------+----------+\n", "| route_id|agency_id|route_short_name|route_long_name|route_desc|route_type|\n", "+-----------+---------+----------------+---------------+----------+----------+\n", "|11-40-j19-1| 801| 040| null| Bus| 700|\n", "|11-61-j19-1| 7031| 061| null| Bus| 
700|\n", "|11-62-j19-1| 7031| 062| null| Bus| 700|\n", "|24-64-j19-1| 801| 064| null| Bus| 700|\n", "|11-83-j19-1| 801| 083| null| Bus| 700|\n", "|1-1-B-j19-1| 11| 1| null| S-Bahn| 400|\n", "|1-1-A-j19-1| 11| 1| null| S-Bahn| 400|\n", "|1-1-C-j19-1| 723| 1| null| Bus| 700|\n", "|1-1-D-j19-1| 840| 1| null| Bus| 700|\n", "|1-1-E-j19-1| 886| 1| null| Bus| 700|\n", "| 1-1-j19-1| 11| 1| null| Intercity| 102|\n", "| 4-1-j19-1| 11| 1| null| S-Bahn| 400|\n", "| 5-1-j19-1| 823| 1| null| Tram| 900|\n", "|6-1-A-j19-1| 146| 1| null| Bus| 700|\n", "|6-1-B-j19-1| 33| 1| null| S-Bahn| 400|\n", "|6-1-C-j19-1| 801| 1| null| Bus| 700|\n", "|6-1-D-j19-1| 889| 1| null| Bus| 700|\n", "|6-1-E-j19-1| 889| 1| null| Bus| 700|\n", "| 6-1-j19-1| 11| 1| null| Intercity| 102|\n", "|8-1-A-j19-1| 834| 1| null| Bus| 700|\n", "+-----------+---------+----------------+---------------+----------+----------+\n", "only showing top 20 rows" ] } ], "source": [ "routes = spark.read.csv(\"/data/sbb/timetables/csv/routes/2019/05/14/routes.txt\", header=True, sep = \",\")\n", "routes.show()" ] }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 70, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-----------+----------+\n", "| route_id|route_desc|\n", "+-----------+----------+\n", "|11-40-j19-1| Bus|\n", "|11-61-j19-1| Bus|\n", "|11-62-j19-1| Bus|\n", "|24-64-j19-1| Bus|\n", "|11-83-j19-1| Bus|\n", "|1-1-B-j19-1| S-Bahn|\n", "|1-1-A-j19-1| S-Bahn|\n", "|1-1-C-j19-1| Bus|\n", "|1-1-D-j19-1| Bus|\n", "|1-1-E-j19-1| Bus|\n", "| 1-1-j19-1| Intercity|\n", "| 4-1-j19-1| S-Bahn|\n", "| 5-1-j19-1| Tram|\n", "|6-1-A-j19-1| Bus|\n", "|6-1-B-j19-1| S-Bahn|\n", 
"|6-1-C-j19-1| Bus|\n", "|6-1-D-j19-1| Bus|\n", "|6-1-E-j19-1| Bus|\n", "| 6-1-j19-1| Intercity|\n", "|8-1-A-j19-1| Bus|\n", "+-----------+----------+\n", "only showing top 20 rows" ] } ], "source": [ "routes_for_join = routes.select(routes.route_id, routes.route_desc)\n", "routes_for_join.show()" ] }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 71, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "+------------+---------------+--------------------+------------+------------+--------------+-------------+--------------------+----------------+----------------+----------------+---------------+------------+--------------------+---------+----------+--------+----------+\n", - "| route_id|stop_id_general| trip_id| stop_id|arrival_time|departure_time|stop_sequence| stop_name| stop_lat| stop_lon| trip_headsign|trip_short_name|direction_id|departure_first_stop|route_int|stop_count|stop_int|route_desc|\n", - "+------------+---------------+--------------------+------------+------------+--------------+-------------+--------------------+----------------+----------------+----------------+---------------+------------+--------------------+---------+----------+--------+----------+\n", - "|26-4-B-j19-1| 8503086|127.TA.26-4-B-j19...| 8503086| 08:14:00| 08:14:00| 5| Zürich Brunau| 47.352122370277|8.52623375626752|Langnau-Gattikon| 12473| 0| 08:08:00| 1316| 10| 1373| S-Bahn|\n", - "|26-4-B-j19-1| 8503093|127.TA.26-4-B-j19...| 8503093| 08:17:00| 08:17:00| 6| Zürich Manegg|47.3383783847121|8.51967605469254|Langnau-Gattikon| 12473| 0| 08:08:00| 1316| 10| 510| S-Bahn|\n", - "|26-4-B-j19-1| 8503094|127.TA.26-4-B-j19...| 8503094| 
08:18:00| 08:18:00| 7| Zürich Leimbach|47.3346039449498| 8.5196041894698|Langnau-Gattikon| 12473| 0| 08:08:00| 1316| 10| 533| S-Bahn|\n", - "|26-4-B-j19-1| 8503095|127.TA.26-4-B-j19...| 8503095| 08:21:00| 08:21:00| 8| Sood-Oberleimbach|47.3195704752201|8.52143675264966|Langnau-Gattikon| 12473| 0| 08:08:00| 1316| 10| 1343| S-Bahn|\n", - "|26-4-B-j19-1| 8503096|127.TA.26-4-B-j19...| 8503096| 08:23:00| 08:24:00| 9| Adliswil|47.3123048960724|8.52416763111376|Langnau-Gattikon| 12473| 0| 08:08:00| 1316| 10| 239| S-Bahn|\n", - "|26-4-B-j19-1| 8503097|127.TA.26-4-B-j19...| 8503097| 08:30:00| 08:30:00| 10| Langnau-Gattikon|47.2868766420081|8.54402039889557|Langnau-Gattikon| 12473| 0| 08:08:00| 1316| 10| 669| S-Bahn|\n", - "|26-4-B-j19-1| 8503088|128.TA.26-4-B-j19...|8503088:0:21| 08:28:00| 08:28:00| 1| Zürich HB SZU|47.3775557344462|8.53916949636064|Langnau-Gattikon| 12477| 0| 08:28:00| 1316| 10| 0| S-Bahn|\n", - "|26-4-B-j19-1| 8503090|128.TA.26-4-B-j19...| 8503090| 08:29:00| 08:29:00| 2| Zürich Selnau|47.3729384820921|8.53203687300374|Langnau-Gattikon| 12477| 0| 08:28:00| 1316| 10| 48| S-Bahn|\n", - "|26-4-B-j19-1| 8503091|128.TA.26-4-B-j19...| 8503091| 08:31:00| 08:31:00| 3| Zürich Giesshübel|47.3624553927874|8.52184997768041|Langnau-Gattikon| 12477| 0| 08:28:00| 1316| 10| 114| S-Bahn|\n", - "|26-4-B-j19-1| 8503087|128.TA.26-4-B-j19...| 8503087| 08:33:00| 08:33:00| 4|Zürich Saalsporth...| 47.357404757095|8.52214642172421|Langnau-Gattikon| 12477| 0| 08:28:00| 1316| 10| 262| S-Bahn|\n", - "|26-4-B-j19-1| 8503086|128.TA.26-4-B-j19...| 8503086| 08:34:00| 08:34:00| 5| Zürich Brunau| 47.352122370277|8.52623375626752|Langnau-Gattikon| 12477| 0| 08:28:00| 1316| 10| 1373| S-Bahn|\n", - "|26-4-B-j19-1| 8503093|128.TA.26-4-B-j19...| 8503093| 08:37:00| 08:37:00| 6| Zürich Manegg|47.3383783847121|8.51967605469254|Langnau-Gattikon| 12477| 0| 08:28:00| 1316| 10| 510| S-Bahn|\n", - "|26-4-B-j19-1| 8503094|128.TA.26-4-B-j19...| 8503094| 08:38:00| 08:38:00| 7| Zürich 
Leimbach|47.3346039449498| 8.5196041894698|Langnau-Gattikon| 12477| 0| 08:28:00| 1316| 10| 533| S-Bahn|\n", - "|26-4-B-j19-1| 8503095|128.TA.26-4-B-j19...| 8503095| 08:41:00| 08:41:00| 8| Sood-Oberleimbach|47.3195704752201|8.52143675264966|Langnau-Gattikon| 12477| 0| 08:28:00| 1316| 10| 1343| S-Bahn|\n", - "|26-4-B-j19-1| 8503096|128.TA.26-4-B-j19...| 8503096| 08:43:00| 08:44:00| 9| Adliswil|47.3123048960724|8.52416763111376|Langnau-Gattikon| 12477| 0| 08:28:00| 1316| 10| 239| S-Bahn|\n", - "|26-4-B-j19-1| 8503097|128.TA.26-4-B-j19...| 8503097| 08:50:00| 08:50:00| 10| Langnau-Gattikon|47.2868766420081|8.54402039889557|Langnau-Gattikon| 12477| 0| 08:28:00| 1316| 10| 669| S-Bahn|\n", - "|26-134-j19-1| 8503855|294.TA.26-134-j19...| 8503855:0:B| 07:04:00| 07:04:00| 1| Horgen, Bahnhof|47.2619787396278|8.59697608490178|Horgen, Risi/Dow| 2960| 0| 07:04:00| 1317| 6| 49| Bus|\n", - "|26-134-j19-1| 8577912|294.TA.26-134-j19...| 8577912| 07:06:00| 07:06:00| 2|Horgen, untere Mühle|47.2591865186515|8.59809897900709|Horgen, Risi/Dow| 2960| 0| 07:04:00| 1317| 6| 247| Bus|\n", - "|26-134-j19-1| 8590663|294.TA.26-134-j19...| 8590663| 07:07:00| 07:07:00| 3| Horgen, Wannenthal|47.2565465760621|8.60232106084304|Horgen, Risi/Dow| 2960| 0| 07:04:00| 1317| 6| 1334| Bus|\n", - "|26-134-j19-1| 8590661|294.TA.26-134-j19...| 8590661| 07:08:00| 07:08:00| 4| Horgen, Teufenbach| 47.256253918433|8.60507888876567|Horgen, Risi/Dow| 2960| 0| 07:04:00| 1317| 6| 356| Bus|\n", - "+------------+---------------+--------------------+------------+------------+--------------+-------------+--------------------+----------------+----------------+----------------+---------------+------------+--------------------+---------+----------+--------+----------+\n", + 
"+------------+---------------+--------------------+-------+------------+--------------+-------------+--------------------+----------------+----------------+--------------------+---------------+------------+--------------------+---------+----------+--------+----------+\n", + "| route_id|stop_id_general| trip_id|stop_id|arrival_time|departure_time|stop_sequence| stop_name| stop_lat| stop_lon| trip_headsign|trip_short_name|direction_id|departure_first_stop|route_int|stop_count|stop_int|route_desc|\n", + "+------------+---------------+--------------------+-------+------------+--------------+-------------+--------------------+----------------+----------------+--------------------+---------------+------------+--------------------+---------+----------+--------+----------+\n", + "|26-185-j19-1| 8590679|610.TA.26-185-j19...|8590679| 10:56:00| 10:56:00| 5|Kilchberg ZH, Spital|47.3217079365566|8.53537860586113|Zürich Wollishofe...| 9270| 1| 10:51:00| 21| 14| 65| Bus|\n", + "|26-185-j19-1| 8590468|610.TA.26-185-j19...|8590468| 10:58:00| 10:58:00| 6| Adliswil, Eichenweg|47.3200028461711|8.53368977312675|Zürich Wollishofe...| 9270| 1| 10:51:00| 21| 14| 1171| Bus|\n", + "|26-185-j19-1| 8590477|610.TA.26-185-j19...|8590477| 10:59:00| 10:59:00| 7| Adliswil, Moos|47.3254528526241|8.53104872619107|Zürich Wollishofe...| 9270| 1| 10:51:00| 21| 14| 15| Bus|\n", + "|26-185-j19-1| 8591388|610.TA.26-185-j19...|8591388| 11:00:00| 11:00:00| 8| Zürich, Sunnau|47.3270847004784|8.52961142173628|Zürich Wollishofe...| 9270| 1| 10:51:00| 21| 14| 1234| Bus|\n", + "|26-185-j19-1| 8591111|610.TA.26-185-j19...|8591111| 11:02:00| 11:02:00| 9|Zürich, Dangelstr...|47.3345491527085|8.52997973100282|Zürich Wollishofe...| 9270| 1| 10:51:00| 21| 14| 433| Bus|\n", + "|26-185-j19-1| 8591439|610.TA.26-185-j19...|8591439| 11:03:00| 11:03:00| 10| Zürich, Wollishofen|47.3384392605619|8.53015939405967|Zürich Wollishofe...| 9270| 1| 10:51:00| 21| 14| 552| Bus|\n", + "|26-185-j19-1| 
8591106|610.TA.26-185-j19...|8591106| 11:03:00| 11:03:00| 11|Zürich, Butzenstr...|47.3414099167461|8.53031210765799|Zürich Wollishofe...| 9270| 1| 10:51:00| 21| 14| 1037| Bus|\n", + "|26-185-j19-1| 8591279|610.TA.26-185-j19...|8591279| 11:05:00| 11:05:00| 12| Zürich, Morgental|47.3439482343686|8.53014142775399|Zürich Wollishofe...| 9270| 1| 10:51:00| 21| 14| 1349| Bus|\n", + "|26-185-j19-1| 8591304|610.TA.26-185-j19...|8591304| 11:06:00| 11:06:00| 13|Zürich, Post Woll...|47.3444717091534|8.53296213774651|Zürich Wollishofe...| 9270| 1| 10:51:00| 21| 14| 982| Bus|\n", + "|26-185-j19-1| 8502495|610.TA.26-185-j19...|8502495| 11:07:00| 11:07:00| 14|Zürich Wollishofe...|47.3476976601166|8.53331248070737|Zürich Wollishofe...| 9270| 1| 10:51:00| 21| 14| 116| Bus|\n", + "|26-3-A-j19-1| 8591036|610.TA.26-3-A-j19...|8591036| 08:18:00| 08:18:00| 1| Zürich, Albisrieden|47.3743863596743|8.48478548905248| Zürich, Klusplatz| 1059| 0| 08:18:00| 1282| 21| 1356| Tram|\n", + "|26-3-A-j19-1| 8591126|610.TA.26-3-A-j19...|8591126| 08:20:00| 08:20:00| 2|Zürich, Fellenber...|47.3757186152716|8.48841468280083| Zürich, Klusplatz| 1059| 0| 08:18:00| 1282| 21| 744| Tram|\n", + "|26-3-A-j19-1| 8591363|610.TA.26-3-A-j19...|8591363| 08:21:00| 08:21:00| 3| Zürich, Siemens|47.3785837632719|8.49449627727516| Zürich, Klusplatz| 1059| 0| 08:18:00| 1282| 21| 264| Tram|\n", + "|26-3-A-j19-1| 8591203|610.TA.26-3-A-j19...|8591203| 08:23:00| 08:23:00| 4| Zürich, Hubertus|47.3768744261929|8.49947294394988| Zürich, Klusplatz| 1059| 0| 08:18:00| 1282| 21| 569| Tram|\n", + "|26-3-A-j19-1| 8591236|610.TA.26-3-A-j19...|8591236| 08:24:00| 08:24:00| 5|Zürich, Krematori...|47.3778051397306|8.50787219185756| Zürich, Klusplatz| 1059| 0| 08:18:00| 1282| 21| 368| Tram|\n", + "|26-3-A-j19-1| 8591038|610.TA.26-3-A-j19...|8591038| 08:25:00| 08:25:00| 6|Zürich, Albisried...|47.3782127019246|8.51039645780629| Zürich, Klusplatz| 1059| 0| 08:18:00| 1282| 21| 1190| Tram|\n", + "|26-3-A-j19-1| 
8591448|610.TA.26-3-A-j19...|8591448| 08:26:00| 08:26:00| 7|Zürich, Zypressen...|47.3766919314159|8.51376514012221| Zürich, Klusplatz| 1059| 0| 08:18:00| 1282| 21| 773| Tram|\n", + "|26-3-A-j19-1| 8591259|610.TA.26-3-A-j19...|8591259| 08:28:00| 08:28:00| 8| Zürich, Lochergut|47.3753475337612|8.51791535673542| Zürich, Klusplatz| 1059| 0| 08:18:00| 1282| 21| 653| Tram|\n", + "|26-3-A-j19-1| 8591218|610.TA.26-3-A-j19...|8591218| 08:29:00| 08:29:00| 9|Zürich,Kalkbreite...|47.3745992794953|8.52100556131322| Zürich, Klusplatz| 1059| 0| 08:18:00| 1282| 21| 247| Tram|\n", + "|26-3-A-j19-1| 8591079|610.TA.26-3-A-j19...|8591079| 08:30:00| 08:30:00| 10|Zürich, Bezirksge...|47.3741673555329|8.52556001980434| Zürich, Klusplatz| 1059| 0| 08:18:00| 1282| 21| 1062| Tram|\n", + "+------------+---------------+--------------------+-------+------------+--------------+-------------+--------------------+----------------+----------------+--------------------+---------------+------------+--------------------+---------+----------+--------+----------+\n", "only showing top 20 rows" ] } ], "source": [ "stop_times = stop_times.join(routes_for_join, how='inner', on='route_id')\n", "stop_times.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## VERY IMPORTANT: final sort before saving to csv" + "## VERY IMPORTANT: final sort before writing to csv" ] }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 72, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - 
"+-------------+---------------+------------------------+-------+------------+--------------+-------------+------------------------------+----------------+----------------+---------------------------+---------------+------------+--------------------+---------+----------+--------+----------+\n", - "|route_id |stop_id_general|trip_id |stop_id|arrival_time|departure_time|stop_sequence|stop_name |stop_lat |stop_lon |trip_headsign |trip_short_name|direction_id|departure_first_stop|route_int|stop_count|stop_int|route_desc|\n", - "+-------------+---------------+------------------------+-------+------------+--------------+-------------+------------------------------+----------------+----------------+---------------------------+---------------+------------+--------------------+---------+----------+--------+----------+\n", - "|26-10-j19-1 |8573205 |1672.TA.26-10-j19-1.11.R|8573205|07:00:00 |07:01:00 |27 |Zürich Flughafen, Bahnhof |47.4504413038344|8.56372943623189|Zürich Flughafen, Fracht |4096 |1 |07:01:00 |0 |2 |298 |Tram |\n", - "|26-10-j19-1 |8588553 |1672.TA.26-10-j19-1.11.R|8588553|07:02:00 |07:02:00 |28 |Zürich Flughafen, Fracht |47.4524944976638|8.57205681891684|Zürich Flughafen, Fracht |4096 |1 |07:01:00 |0 |2 |1295 |Tram |\n", - "|26-13-j19-1 |8576240 |2064.TA.26-13-j19-1.24.H|8576240|07:00:00 |07:00:00 |5 |Zürich, Meierhofplatz |47.4020100860391|8.49937412926861|Zürich, Albisgütli |1831 |0 |07:00:00 |1 |26 |1222 |Tram |\n", - "|26-13-j19-1 |8591353 |2064.TA.26-13-j19-1.24.H|8591353|07:01:00 |07:01:00 |6 |Zürich, Schwert |47.3997299435837|8.50461130737576|Zürich, Albisgütli |1831 |0 |07:00:00 |1 |26 |816 |Tram |\n", - "|26-13-j19-1 |8591039 |2064.TA.26-13-j19-1.24.H|8591039|07:02:00 |07:02:00 |7 |Zürich, Alte Trotte |47.3977659017765|8.50725235431143|Zürich, Albisgütli |1831 |0 |07:00:00 |1 |26 |778 |Tram |\n", - "|26-13-j19-1 |8591121 |2064.TA.26-13-j19-1.24.H|8591121|07:03:00 |07:03:00 |8 |Zürich, Eschergutweg |47.3962700189648|8.51204037477646|Zürich, Albisgütli 
|1831 |0 |07:00:00 |1 |26 |313 |Tram |\n", - "|26-13-j19-1 |8591417 |2064.TA.26-13-j19-1.24.H|8591417|07:05:00 |07:05:00 |9 |Zürich, Waidfussweg |47.3954977376399|8.51840044698891|Zürich, Albisgütli |1831 |0 |07:00:00 |1 |26 |350 |Tram |\n", - "|26-13-j19-1 |8591437 |2064.TA.26-13-j19-1.24.H|8591437|07:06:00 |07:06:00 |10 |Zürich, Wipkingerplatz |47.3925909395293|8.52357474302616|Zürich, Albisgütli |1831 |0 |07:00:00 |1 |26 |1019 |Tram |\n", - "|26-13-j19-1 |8580522 |2064.TA.26-13-j19-1.24.H|8580522|07:08:00 |07:08:00 |11 |Zürich, Escher-Wyss-Platz |47.3907969150758|8.5223979500038 |Zürich, Albisgütli |1831 |0 |07:00:00 |1 |26 |454 |Tram |\n", - "|26-13-j19-1 |8591110 |2064.TA.26-13-j19-1.24.H|8591110|07:09:00 |07:09:00 |12 |Zürich, Dammweg |47.3884919601296|8.52639545301869|Zürich, Albisgütli |1831 |0 |07:00:00 |1 |26 |1101 |Tram |\n", - "|26-13-j19-1 |8591306 |2064.TA.26-13-j19-1.24.H|8591306|07:10:00 |07:10:00 |13 |Zürich, Quellenstrasse |47.3867403702341|8.52874903906341|Zürich, Albisgütli |1831 |0 |07:00:00 |1 |26 |786 |Tram |\n", - "|26-13-j19-1 |8591257 |2064.TA.26-13-j19-1.24.H|8591257|07:11:00 |07:11:00 |14 |Zürich, Limmatplatz |47.3845994590919|8.53162364797299|Zürich, Albisgütli |1831 |0 |07:00:00 |1 |26 |388 |Tram |\n", - "|26-13-j19-1 |8591282 |2064.TA.26-13-j19-1.24.H|8591282|07:12:00 |07:12:00 |15 |Zürich, Museum für Gestaltung |47.3821239221899|8.53493843137185|Zürich, Albisgütli |1831 |0 |07:00:00 |1 |26 |138 |Tram |\n", - "|26-13-j19-1 |8591368 |2064.TA.26-13-j19-1.24.H|8591368|07:14:00 |07:14:00 |16 |Zürich, Sihlquai/HB |47.3798733332196|8.53760642776606|Zürich, Albisgütli |1831 |0 |07:00:00 |1 |26 |880 |Tram |\n", - "|26-13-j19-1 |8587349 |2064.TA.26-13-j19-1.24.H|8587349|07:16:00 |07:16:00 |17 |Zürich, Bahnhofquai/HB |47.3775618175159|8.54173867807358|Zürich, Albisgütli |1831 |0 |07:00:00 |1 |26 |602 |Tram |\n", - "|26-13-j19-1 |8591067 |2064.TA.26-13-j19-1.24.H|8591067|07:18:00 |07:18:00 |18 |Zürich, Bahnhofstrasse/HB 
|47.3765581015114|8.53994204750509|Zürich, Albisgütli |1831 |0 |07:00:00 |1 |26 |616 |Tram |\n", - "|26-13-j19-1 |8591316 |2064.TA.26-13-j19-1.24.H|8591316|07:20:00 |07:20:00 |19 |Zürich, Rennweg |47.3730662375955|8.53845982728609|Zürich, Albisgütli |1831 |0 |07:00:00 |1 |26 |375 |Tram |\n", - "|26-13-j19-1 |8591299 |2064.TA.26-13-j19-1.24.H|8591299|07:22:00 |07:22:00 |20 |Zürich, Paradeplatz |47.3693672863583|8.53876525448273|Zürich, Albisgütli |1831 |0 |07:00:00 |1 |26 |1221 |Tram |\n", - "|26-13-j19-1 |8591384 |2064.TA.26-13-j19-1.24.H|8591384|07:23:00 |07:23:00 |21 |Zürich, Stockerstrasse |47.3677002399791|8.53501029659459|Zürich, Albisgütli |1831 |0 |07:00:00 |1 |26 |871 |Tram |\n", - "|26-13-j19-1 |8591404 |2064.TA.26-13-j19-1.24.H|8591404|07:24:00 |07:24:00 |22 |Zürich, Tunnelstrasse |47.3661426599847|8.53253094641008|Zürich, Albisgütli |1831 |0 |07:00:00 |1 |26 |888 |Tram |\n", - "|26-13-j19-1 |8591059 |2064.TA.26-13-j19-1.24.H|8591059|07:25:00 |07:25:00 |23 |Zürich Enge, Bahnhof/Bederstr.|47.3645546111557|8.53045583810347|Zürich, Albisgütli |1831 |0 |07:00:00 |1 |26 |175 |Tram |\n", - "|26-13-j19-1 |8591415 |2064.TA.26-13-j19-1.24.H|8591415|07:27:00 |07:27:00 |24 |Zürich, Waffenplatzstrasse |47.3614818138862|8.52574866601403|Zürich, Albisgütli |1831 |0 |07:00:00 |1 |26 |1267 |Tram |\n", - "|26-13-j19-1 |8591366 |2064.TA.26-13-j19-1.24.H|8591366|07:28:00 |07:28:00 |25 |Zürich, Sihlcity Nord |47.3600640074787|8.52303575385561|Zürich, Albisgütli |1831 |0 |07:00:00 |1 |26 |968 |Tram |\n", - "|26-13-j19-1 |8591329 |2064.TA.26-13-j19-1.24.H|8591329|07:29:00 |07:29:00 |26 |Zürich, Saalsporthalle |47.3578611597087|8.52040369007277|Zürich, Albisgütli |1831 |0 |07:00:00 |1 |26 |1236 |Tram |\n", - "|26-13-j19-1 |8591245 |2064.TA.26-13-j19-1.24.H|8591245|07:30:00 |07:30:00 |27 |Zürich, Laubegg |47.3587313564196|8.51708890667391|Zürich, Albisgütli |1831 |0 |07:00:00 |1 |26 |1215 |Tram |\n", - "|26-13-j19-1 |8591405 |2064.TA.26-13-j19-1.24.H|8591405|07:32:00 |07:32:00 
|28 |Zürich, Uetlihof |47.3567353594536|8.51396276948474|Zürich, Albisgütli |1831 |0 |07:00:00 |1 |26 |847 |Tram |\n", - "|26-13-j19-1 |8591385 |2064.TA.26-13-j19-1.24.H|8591385|07:33:00 |07:33:00 |29 |Zürich, Strassenverkehrsamt |47.3530717783138|8.51171698127413|Zürich, Albisgütli |1831 |0 |07:00:00 |1 |26 |272 |Tram |\n", - "|26-13-j19-1 |8591034 |2064.TA.26-13-j19-1.24.H|8591034|07:34:00 |07:34:00 |30 |Zürich, Albisgütli |47.3519945640447|8.5077104951064 |Zürich, Albisgütli |1831 |0 |07:00:00 |1 |26 |1352 |Tram |\n", - "|26-11-A-j19-1|8591049 |791.TA.26-11-A-j19-1.3.H|8591049|19:49:00 |19:49:00 |1 |Zürich, Auzelg |47.4166918393693|8.568113214819 |Zürich, Rehalp |363 |0 |19:49:00 |2 |8 |1117 |Tram |\n", - "|26-11-A-j19-1|8591128 |791.TA.26-11-A-j19-1.3.H|8591128|19:51:00 |19:51:00 |2 |Zürich, Fernsehstudio |47.4181749855684|8.56174415945371|Zürich, Rehalp |363 |0 |19:49:00 |2 |8 |152 |Tram |\n", - "|26-11-A-j19-1|8591830 |791.TA.26-11-A-j19-1.3.H|8591830|19:52:00 |19:52:00 |3 |Glattpark, Glattpark |47.4199559214972|8.55716275150406|Zürich, Rehalp |363 |0 |19:49:00 |2 |8 |672 |Tram |\n", - "|26-11-A-j19-1|8591294 |791.TA.26-11-A-j19-1.3.H|8591294|19:53:00 |19:53:00 |4 |Zürich, Oerlikerhus |47.4175853791724|8.5542072942189 |Zürich, Rehalp |363 |0 |19:49:00 |2 |8 |571 |Tram |\n", - "|26-11-A-j19-1|8591256 |791.TA.26-11-A-j19-1.3.H|8591256|19:54:00 |19:54:00 |5 |Zürich, Leutschenbach |47.4146433269471|8.55130573585079|Zürich, Rehalp |363 |0 |19:49:00 |2 |8 |444 |Tram |\n", - "|26-11-A-j19-1|8591273 |791.TA.26-11-A-j19-1.3.H|8591273|19:55:00 |19:55:00 |6 |Zürich, Messe/Hallenstadion |47.4106919651348|8.55068589830466|Zürich, Rehalp |363 |0 |19:49:00 |2 |8 |389 |Tram |\n", - "|26-11-A-j19-1|8591382 |791.TA.26-11-A-j19-1.3.H|8591382|19:57:00 |19:57:00 |7 |Zürich, Sternen Oerlikon |47.4100718783688|8.54623025449481|Zürich, Rehalp |363 |0 |19:49:00 |2 |8 |688 |Tram |\n", - "|26-11-A-j19-1|8580449 |791.TA.26-11-A-j19-1.3.H|8580449|19:59:00 |19:59:00 |8 |Zürich Oerlikon, 
Bahnhof |47.411494419524 |8.54479295004002|Zürich, Rehalp |363 |0 |19:49:00 |2 |8 |766 |Tram |\n", - "|26-304-j19-1 |8591057 |159.TA.26-304-j19-1.4.R |8591057|19:39:00 |19:39:00 |1 |Zürich Altstetten, Bahnhof N |47.392067942097 |8.48990588617267|Dietikon, Bahnhof |5481 |1 |19:39:00 |3 |17 |520 |Bus |\n", - "|26-304-j19-1 |8591402 |159.TA.26-304-j19-1.4.R |8591402|19:41:00 |19:41:00 |2 |Zürich, Tüffenwies |47.3979787271809|8.49434356367684|Dietikon, Bahnhof |5481 |1 |19:39:00 |3 |17 |999 |Bus |\n", - "|26-304-j19-1 |8591434 |159.TA.26-304-j19-1.4.R |8591434|19:41:00 |19:41:00 |3 |Zürich, Winzerhalde |47.4000582901792|8.4945681424979 |Dietikon, Bahnhof |5481 |1 |19:39:00 |3 |17 |710 |Bus |\n", - "|26-304-j19-1 |8591197 |159.TA.26-304-j19-1.4.R |8591197|19:42:00 |19:42:00 |4 |Zürich, Hohenklingensteig |47.4013473348052|8.49021131336931|Dietikon, Bahnhof |5481 |1 |19:39:00 |3 |17 |1123 |Bus |\n", - "|26-304-j19-1 |8591436 |159.TA.26-304-j19-1.4.R |8591436|19:43:00 |19:43:00 |5 |Zürich, Winzerstrasse Süd |47.403372044054 |8.486123978826 |Dietikon, Bahnhof |5481 |1 |19:39:00 |3 |17 |147 |Bus |\n", - "|26-304-j19-1 |8591136 |159.TA.26-304-j19-1.4.R |8591136|19:46:00 |19:46:00 |6 |Zürich, Frankental |47.4057006674825|8.48137189097235|Dietikon, Bahnhof |5481 |1 |19:39:00 |3 |17 |689 |Bus |\n", - "|26-304-j19-1 |8590725 |159.TA.26-304-j19-1.4.R |8590725|19:47:00 |19:47:00 |7 |Oberengstringen, Eggbühl |47.4055243523393|8.47408655401713|Dietikon, Bahnhof |5481 |1 |19:39:00 |3 |17 |1325 |Bus |\n", - "|26-304-j19-1 |8590726 |159.TA.26-304-j19-1.4.R |8590726|19:48:00 |19:48:00 |8 |Oberengstringen, Lanzrain |47.407342193939 |8.46795106062573|Dietikon, Bahnhof |5481 |1 |19:39:00 |3 |17 |759 |Bus |\n", - "|26-304-j19-1 |8590728 |159.TA.26-304-j19-1.4.R |8590728|19:49:00 |19:49:00 |9 |Oberengstringen, Zentrum |47.4091295756792|8.46260608468448|Dietikon, Bahnhof |5481 |1 |19:39:00 |3 |17 |373 |Bus |\n", - "|26-304-j19-1 |8590727 |159.TA.26-304-j19-1.4.R |8590727|19:50:00 |19:50:00 |10 
|Oberengstringen, Paradies |47.4104852703573|8.45874332896223|Dietikon, Bahnhof |5481 |1 |19:39:00 |3 |17 |735 |Bus |\n", - "|26-304-j19-1 |8590833 |159.TA.26-304-j19-1.4.R |8590833|19:51:00 |19:51:00 |11 |Unterengstringen, Langacher |47.4122360710415|8.45316479104707|Dietikon, Bahnhof |5481 |1 |19:39:00 |3 |17 |760 |Bus |\n", - "|26-304-j19-1 |8594732 |159.TA.26-304-j19-1.4.R |8594732|19:53:00 |19:53:00 |12 |Unterengstringen, Sennenbüel N|47.4134944230824|8.44931101847766|Dietikon, Bahnhof |5481 |1 |19:39:00 |3 |17 |987 |Bus |\n", - "|26-304-j19-1 |8590831 |159.TA.26-304-j19-1.4.R |8590831|19:53:00 |19:53:00 |13 |Unterengstringen, Aegelsee |47.414977659342 |8.44603216769017|Dietikon, Bahnhof |5481 |1 |19:39:00 |3 |17 |503 |Bus |\n", - "|26-304-j19-1 |8590911 |159.TA.26-304-j19-1.4.R |8590911|19:55:00 |19:55:00 |14 |Weiningen ZH, Ausserdorf |47.4176826342903|8.43953734818508|Dietikon, Bahnhof |5481 |1 |19:39:00 |3 |17 |751 |Bus |\n", - "|26-304-j19-1 |8590913 |159.TA.26-304-j19-1.4.R |8590913|19:56:00 |19:56:00 |15 |Weiningen ZH, Lindenplatz |47.4195547602987|8.43394084396424|Dietikon, Bahnhof |5481 |1 |19:39:00 |3 |17 |933 |Bus |\n", - "|26-304-j19-1 |8590914 |159.TA.26-304-j19-1.4.R |8590914|19:57:00 |19:57:00 |16 |Weiningen ZH, Schulhaus |47.4183512583635|8.42866773324572|Dietikon, Bahnhof |5481 |1 |19:39:00 |3 |17 |623 |Bus |\n", - "|26-304-j19-1 |8590617 |159.TA.26-304-j19-1.4.R |8590617|19:59:00 |19:59:00 |17 |Geroldswil, Welbrig |47.4180716529658|8.41906474285715|Dietikon, Bahnhof |5481 |1 |19:39:00 |3 |17 |856 |Bus |\n", - "|26-70-A-j19-1|8591061 |966.TA.26-70-A-j19-1.5.H|8591061|07:00:00 |07:00:00 |9 |Zürich Leimbach, Bahnhof |47.3332523864039|8.51859807635144|Zürich, Mittelleimbach |3928 |0 |07:00:00 |4 |5 |1203 |Bus |\n", - "|26-70-A-j19-1|8591270 |966.TA.26-70-A-j19-1.5.H|8591270|07:02:00 |07:02:00 |10 |Zürich, Marbachweg |47.3303482449491|8.51537312448101|Zürich, Mittelleimbach |3928 |0 |07:00:00 |4 |5 |1197 |Bus |\n", - "|26-70-A-j19-1|8591210 
|966.TA.26-70-A-j19-1.5.H|8591210|07:03:00 |07:03:00 |11 |Zürich, Im Hüsli |47.3282354882425|8.51269614493396|Zürich, Mittelleimbach |3928 |0 |07:00:00 |4 |5 |723 |Bus |\n", - "|26-70-A-j19-1|8591370 |966.TA.26-70-A-j19-1.5.H|8591370|07:03:00 |07:03:00 |12 |Zürich, Sihlweidstrasse |47.3264149182794|8.51466345540645|Zürich, Mittelleimbach |3928 |0 |07:00:00 |4 |5 |989 |Bus |\n", - "|26-70-A-j19-1|8591278 |966.TA.26-70-A-j19-1.5.H|8591278|07:04:00 |07:04:00 |13 |Zürich, Mittelleimbach |47.3231389520848|8.51428616298707|Zürich, Mittelleimbach |3928 |0 |07:00:00 |4 |5 |139 |Bus |\n", - "|26-61-j19-1 |8591281 |269.TA.26-61-j19-1.1.H |8591281|19:57:00 |19:57:00 |1 |Zürich, Mühlacker |47.4256326325821|8.49799970688372|Zürich, Schwamendingerplatz|2076 |0 |19:57:00 |5 |2 |208 |Bus |\n", - "|26-61-j19-1 |8591046 |269.TA.26-61-j19-1.1.H |8591046|19:58:00 |19:58:00 |2 |Zürich, Aspholz |47.425085652811 |8.50058685490234|Zürich, Schwamendingerplatz|2076 |0 |19:57:00 |5 |2 |1002 |Bus |\n", - "|26-703-j19-1 |8591825 |179.TA.26-703-j19-1.2.R |8591825|07:10:00 |07:10:00 |1 |Benglen, Bodenacher |47.3611288870976|8.63861299832652|Zürich, Klusplatz |9385 |1 |07:10:00 |6 |9 |587 |Bus |\n", - "|26-703-j19-1 |8590504 |179.TA.26-703-j19-1.2.R |8590504|07:11:00 |07:11:00 |2 |Benglen, Gerlisbrunnen |47.3610862923255|8.63360938219328|Zürich, Klusplatz |9385 |1 |07:10:00 |6 |9 |865 |Bus |\n", - "|26-703-j19-1 |8596005 |179.TA.26-703-j19-1.2.R |8596005|07:14:00 |07:14:00 |3 |Binz bei Maur, Twäracher |47.3608915729295|8.623476385787 |Zürich, Klusplatz |9385 |1 |07:10:00 |6 |9 |1370 |Bus |\n", - "|26-703-j19-1 |8591832 |179.TA.26-703-j19-1.2.R |8591832|07:14:00 |07:14:00 |4 |Pfaffhausen, Müseren |47.3626987847054|8.61754750491098|Zürich, Klusplatz |9385 |1 |07:10:00 |6 |9 |1024 |Bus |\n", - "|26-703-j19-1 |8591147 |179.TA.26-703-j19-1.2.R |8591147|07:16:00 |07:16:00 |5 |Zürich, Friedhof Witikon |47.3613418604422|8.60282411740221|Zürich, Klusplatz |9385 |1 |07:10:00 |6 |9 |1263 |Bus |\n", - 
"|26-703-j19-1 |8591162 |179.TA.26-703-j19-1.2.R |8591162|07:17:00 |07:17:00 |6 |Zürich, Glockenacker |47.3609767627537|8.59930272148798|Zürich, Klusplatz |9385 |1 |07:10:00 |6 |9 |153 |Bus |\n", - "|26-703-j19-1 |8591261 |179.TA.26-703-j19-1.2.R |8591261|07:18:00 |07:18:00 |7 |Zürich, Loorenstrasse |47.3598631991991|8.59452368417579|Zürich, Klusplatz |9385 |1 |07:10:00 |6 |9 |1199 |Bus |\n", - "|26-703-j19-1 |8591107 |179.TA.26-703-j19-1.2.R |8591107|07:19:00 |07:19:00 |8 |Zürich, Carl-Spitteler-Strasse|47.3583236436636|8.58659156021591|Zürich, Klusplatz |9385 |1 |07:10:00 |6 |9 |1313 |Bus |\n", - "|26-703-j19-1 |8591233 |179.TA.26-703-j19-1.2.R |8591233|07:25:00 |07:25:00 |9 |Zürich, Klusplatz |47.3640374201824|8.56649624730736|Zürich, Klusplatz |9385 |1 |07:10:00 |6 |9 |1134 |Bus |\n", - "|26-703-j19-1 |8591825 |171.TA.26-703-j19-1.2.R |8591825|07:12:00 |07:12:00 |1 |Benglen, Bodenacher |47.3611288870976|8.63861299832652|Zürich, Klusplatz |9346 |1 |07:12:00 |6 |9 |587 |Bus |\n", - "|26-703-j19-1 |8590504 |171.TA.26-703-j19-1.2.R |8590504|07:13:00 |07:13:00 |2 |Benglen, Gerlisbrunnen |47.3610862923255|8.63360938219328|Zürich, Klusplatz |9346 |1 |07:12:00 |6 |9 |865 |Bus |\n", - "|26-703-j19-1 |8596005 |171.TA.26-703-j19-1.2.R |8596005|07:16:00 |07:16:00 |3 |Binz bei Maur, Twäracher |47.3608915729295|8.623476385787 |Zürich, Klusplatz |9346 |1 |07:12:00 |6 |9 |1370 |Bus |\n", - "|26-703-j19-1 |8591832 |171.TA.26-703-j19-1.2.R |8591832|07:16:00 |07:16:00 |4 |Pfaffhausen, Müseren |47.3626987847054|8.61754750491098|Zürich, Klusplatz |9346 |1 |07:12:00 |6 |9 |1024 |Bus |\n", - "|26-703-j19-1 |8591147 |171.TA.26-703-j19-1.2.R |8591147|07:18:00 |07:18:00 |5 |Zürich, Friedhof Witikon |47.3613418604422|8.60282411740221|Zürich, Klusplatz |9346 |1 |07:12:00 |6 |9 |1263 |Bus |\n", - "|26-703-j19-1 |8591162 |171.TA.26-703-j19-1.2.R |8591162|07:19:00 |07:19:00 |6 |Zürich, Glockenacker |47.3609767627537|8.59930272148798|Zürich, Klusplatz |9346 |1 |07:12:00 |6 |9 |153 |Bus |\n", 
- "|26-703-j19-1 |8591261 |171.TA.26-703-j19-1.2.R |8591261|07:20:00 |07:20:00 |7 |Zürich, Loorenstrasse |47.3598631991991|8.59452368417579|Zürich, Klusplatz |9346 |1 |07:12:00 |6 |9 |1199 |Bus |\n", - "|26-703-j19-1 |8591107 |171.TA.26-703-j19-1.2.R |8591107|07:21:00 |07:21:00 |8 |Zürich, Carl-Spitteler-Strasse|47.3583236436636|8.58659156021591|Zürich, Klusplatz |9346 |1 |07:12:00 |6 |9 |1313 |Bus |\n", - "|26-703-j19-1 |8591233 |171.TA.26-703-j19-1.2.R |8591233|07:27:00 |07:27:00 |9 |Zürich, Klusplatz |47.3640374201824|8.56649624730736|Zürich, Klusplatz |9346 |1 |07:12:00 |6 |9 |1134 |Bus |\n", - "|26-703-j19-1 |8591825 |155.TA.26-703-j19-1.2.R |8591825|07:25:00 |07:25:00 |1 |Benglen, Bodenacher |47.3611288870976|8.63861299832652|Zürich, Klusplatz |9267 |1 |07:25:00 |6 |9 |587 |Bus |\n", - "|26-703-j19-1 |8590504 |155.TA.26-703-j19-1.2.R |8590504|07:26:00 |07:26:00 |2 |Benglen, Gerlisbrunnen |47.3610862923255|8.63360938219328|Zürich, Klusplatz |9267 |1 |07:25:00 |6 |9 |865 |Bus |\n", - "|26-703-j19-1 |8596005 |155.TA.26-703-j19-1.2.R |8596005|07:29:00 |07:29:00 |3 |Binz bei Maur, Twäracher |47.3608915729295|8.623476385787 |Zürich, Klusplatz |9267 |1 |07:25:00 |6 |9 |1370 |Bus |\n", - "|26-703-j19-1 |8591832 |155.TA.26-703-j19-1.2.R |8591832|07:29:00 |07:29:00 |4 |Pfaffhausen, Müseren |47.3626987847054|8.61754750491098|Zürich, Klusplatz |9267 |1 |07:25:00 |6 |9 |1024 |Bus |\n", - "|26-703-j19-1 |8591147 |155.TA.26-703-j19-1.2.R |8591147|07:31:00 |07:31:00 |5 |Zürich, Friedhof Witikon |47.3613418604422|8.60282411740221|Zürich, Klusplatz |9267 |1 |07:25:00 |6 |9 |1263 |Bus |\n", - "|26-703-j19-1 |8591162 |155.TA.26-703-j19-1.2.R |8591162|07:32:00 |07:32:00 |6 |Zürich, Glockenacker |47.3609767627537|8.59930272148798|Zürich, Klusplatz |9267 |1 |07:25:00 |6 |9 |153 |Bus |\n", - "|26-703-j19-1 |8591261 |155.TA.26-703-j19-1.2.R |8591261|07:33:00 |07:33:00 |7 |Zürich, Loorenstrasse |47.3598631991991|8.59452368417579|Zürich, Klusplatz |9267 |1 |07:25:00 |6 |9 |1199 |Bus 
|\n", - "|26-703-j19-1 |8591107 |155.TA.26-703-j19-1.2.R |8591107|07:34:00 |07:34:00 |8 |Zürich, Carl-Spitteler-Strasse|47.3583236436636|8.58659156021591|Zürich, Klusplatz |9267 |1 |07:25:00 |6 |9 |1313 |Bus |\n", - "|26-703-j19-1 |8591233 |155.TA.26-703-j19-1.2.R |8591233|07:40:00 |07:40:00 |9 |Zürich, Klusplatz |47.3640374201824|8.56649624730736|Zürich, Klusplatz |9267 |1 |07:25:00 |6 |9 |1134 |Bus |\n", - "|26-703-j19-1 |8591825 |144.TA.26-703-j19-1.2.R |8591825|07:27:00 |07:27:00 |1 |Benglen, Bodenacher |47.3611288870976|8.63861299832652|Zürich, Klusplatz |9231 |1 |07:27:00 |6 |9 |587 |Bus |\n", - "|26-703-j19-1 |8590504 |144.TA.26-703-j19-1.2.R |8590504|07:28:00 |07:28:00 |2 |Benglen, Gerlisbrunnen |47.3610862923255|8.63360938219328|Zürich, Klusplatz |9231 |1 |07:27:00 |6 |9 |865 |Bus |\n", - "|26-703-j19-1 |8596005 |144.TA.26-703-j19-1.2.R |8596005|07:31:00 |07:31:00 |3 |Binz bei Maur, Twäracher |47.3608915729295|8.623476385787 |Zürich, Klusplatz |9231 |1 |07:27:00 |6 |9 |1370 |Bus |\n", - "|26-703-j19-1 |8591832 |144.TA.26-703-j19-1.2.R |8591832|07:31:00 |07:31:00 |4 |Pfaffhausen, Müseren |47.3626987847054|8.61754750491098|Zürich, Klusplatz |9231 |1 |07:27:00 |6 |9 |1024 |Bus |\n", - "|26-703-j19-1 |8591147 |144.TA.26-703-j19-1.2.R |8591147|07:33:00 |07:33:00 |5 |Zürich, Friedhof Witikon |47.3613418604422|8.60282411740221|Zürich, Klusplatz |9231 |1 |07:27:00 |6 |9 |1263 |Bus |\n", - "|26-703-j19-1 |8591162 |144.TA.26-703-j19-1.2.R |8591162|07:34:00 |07:34:00 |6 |Zürich, Glockenacker |47.3609767627537|8.59930272148798|Zürich, Klusplatz |9231 |1 |07:27:00 |6 |9 |153 |Bus |\n", - "|26-703-j19-1 |8591261 |144.TA.26-703-j19-1.2.R |8591261|07:35:00 |07:35:00 |7 |Zürich, Loorenstrasse |47.3598631991991|8.59452368417579|Zürich, Klusplatz |9231 |1 |07:27:00 |6 |9 |1199 |Bus |\n", - "|26-703-j19-1 |8591107 |144.TA.26-703-j19-1.2.R |8591107|07:36:00 |07:36:00 |8 |Zürich, Carl-Spitteler-Strasse|47.3583236436636|8.58659156021591|Zürich, Klusplatz |9231 |1 |07:27:00 |6 |9 
|1313 |Bus |\n", - "|26-703-j19-1 |8591233 |144.TA.26-703-j19-1.2.R |8591233|07:42:00 |07:42:00 |9 |Zürich, Klusplatz |47.3640374201824|8.56649624730736|Zürich, Klusplatz |9231 |1 |07:27:00 |6 |9 |1134 |Bus |\n", - "|26-703-j19-1 |8591825 |120.TA.26-703-j19-1.2.R |8591825|07:40:00 |07:40:00 |1 |Benglen, Bodenacher |47.3611288870976|8.63861299832652|Zürich, Klusplatz |9159 |1 |07:40:00 |6 |9 |587 |Bus |\n", - "|26-703-j19-1 |8590504 |120.TA.26-703-j19-1.2.R |8590504|07:41:00 |07:41:00 |2 |Benglen, Gerlisbrunnen |47.3610862923255|8.63360938219328|Zürich, Klusplatz |9159 |1 |07:40:00 |6 |9 |865 |Bus |\n", - "|26-703-j19-1 |8596005 |120.TA.26-703-j19-1.2.R |8596005|07:44:00 |07:44:00 |3 |Binz bei Maur, Twäracher |47.3608915729295|8.623476385787 |Zürich, Klusplatz |9159 |1 |07:40:00 |6 |9 |1370 |Bus |\n", - "|26-703-j19-1 |8591832 |120.TA.26-703-j19-1.2.R |8591832|07:44:00 |07:44:00 |4 |Pfaffhausen, Müseren |47.3626987847054|8.61754750491098|Zürich, Klusplatz |9159 |1 |07:40:00 |6 |9 |1024 |Bus |\n", - "+-------------+---------------+------------------------+-------+------------+--------------+-------------+------------------------------+----------------+----------------+---------------------------+---------------+------------+--------------------+---------+----------+--------+----------+\n", + "+-------------+---------------+------------------------+-------+------------+--------------+-------------+------------------------------+----------------+----------------+---------------------------+---------------+------------+--------------------+---------+----------+--------+----------+---------------------------+\n", + "|route_id |stop_id_general|trip_id |stop_id|arrival_time|departure_time|stop_sequence|stop_name |stop_lat |stop_lon |trip_headsign |trip_short_name|direction_id|departure_first_stop|route_int|stop_count|stop_int|route_desc|monotonically_increasing_id|\n", + 
"+-------------+---------------+------------------------+-------+------------+--------------+-------------+------------------------------+----------------+----------------+---------------------------+---------------+------------+--------------------+---------+----------+--------+----------+---------------------------+\n", + "|26-13-j19-1 |8576240 |2064.TA.26-13-j19-1.24.H|8576240|07:00:00 |07:00:00 |5 |Zürich, Meierhofplatz |47.4020100860391|8.49937412926861|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |1221 |Tram |0 |\n", + "|26-13-j19-1 |8591353 |2064.TA.26-13-j19-1.24.H|8591353|07:01:00 |07:01:00 |6 |Zürich, Schwert |47.3997299435837|8.50461130737576|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |816 |Tram |1 |\n", + "|26-13-j19-1 |8591039 |2064.TA.26-13-j19-1.24.H|8591039|07:02:00 |07:02:00 |7 |Zürich, Alte Trotte |47.3977659017765|8.50725235431143|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |776 |Tram |2 |\n", + "|26-13-j19-1 |8591121 |2064.TA.26-13-j19-1.24.H|8591121|07:03:00 |07:03:00 |8 |Zürich, Eschergutweg |47.3962700189648|8.51204037477646|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |307 |Tram |3 |\n", + "|26-13-j19-1 |8591417 |2064.TA.26-13-j19-1.24.H|8591417|07:05:00 |07:05:00 |9 |Zürich, Waidfussweg |47.3954977376399|8.51840044698891|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |347 |Tram |4 |\n", + "|26-13-j19-1 |8591437 |2064.TA.26-13-j19-1.24.H|8591437|07:06:00 |07:06:00 |10 |Zürich, Wipkingerplatz |47.3925909395293|8.52357474302616|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |1015 |Tram |5 |\n", + "|26-13-j19-1 |8580522 |2064.TA.26-13-j19-1.24.H|8580522|07:08:00 |07:08:00 |11 |Zürich, Escher-Wyss-Platz |47.3907969150758|8.5223979500038 |Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |455 |Tram |6 |\n", + "|26-13-j19-1 |8591110 |2064.TA.26-13-j19-1.24.H|8591110|07:09:00 |07:09:00 |12 |Zürich, Dammweg |47.3884919601296|8.52639545301869|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |1102 |Tram |7 |\n", + "|26-13-j19-1 |8591306 
|2064.TA.26-13-j19-1.24.H|8591306|07:10:00 |07:10:00 |13 |Zürich, Quellenstrasse |47.3867403702341|8.52874903906341|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |786 |Tram |8 |\n", + "|26-13-j19-1 |8591257 |2064.TA.26-13-j19-1.24.H|8591257|07:11:00 |07:11:00 |14 |Zürich, Limmatplatz |47.3845994590919|8.53162364797299|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |388 |Tram |9 |\n", + "|26-13-j19-1 |8591282 |2064.TA.26-13-j19-1.24.H|8591282|07:12:00 |07:12:00 |15 |Zürich, Museum für Gestaltung |47.3821239221899|8.53493843137185|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |138 |Tram |10 |\n", + "|26-13-j19-1 |8591368 |2064.TA.26-13-j19-1.24.H|8591368|07:14:00 |07:14:00 |16 |Zürich, Sihlquai/HB |47.3798733332196|8.53760642776606|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |880 |Tram |11 |\n", + "|26-13-j19-1 |8587349 |2064.TA.26-13-j19-1.24.H|8587349|07:16:00 |07:16:00 |17 |Zürich, Bahnhofquai/HB |47.3775618175159|8.54173867807358|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |602 |Tram |12 |\n", + "|26-13-j19-1 |8591067 |2064.TA.26-13-j19-1.24.H|8591067|07:18:00 |07:18:00 |18 |Zürich, Bahnhofstrasse/HB |47.3765581015114|8.53994204750509|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |616 |Tram |13 |\n", + "|26-13-j19-1 |8591316 |2064.TA.26-13-j19-1.24.H|8591316|07:20:00 |07:20:00 |19 |Zürich, Rennweg |47.3730662375955|8.53845982728609|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |373 |Tram |14 |\n", + "|26-13-j19-1 |8591299 |2064.TA.26-13-j19-1.24.H|8591299|07:22:00 |07:22:00 |20 |Zürich, Paradeplatz |47.3693672863583|8.53876525448273|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |1223 |Tram |15 |\n", + "|26-13-j19-1 |8591384 |2064.TA.26-13-j19-1.24.H|8591384|07:23:00 |07:23:00 |21 |Zürich, Stockerstrasse |47.3677002399791|8.53501029659459|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |871 |Tram |16 |\n", + "|26-13-j19-1 |8591404 |2064.TA.26-13-j19-1.24.H|8591404|07:24:00 |07:24:00 |22 |Zürich, Tunnelstrasse |47.3661426599847|8.53253094641008|Zürich, Albisgütli |1831 |0 
|07:00:00 |0 |26 |890 |Tram |17 |\n", + "|26-13-j19-1 |8591059 |2064.TA.26-13-j19-1.24.H|8591059|07:25:00 |07:25:00 |23 |Zürich Enge, Bahnhof/Bederstr.|47.3645546111557|8.53045583810347|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |177 |Tram |18 |\n", + "|26-13-j19-1 |8591415 |2064.TA.26-13-j19-1.24.H|8591415|07:27:00 |07:27:00 |24 |Zürich, Waffenplatzstrasse |47.3614818138862|8.52574866601403|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |1265 |Tram |19 |\n", + "|26-13-j19-1 |8591366 |2064.TA.26-13-j19-1.24.H|8591366|07:28:00 |07:28:00 |25 |Zürich, Sihlcity Nord |47.3600640074787|8.52303575385561|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |968 |Tram |20 |\n", + "|26-13-j19-1 |8591329 |2064.TA.26-13-j19-1.24.H|8591329|07:29:00 |07:29:00 |26 |Zürich, Saalsporthalle |47.3578611597087|8.52040369007277|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |1238 |Tram |21 |\n", + "|26-13-j19-1 |8591245 |2064.TA.26-13-j19-1.24.H|8591245|07:30:00 |07:30:00 |27 |Zürich, Laubegg |47.3587313564196|8.51708890667391|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |1216 |Tram |22 |\n", + "|26-13-j19-1 |8591405 |2064.TA.26-13-j19-1.24.H|8591405|07:32:00 |07:32:00 |28 |Zürich, Uetlihof |47.3567353594536|8.51396276948474|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |847 |Tram |23 |\n", + "|26-13-j19-1 |8591385 |2064.TA.26-13-j19-1.24.H|8591385|07:33:00 |07:33:00 |29 |Zürich, Strassenverkehrsamt |47.3530717783138|8.51171698127413|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |275 |Tram |24 |\n", + "|26-13-j19-1 |8591034 |2064.TA.26-13-j19-1.24.H|8591034|07:34:00 |07:34:00 |30 |Zürich, Albisgütli |47.3519945640447|8.5077104951064 |Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |1352 |Tram |25 |\n", + "|26-11-A-j19-1|8591049 |791.TA.26-11-A-j19-1.3.H|8591049|19:49:00 |19:49:00 |1 |Zürich, Auzelg |47.4166918393693|8.568113214819 |Zürich, Rehalp |363 |0 |19:49:00 |1 |8 |1117 |Tram |26 |\n", + "|26-11-A-j19-1|8591128 |791.TA.26-11-A-j19-1.3.H|8591128|19:51:00 |19:51:00 |2 |Zürich, Fernsehstudio 
|47.4181749855684|8.56174415945371|Zürich, Rehalp |363 |0 |19:49:00 |1 |8 |156 |Tram |27 |\n", + "|26-11-A-j19-1|8591830 |791.TA.26-11-A-j19-1.3.H|8591830|19:52:00 |19:52:00 |3 |Glattpark, Glattpark |47.4199559214972|8.55716275150406|Zürich, Rehalp |363 |0 |19:49:00 |1 |8 |671 |Tram |28 |\n", + "|26-11-A-j19-1|8591294 |791.TA.26-11-A-j19-1.3.H|8591294|19:53:00 |19:53:00 |4 |Zürich, Oerlikerhus |47.4175853791724|8.5542072942189 |Zürich, Rehalp |363 |0 |19:49:00 |1 |8 |573 |Tram |29 |\n", + "|26-11-A-j19-1|8591256 |791.TA.26-11-A-j19-1.3.H|8591256|19:54:00 |19:54:00 |5 |Zürich, Leutschenbach |47.4146433269471|8.55130573585079|Zürich, Rehalp |363 |0 |19:49:00 |1 |8 |444 |Tram |30 |\n", + "|26-11-A-j19-1|8591273 |791.TA.26-11-A-j19-1.3.H|8591273|19:55:00 |19:55:00 |6 |Zürich, Messe/Hallenstadion |47.4106919651348|8.55068589830466|Zürich, Rehalp |363 |0 |19:49:00 |1 |8 |389 |Tram |31 |\n", + "|26-11-A-j19-1|8591382 |791.TA.26-11-A-j19-1.3.H|8591382|19:57:00 |19:57:00 |7 |Zürich, Sternen Oerlikon |47.4100718783688|8.54623025449481|Zürich, Rehalp |363 |0 |19:49:00 |1 |8 |687 |Tram |32 |\n", + "|26-11-A-j19-1|8580449 |791.TA.26-11-A-j19-1.3.H|8580449|19:59:00 |19:59:00 |8 |Zürich Oerlikon, Bahnhof |47.411494419524 |8.54479295004002|Zürich, Rehalp |363 |0 |19:49:00 |1 |8 |766 |Tram |33 |\n", + "|26-304-j19-1 |8591057 |159.TA.26-304-j19-1.4.R |8591057|19:39:00 |19:39:00 |1 |Zürich Altstetten, Bahnhof N |47.392067942097 |8.48990588617267|Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |521 |Bus |34 |\n", + "|26-304-j19-1 |8591402 |159.TA.26-304-j19-1.4.R |8591402|19:41:00 |19:41:00 |2 |Zürich, Tüffenwies |47.3979787271809|8.49434356367684|Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |1000 |Bus |35 |\n", + "|26-304-j19-1 |8591434 |159.TA.26-304-j19-1.4.R |8591434|19:41:00 |19:41:00 |3 |Zürich, Winzerhalde |47.4000582901792|8.4945681424979 |Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |711 |Bus |36 |\n", + "|26-304-j19-1 |8591197 |159.TA.26-304-j19-1.4.R |8591197|19:42:00 |19:42:00 
|4 |Zürich, Hohenklingensteig |47.4013473348052|8.49021131336931|Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |1125 |Bus |37 |\n", + "|26-304-j19-1 |8591436 |159.TA.26-304-j19-1.4.R |8591436|19:43:00 |19:43:00 |5 |Zürich, Winzerstrasse Süd |47.403372044054 |8.486123978826 |Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |149 |Bus |38 |\n", + "|26-304-j19-1 |8591136 |159.TA.26-304-j19-1.4.R |8591136|19:46:00 |19:46:00 |6 |Zürich, Frankental |47.4057006674825|8.48137189097235|Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |690 |Bus |39 |\n", + "|26-304-j19-1 |8590725 |159.TA.26-304-j19-1.4.R |8590725|19:47:00 |19:47:00 |7 |Oberengstringen, Eggbühl |47.4055243523393|8.47408655401713|Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |1329 |Bus |40 |\n", + "|26-304-j19-1 |8590726 |159.TA.26-304-j19-1.4.R |8590726|19:48:00 |19:48:00 |8 |Oberengstringen, Lanzrain |47.407342193939 |8.46795106062573|Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |763 |Bus |41 |\n", + "|26-304-j19-1 |8590728 |159.TA.26-304-j19-1.4.R |8590728|19:49:00 |19:49:00 |9 |Oberengstringen, Zentrum |47.4091295756792|8.46260608468448|Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |376 |Bus |42 |\n", + "|26-304-j19-1 |8590727 |159.TA.26-304-j19-1.4.R |8590727|19:50:00 |19:50:00 |10 |Oberengstringen, Paradies |47.4104852703573|8.45874332896223|Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |740 |Bus |43 |\n", + "|26-304-j19-1 |8590833 |159.TA.26-304-j19-1.4.R |8590833|19:51:00 |19:51:00 |11 |Unterengstringen, Langacher |47.4122360710415|8.45316479104707|Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |764 |Bus |44 |\n", + "|26-304-j19-1 |8594732 |159.TA.26-304-j19-1.4.R |8594732|19:53:00 |19:53:00 |12 |Unterengstringen, Sennenbüel N|47.4134944230824|8.44931101847766|Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |987 |Bus |45 |\n", + "|26-304-j19-1 |8590831 |159.TA.26-304-j19-1.4.R |8590831|19:53:00 |19:53:00 |13 |Unterengstringen, Aegelsee |47.414977659342 |8.44603216769017|Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |502 |Bus |46 
|\n", + "|26-304-j19-1 |8590911 |159.TA.26-304-j19-1.4.R |8590911|19:55:00 |19:55:00 |14 |Weiningen ZH, Ausserdorf |47.4176826342903|8.43953734818508|Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |754 |Bus |47 |\n", + "|26-304-j19-1 |8590913 |159.TA.26-304-j19-1.4.R |8590913|19:56:00 |19:56:00 |15 |Weiningen ZH, Lindenplatz |47.4195547602987|8.43394084396424|Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |934 |Bus |48 |\n", + "|26-304-j19-1 |8590914 |159.TA.26-304-j19-1.4.R |8590914|19:57:00 |19:57:00 |16 |Weiningen ZH, Schulhaus |47.4183512583635|8.42866773324572|Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |622 |Bus |49 |\n", + "|26-304-j19-1 |8590617 |159.TA.26-304-j19-1.4.R |8590617|19:59:00 |19:59:00 |17 |Geroldswil, Welbrig |47.4180716529658|8.41906474285715|Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |859 |Bus |50 |\n", + "|26-61-j19-1 |8591281 |269.TA.26-61-j19-1.1.H |8591281|19:57:00 |19:57:00 |1 |Zürich, Mühlacker |47.4256326325821|8.49799970688372|Zürich, Schwamendingerplatz|2076 |0 |19:57:00 |3 |2 |212 |Bus |51 |\n", + "|26-61-j19-1 |8591046 |269.TA.26-61-j19-1.1.H |8591046|19:58:00 |19:58:00 |2 |Zürich, Aspholz |47.425085652811 |8.50058685490234|Zürich, Schwamendingerplatz|2076 |0 |19:57:00 |3 |2 |1003 |Bus |52 |\n", + "|26-703-j19-1 |8591825 |179.TA.26-703-j19-1.2.R |8591825|07:10:00 |07:10:00 |1 |Benglen, Bodenacher |47.3611288870976|8.63861299832652|Zürich, Klusplatz |9385 |1 |07:10:00 |4 |9 |580 |Bus |53 |\n", + "|26-703-j19-1 |8590504 |179.TA.26-703-j19-1.2.R |8590504|07:11:00 |07:11:00 |2 |Benglen, Gerlisbrunnen |47.3610862923255|8.63360938219328|Zürich, Klusplatz |9385 |1 |07:10:00 |4 |9 |861 |Bus |54 |\n", + "|26-703-j19-1 |8596005 |179.TA.26-703-j19-1.2.R |8596005|07:14:00 |07:14:00 |3 |Binz bei Maur, Twäracher |47.3608915729295|8.623476385787 |Zürich, Klusplatz |9385 |1 |07:10:00 |4 |9 |1366 |Bus |55 |\n", + "|26-703-j19-1 |8591832 |179.TA.26-703-j19-1.2.R |8591832|07:14:00 |07:14:00 |4 |Pfaffhausen, Müseren 
|47.3626987847054|8.61754750491098|Zürich, Klusplatz |9385 |1 |07:10:00 |4 |9 |1023 |Bus |56 |\n", + "|26-703-j19-1 |8591147 |179.TA.26-703-j19-1.2.R |8591147|07:16:00 |07:16:00 |5 |Zürich, Friedhof Witikon |47.3613418604422|8.60282411740221|Zürich, Klusplatz |9385 |1 |07:10:00 |4 |9 |1260 |Bus |57 |\n", + "|26-703-j19-1 |8591162 |179.TA.26-703-j19-1.2.R |8591162|07:17:00 |07:17:00 |6 |Zürich, Glockenacker |47.3609767627537|8.59930272148798|Zürich, Klusplatz |9385 |1 |07:10:00 |4 |9 |146 |Bus |58 |\n", + "|26-703-j19-1 |8591261 |179.TA.26-703-j19-1.2.R |8591261|07:18:00 |07:18:00 |7 |Zürich, Loorenstrasse |47.3598631991991|8.59452368417579|Zürich, Klusplatz |9385 |1 |07:10:00 |4 |9 |1197 |Bus |59 |\n", + "|26-703-j19-1 |8591107 |179.TA.26-703-j19-1.2.R |8591107|07:19:00 |07:19:00 |8 |Zürich, Carl-Spitteler-Strasse|47.3583236436636|8.58659156021591|Zürich, Klusplatz |9385 |1 |07:10:00 |4 |9 |1311 |Bus |60 |\n", + "|26-703-j19-1 |8591233 |179.TA.26-703-j19-1.2.R |8591233|07:25:00 |07:25:00 |9 |Zürich, Klusplatz |47.3640374201824|8.56649624730736|Zürich, Klusplatz |9385 |1 |07:10:00 |4 |9 |1133 |Bus |61 |\n", + "|26-703-j19-1 |8591825 |171.TA.26-703-j19-1.2.R |8591825|07:12:00 |07:12:00 |1 |Benglen, Bodenacher |47.3611288870976|8.63861299832652|Zürich, Klusplatz |9346 |1 |07:12:00 |4 |9 |580 |Bus |62 |\n", + "|26-703-j19-1 |8590504 |171.TA.26-703-j19-1.2.R |8590504|07:13:00 |07:13:00 |2 |Benglen, Gerlisbrunnen |47.3610862923255|8.63360938219328|Zürich, Klusplatz |9346 |1 |07:12:00 |4 |9 |861 |Bus |63 |\n", + "|26-703-j19-1 |8596005 |171.TA.26-703-j19-1.2.R |8596005|07:16:00 |07:16:00 |3 |Binz bei Maur, Twäracher |47.3608915729295|8.623476385787 |Zürich, Klusplatz |9346 |1 |07:12:00 |4 |9 |1366 |Bus |64 |\n", + "|26-703-j19-1 |8591832 |171.TA.26-703-j19-1.2.R |8591832|07:16:00 |07:16:00 |4 |Pfaffhausen, Müseren |47.3626987847054|8.61754750491098|Zürich, Klusplatz |9346 |1 |07:12:00 |4 |9 |1023 |Bus |65 |\n", + "|26-703-j19-1 |8591147 |171.TA.26-703-j19-1.2.R 
|8591147|07:18:00 |07:18:00 |5 |Zürich, Friedhof Witikon |47.3613418604422|8.60282411740221|Zürich, Klusplatz |9346 |1 |07:12:00 |4 |9 |1260 |Bus |66 |\n", + "|26-703-j19-1 |8591162 |171.TA.26-703-j19-1.2.R |8591162|07:19:00 |07:19:00 |6 |Zürich, Glockenacker |47.3609767627537|8.59930272148798|Zürich, Klusplatz |9346 |1 |07:12:00 |4 |9 |146 |Bus |67 |\n", + "|26-703-j19-1 |8591261 |171.TA.26-703-j19-1.2.R |8591261|07:20:00 |07:20:00 |7 |Zürich, Loorenstrasse |47.3598631991991|8.59452368417579|Zürich, Klusplatz |9346 |1 |07:12:00 |4 |9 |1197 |Bus |68 |\n", + "|26-703-j19-1 |8591107 |171.TA.26-703-j19-1.2.R |8591107|07:21:00 |07:21:00 |8 |Zürich, Carl-Spitteler-Strasse|47.3583236436636|8.58659156021591|Zürich, Klusplatz |9346 |1 |07:12:00 |4 |9 |1311 |Bus |69 |\n", + "|26-703-j19-1 |8591233 |171.TA.26-703-j19-1.2.R |8591233|07:27:00 |07:27:00 |9 |Zürich, Klusplatz |47.3640374201824|8.56649624730736|Zürich, Klusplatz |9346 |1 |07:12:00 |4 |9 |1133 |Bus |70 |\n", + "|26-703-j19-1 |8591825 |156.TA.26-703-j19-1.2.R |8591825|07:25:00 |07:25:00 |1 |Benglen, Bodenacher |47.3611288870976|8.63861299832652|Zürich, Klusplatz |9288 |1 |07:25:00 |4 |9 |580 |Bus |71 |\n", + "|26-703-j19-1 |8590504 |156.TA.26-703-j19-1.2.R |8590504|07:26:00 |07:26:00 |2 |Benglen, Gerlisbrunnen |47.3610862923255|8.63360938219328|Zürich, Klusplatz |9288 |1 |07:25:00 |4 |9 |861 |Bus |72 |\n", + "|26-703-j19-1 |8596005 |156.TA.26-703-j19-1.2.R |8596005|07:29:00 |07:29:00 |3 |Binz bei Maur, Twäracher |47.3608915729295|8.623476385787 |Zürich, Klusplatz |9288 |1 |07:25:00 |4 |9 |1366 |Bus |73 |\n", + "|26-703-j19-1 |8591832 |156.TA.26-703-j19-1.2.R |8591832|07:29:00 |07:29:00 |4 |Pfaffhausen, Müseren |47.3626987847054|8.61754750491098|Zürich, Klusplatz |9288 |1 |07:25:00 |4 |9 |1023 |Bus |74 |\n", + "|26-703-j19-1 |8591147 |156.TA.26-703-j19-1.2.R |8591147|07:31:00 |07:31:00 |5 |Zürich, Friedhof Witikon |47.3613418604422|8.60282411740221|Zürich, Klusplatz |9288 |1 |07:25:00 |4 |9 |1260 |Bus |75 |\n", + 
"|26-703-j19-1 |8591162 |156.TA.26-703-j19-1.2.R |8591162|07:32:00 |07:32:00 |6 |Zürich, Glockenacker |47.3609767627537|8.59930272148798|Zürich, Klusplatz |9288 |1 |07:25:00 |4 |9 |146 |Bus |76 |\n", + "|26-703-j19-1 |8591261 |156.TA.26-703-j19-1.2.R |8591261|07:33:00 |07:33:00 |7 |Zürich, Loorenstrasse |47.3598631991991|8.59452368417579|Zürich, Klusplatz |9288 |1 |07:25:00 |4 |9 |1197 |Bus |77 |\n", + "|26-703-j19-1 |8591107 |156.TA.26-703-j19-1.2.R |8591107|07:34:00 |07:34:00 |8 |Zürich, Carl-Spitteler-Strasse|47.3583236436636|8.58659156021591|Zürich, Klusplatz |9288 |1 |07:25:00 |4 |9 |1311 |Bus |78 |\n", + "|26-703-j19-1 |8591233 |156.TA.26-703-j19-1.2.R |8591233|07:40:00 |07:40:00 |9 |Zürich, Klusplatz |47.3640374201824|8.56649624730736|Zürich, Klusplatz |9288 |1 |07:25:00 |4 |9 |1133 |Bus |79 |\n", + "|26-703-j19-1 |8591825 |144.TA.26-703-j19-1.2.R |8591825|07:27:00 |07:27:00 |1 |Benglen, Bodenacher |47.3611288870976|8.63861299832652|Zürich, Klusplatz |9231 |1 |07:27:00 |4 |9 |580 |Bus |80 |\n", + "|26-703-j19-1 |8590504 |144.TA.26-703-j19-1.2.R |8590504|07:28:00 |07:28:00 |2 |Benglen, Gerlisbrunnen |47.3610862923255|8.63360938219328|Zürich, Klusplatz |9231 |1 |07:27:00 |4 |9 |861 |Bus |81 |\n", + "|26-703-j19-1 |8596005 |144.TA.26-703-j19-1.2.R |8596005|07:31:00 |07:31:00 |3 |Binz bei Maur, Twäracher |47.3608915729295|8.623476385787 |Zürich, Klusplatz |9231 |1 |07:27:00 |4 |9 |1366 |Bus |82 |\n", + "|26-703-j19-1 |8591832 |144.TA.26-703-j19-1.2.R |8591832|07:31:00 |07:31:00 |4 |Pfaffhausen, Müseren |47.3626987847054|8.61754750491098|Zürich, Klusplatz |9231 |1 |07:27:00 |4 |9 |1023 |Bus |83 |\n", + "|26-703-j19-1 |8591147 |144.TA.26-703-j19-1.2.R |8591147|07:33:00 |07:33:00 |5 |Zürich, Friedhof Witikon |47.3613418604422|8.60282411740221|Zürich, Klusplatz |9231 |1 |07:27:00 |4 |9 |1260 |Bus |84 |\n", + "|26-703-j19-1 |8591162 |144.TA.26-703-j19-1.2.R |8591162|07:34:00 |07:34:00 |6 |Zürich, Glockenacker |47.3609767627537|8.59930272148798|Zürich, Klusplatz |9231 
|1 |07:27:00 |4 |9 |146 |Bus |85 |\n", + "|26-703-j19-1 |8591261 |144.TA.26-703-j19-1.2.R |8591261|07:35:00 |07:35:00 |7 |Zürich, Loorenstrasse |47.3598631991991|8.59452368417579|Zürich, Klusplatz |9231 |1 |07:27:00 |4 |9 |1197 |Bus |86 |\n", + "|26-703-j19-1 |8591107 |144.TA.26-703-j19-1.2.R |8591107|07:36:00 |07:36:00 |8 |Zürich, Carl-Spitteler-Strasse|47.3583236436636|8.58659156021591|Zürich, Klusplatz |9231 |1 |07:27:00 |4 |9 |1311 |Bus |87 |\n", + "|26-703-j19-1 |8591233 |144.TA.26-703-j19-1.2.R |8591233|07:42:00 |07:42:00 |9 |Zürich, Klusplatz |47.3640374201824|8.56649624730736|Zürich, Klusplatz |9231 |1 |07:27:00 |4 |9 |1133 |Bus |88 |\n", + "|26-703-j19-1 |8591825 |120.TA.26-703-j19-1.2.R |8591825|07:40:00 |07:40:00 |1 |Benglen, Bodenacher |47.3611288870976|8.63861299832652|Zürich, Klusplatz |9159 |1 |07:40:00 |4 |9 |580 |Bus |89 |\n", + "|26-703-j19-1 |8590504 |120.TA.26-703-j19-1.2.R |8590504|07:41:00 |07:41:00 |2 |Benglen, Gerlisbrunnen |47.3610862923255|8.63360938219328|Zürich, Klusplatz |9159 |1 |07:40:00 |4 |9 |861 |Bus |90 |\n", + "|26-703-j19-1 |8596005 |120.TA.26-703-j19-1.2.R |8596005|07:44:00 |07:44:00 |3 |Binz bei Maur, Twäracher |47.3608915729295|8.623476385787 |Zürich, Klusplatz |9159 |1 |07:40:00 |4 |9 |1366 |Bus |91 |\n", + "|26-703-j19-1 |8591832 |120.TA.26-703-j19-1.2.R |8591832|07:44:00 |07:44:00 |4 |Pfaffhausen, Müseren |47.3626987847054|8.61754750491098|Zürich, Klusplatz |9159 |1 |07:40:00 |4 |9 |1023 |Bus |92 |\n", + "|26-703-j19-1 |8591147 |120.TA.26-703-j19-1.2.R |8591147|07:46:00 |07:46:00 |5 |Zürich, Friedhof Witikon |47.3613418604422|8.60282411740221|Zürich, Klusplatz |9159 |1 |07:40:00 |4 |9 |1260 |Bus |93 |\n", + "|26-703-j19-1 |8591162 |120.TA.26-703-j19-1.2.R |8591162|07:47:00 |07:47:00 |6 |Zürich, Glockenacker |47.3609767627537|8.59930272148798|Zürich, Klusplatz |9159 |1 |07:40:00 |4 |9 |146 |Bus |94 |\n", + "|26-703-j19-1 |8591261 |120.TA.26-703-j19-1.2.R |8591261|07:48:00 |07:48:00 |7 |Zürich, Loorenstrasse 
|47.3598631991991|8.59452368417579|Zürich, Klusplatz |9159 |1 |07:40:00 |4 |9 |1197 |Bus |95 |\n", + "|26-703-j19-1 |8591107 |120.TA.26-703-j19-1.2.R |8591107|07:49:00 |07:49:00 |8 |Zürich, Carl-Spitteler-Strasse|47.3583236436636|8.58659156021591|Zürich, Klusplatz |9159 |1 |07:40:00 |4 |9 |1311 |Bus |96 |\n", + "|26-703-j19-1 |8591233 |120.TA.26-703-j19-1.2.R |8591233|07:55:00 |07:55:00 |9 |Zürich, Klusplatz |47.3640374201824|8.56649624730736|Zürich, Klusplatz |9159 |1 |07:40:00 |4 |9 |1133 |Bus |97 |\n", + "|26-703-j19-1 |8591825 |95.TA.26-703-j19-1.2.R |8591825|07:42:00 |07:42:00 |1 |Benglen, Bodenacher |47.3611288870976|8.63861299832652|Zürich, Klusplatz |9119 |1 |07:42:00 |4 |9 |580 |Bus |98 |\n", + "|26-703-j19-1 |8590504 |95.TA.26-703-j19-1.2.R |8590504|07:43:00 |07:43:00 |2 |Benglen, Gerlisbrunnen |47.3610862923255|8.63360938219328|Zürich, Klusplatz |9119 |1 |07:42:00 |4 |9 |861 |Bus |99 |\n", + "+-------------+---------------+------------------------+-------+------------+--------------+-------------+------------------------------+----------------+----------------+---------------------------+---------------+------------+--------------------+---------+----------+--------+----------+---------------------------+\n", "only showing top 100 rows" ] } ], "source": [ "stop_times = stop_times.sort(stop_times.route_int.cast('int'), \n", " stop_times.departure_first_stop, \n", " stop_times.trip_id, \n", - " stop_times.stop_sequence.cast('int'))\n", + " stop_times.stop_sequence.cast('int'))\\\n", + ".withColumn('monotonically_increasing_id', F.monotonically_increasing_id())\n", + "\n", "stop_times.show(100, 0)" ] }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 73, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, 
"metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times.write.csv('data/lgpt_guys/stop_times_final_cyril.csv', header=True, mode = 'overwrite')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Footpaths\n", "\n", "We make the simplifying assumption that within stops sharing the same 7 first characters (`stop_id_general`) (almost the same as grouping them by parent stations, but more robust), transfer times take 2 minutes, no matter the stop. This may break for very large stations (such as Zürich HB).\n", "\n", "Between different `stop_id_general`, walking time is computed as the distance (which must be max. 500m) divided by a constant walking speed of 50 meters per minute.\n", "\n", "We only consider stops present in the final and filtered `stop_times` table.\n", "\n", "- Get all unique stop_int with a single pair of coordinates (first of the groupby, doesnt need to be dead precise)\n", "\n", "- join two copies of this dataframe\n", "- drop lines where general_stop_id is the same\n", "- compute the distance with geopy for all other lines between the pairs of coordinates\n", "- filter based on distance <=500m\n", "- transform distance to walking time in seconds\n", "- order by stop_int1, then stop_int2\n", "- save" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 74, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "stop_times = spark.read.csv('data/lgpt_guys/stop_times_final_cyril.csv', header=True)" + ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Building lookup tables to pretty-print results after RAPTOR:" + "Getting one pair of 
coordinates per parent stop:" ] }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 75, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------------+----------------+----------------+--------------------+\n", + "|stop_id_general| stop_lat| stop_lon| stop_name|\n", + "+---------------+----------------+----------------+--------------------+\n", + "| 8576163|47.4446882894765|8.63618754705906| Bassersdorf, Rietli|\n", + "| 8588279|47.4314059613092|8.66796994181563|Tagelswangen, Ger...|\n", + "| 8588740|47.4448766240902|8.57874926778446|Kloten, Neubrunne...|\n", + "| 8595714|47.4485885009565|8.45665025434994| Regensdorf, Allmend|\n", + "| 8588054|47.3600457522121|8.71459250506792| Uster, Meieracher|\n", + "| 8590851|47.3894589587793|8.67489595265716|Volketswil, Chappeli|\n", + "| 8591345|47.3845933768342|8.47776964668253|Zürich, Schulhaus...|\n", + "| 8591264|47.4060107375098|8.58139031472014| Zürich, Luegisland|\n", + "| 8503202|47.2960953402584|8.56475351565593| Thalwil|\n", + "| 8573163|47.2857066724744|8.55511459265598|Gattikon, Gattike...|\n", + "| 8590795|47.3955220616337|8.46779834702741| Schlieren, Mülligen|\n", + "| 8590652|47.2544735496734|8.60926503799025|Horgen, Glärnischhof|\n", + "| 8594307|47.4384910970626|8.36438429150518|Würenlos, Lärchenweg|\n", + "| 8591110|47.3884919601296|8.52639545301869| Zürich, Dammweg|\n", + "| 8591052|47.3783343616704|8.52287405710445|Zürich, Bäckeranlage|\n", + "| 8582529|47.3522623481765|8.35006514587432|Bremgarten AG, Zu...|\n", + "| 8591319|47.4106433311425| 8.5592648092692| Zürich, Riedgraben|\n", + "| 8502221|47.3574960379336|8.43754308825406| Birmensdorf 
ZH|\n", + "| 8591438|47.3591816623693|8.58524408728955|Zürich, Witikon Z...|\n", + "| 8590473|47.3151369463988|8.53576488143336|Adliswil, Hofacke...|\n", + "+---------------+----------------+----------------+--------------------+\n", + "only showing top 20 rows" + ] } ], "source": [ - "stop_times = spark.read.csv('data/lgpt_guys/stop_times_final_cyril.csv', header=True)" + "stop_times.select(stop_times.stop_id_general, stop_times.stop_lat, stop_times.stop_lon, stop_times.stop_name).dropDuplicates()\\\n", + ".show()" ] }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 76, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ - "# lookup table for routes" + "from pyspark.sql.window import Window\n", + "w= (\n", + " Window.partitionBy(\"stop_id_general\")\n", + " .orderBy('stop_name')\n", + ")" ] }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 77, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "+---------+--------------+-------------+--------------------+\n", - "|route_int| route_id| route_desc| trip_headsign|\n", - "+---------+--------------+-------------+--------------------+\n", - "| 0| 26-10-j19-1| Tram|Zürich Flughafen,...|\n", - "| 1| 26-13-j19-1| Tram| Zürich, Albisgütli|\n", - "| 2| 26-11-A-j19-1| Tram| Zürich, Rehalp|\n", - "| 3| 26-304-j19-1| Bus| Dietikon, Bahnhof|\n", - "| 4| 26-70-A-j19-1| 
Bus|Zürich, Mittellei...|\n", - "| 5| 26-61-j19-1| Bus|Zürich, Schwamend...|\n", - "| 6| 26-703-j19-1| Bus| Zürich, Klusplatz|\n", - "| 7| 26-652-j19-1| Bus| Effretikon, Bahnhof|\n", - "| 8| 26-83-j19-1| Bus|Zürich Altstetten...|\n", - "| 9| 26-749-j19-1| Bus| Dietlikon, Bahnhof|\n", - "| 10| 26-24-j19-1| S-Bahn| Weinfelden|\n", - "| 11| 26-640-j19-1| Bus| Neschwil, Post|\n", - "| 12| 26-725-j19-1| Bus| Volketswil, Dorf|\n", - "| 13| 79-736-j19-1| Bus|Zürich Flughafen,...|\n", - "| 14| 26-24-j19-1| S-Bahn| Weinfelden|\n", - "| 15|79-373-2-j19-1| Schiff| Küsnacht ZSG|\n", - "| 16| 26-658-j19-1| Bus| Effretikon, Bahnhof|\n", - "| 17| 26-449-j19-1| Bus|Buchs-Dällikon, B...|\n", - "| 18| 26-303-j19-1| Bus| Killwangen, Bahnhof|\n", - "| 19| 26-17-j19-1| Tram|Zürich, Bahnhofpl...|\n", - "| 20| 26-7-A-j19-1| S-Bahn| Winterthur|\n", - "| 21| 26-24-j19-1| S-Bahn| Zug|\n", - "| 22| 26-720-j19-1| Bus| Effretikon, Bahnhof|\n", - "| 23| 26-145-j19-1| Bus| Thalwil, Zentrum|\n", - "| 24| 26-185-j19-1| Bus|Zürich Wollishofe...|\n", - "| 25| 26-14-j19-1| S-Bahn| Hinwil|\n", - "| 26| 26-8-A-j19-1| S-Bahn| Pfäffikon SZ|\n", - "| 27| 1-322-j19-1| Bus|Baden, Postautost...|\n", - "| 28|26-962-A-j19-1| Bus|Erlenbach ZH, Bah...|\n", - "| 29| 26-301-j19-1| Bus| Dietikon, Bahnhof|\n", - "| 30| 26-533-j19-1| Taxi|Niederhasli, Nass...|\n", - "| 31| 26-78-j19-1| Bus| Zürich, Bändliweg|\n", - "| 32| 26-4-j19-1| Tram|Zürich Altstetten...|\n", - "| 33| 26-89-j19-1| Bus| Zürich, Heizenholz|\n", - "| 34| 26-15-A-j19-1| Tram|Zürich, Bucheggplatz|\n", - "| 35| 26-25-A-j19-1|Standseilbahn| Zürich, Dolder|\n", - "| 36| 26-919-j19-1| Bus|Zumikon, Dorfzentrum|\n", - "| 37| 26-302-j19-1| Bus| Urdorf, Oberurdorf|\n", - "| 38| 1-444-j19-1| Bus|Bremgarten AG, Ob...|\n", - "| 39| 26-8-C-j19-1| Tram| Zürich, Klusplatz|\n", - "| 40| 79-18-A-j19-1| Bus| Forch, Bahnhof|\n", - "| 41| 26-18-j19-1| S-Bahn|Zürich Stadelhofe...|\n", - "| 42| 26-2-j19-1| S-Bahn| Ziegelbrücke|\n", - "| 43| 26-8-A-j19-1| S-Bahn| 
Pfäffikon SZ|\n", - "| 44| 26-726-j19-1| Bus|Schwerzenbach ZH,...|\n", - "| 44| 26-721-j19-1| Bus|Schwerzenbach ZH,...|\n", - "| 45| 26-456-j19-1| Bus|Regensdorf-Watt, ...|\n", - "| 46| 26-311-j19-1| Bus| Dietikon, Bahnhof|\n", - "| 47| 26-772-j19-1| Bus|Wallisellen, Schu...|\n", - "| 48| 26-17-j19-1| Tram|Zürich Wiedikon, ...|\n", - "| 49| 26-21-j19-1| S-Bahn| Regensdorf-Watt|\n", - "| 50| 26-7-A-j19-1| S-Bahn| Winterthur|\n", - "| 51| 26-845-j19-1| Bus|Gossau ZH, Mittel...|\n", - "| 51| 26-845-j19-1| Bus| Grüningen, Adler|\n", - "| 52| 26-768-j19-1| Bus|Zürich Flughafen,...|\n", - "| 53| 26-13-j19-1| Tram| Zürich, Frankental|\n", - "| 54| 26-2-A-j19-1| Tram|Zürich Tiefenbrun...|\n", - "| 55| 26-36-j19-1| InterRegio| Basel SBB|\n", - "| 55| 26-19-j19-1| S-Bahn| Koblenz|\n", - "| 55| 26-19-j19-1| S-Bahn| Dietikon|\n", - "| 56| 1-445-j19-1| Bus|Zürich Enge, Bahn...|\n", - "| 57| 26-451-j19-1| Bus|Adlikon b. R., Le...|\n", - "| 58| 26-2-A-j19-1| Tram|Zürich Tiefenbrun...|\n", - "| 59| 26-304-j19-1| Bus| Dietikon, Bahnhof|\n", - "| 60| 26-14-A-j19-1| Tram| Zürich, Seebach|\n", - "| 61| 26-703-j19-1| Bus| Zürich, Klusplatz|\n", - "| 62| 26-10-B-j19-1| S-Bahn| Uetliberg|\n", - "| 63| 26-787-j19-1| Bus|Brüttisellen, Ob....|\n", - "| 64| 26-743-j19-1| Bus| Stettbach, Bahnhof|\n", - "| 65| 26-771-j19-1| Bus|Wallisellen, Bahnhof|\n", - "| 66| 26-38-j19-1| Bus| Zürich, Waidspital|\n", - "| 67| 26-14-A-j19-1| Tram| Zürich, Triemli|\n", - "| 68|79-373-2-j19-1| Schiff| Thalwil ZSG|\n", - "| 69| 26-845-j19-1| Bus| Uster, Bahnhof|\n", - "| 70| 26-452-j19-1| Bus|Regensdorf, Moosä...|\n", - "| 71| 26-813-j19-1| Bus| Uster, Bahnhof|\n", - "| 72| 26-31-j19-1| Bus| Zürich, Farbhof|\n", - "| 73| 26-6-A-j19-1| S-Bahn| Baden|\n", - "| 74| 26-14-A-j19-1| Tram| Zürich, Seebach|\n", - "| 75| 26-811-j19-1| Bus| Uster, Bahnhof|\n", - "| 76| 26-652-j19-1| Bus| Effretikon, Bahnhof|\n", - "| 77| 26-726-j19-1| Bus| Volketswil, Dorf|\n", - "| 78| 26-165-j19-1| Bus|Rüschlikon, Park ...|\n", - "| 
79| 26-31-j19-1| Bus|Zürich, Kienasten...|\n", - "| 80| 26-75-A-j19-1| Bus|Zürich, Schwamend...|\n", - "| 81| 26-10-B-j19-1| S-Bahn| Zürich Selnau|\n", - "| 82| 79-10-B-j19-1| S-Bahn| Zürich Triemli|\n", - "| 83| 26-787-j19-1| Bus|Zürich Oerlikon, ...|\n", - "| 84| 26-35-B-j19-1| Bus| Zürich, Solidapark|\n", - "| 85| 26-89-j19-1| Bus|Zürich Altstetten...|\n", - "| 86| 26-6-B-j19-1| Tram| Zürich, Zoo|\n", - "| 87| 26-40-j19-1| Bus| Zürich, Seebach|\n", - "| 88| 26-19-j19-1| S-Bahn| Zürich HB|\n", - "| 88|80-160-Y-j19-1| RegioExpress| Zürich HB|\n", - "| 89| 26-24-j19-1| S-Bahn| Schaffhausen|\n", - "| 90| 26-24-j19-1| S-Bahn| Schaffhausen|\n", - "| 91| 26-31-j19-1| Bus|Zürich Altstetten...|\n", - "| 92| 26-3-A-j19-1| Tram| Zürich, Klusplatz|\n", - "| 93| 26-40-j19-1| Bus|Zürich, Glaubtens...|\n", - "| 94| 26-24-j19-1| S-Bahn| Zug|\n", - "+---------+--------------+-------------+--------------------+\n", - "only showing top 100 rows" + "+---------------+--------+----------------+----------------+--------------------+\n", + "|stop_id_general|stop_int| stop_lat_first| stop_lon_first| stop_name_first|\n", + "+---------------+--------+----------------+----------------+--------------------+\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...|\n", + "| 8503078| 1|47.3454760357765| 8.5930234976511| Waldburg|\n", + "| 8503088| 2|47.3774949037101|8.53916949636064| Zürich HB SZU|\n", + "| 8503376| 8|47.4353132339136| 8.7169371079598|Ottikon b. 
Kemptthal|\n", + "| 8587967| 7|47.2955835709855|8.60393802835468|Erlenbach ZH, Im ...|\n", + "| 8589111| 5| 47.260856991692|8.59230484542371|Horgen, Gumelenst...|\n", + "| 8590819| 6|47.2821844204798|8.57300004996529| Thalwil, Mettli|\n", + "| 8591190| 4|47.3694098744442|8.50635403902719| Zürich, Heuried|\n", + "| 8591284| 3|47.3909246273101|8.47396977303017| Zürich, Neeserweg|\n", + "| 8588312| 9|47.4329251530902|8.68904441838401|Effretikon, Kapel...|\n", + "| 8590541| 10|47.4165824252901|8.61689173475348|Dietlikon, Dornen...|\n", + "| 8590804| 14|47.3972672788911|8.44748743845064| Schlieren, Zentrum|\n", + "| 8591149| 12|47.3642868894025|8.50799795599736| Zürich, Friesenberg|\n", + "| 8591315| 13|47.3510268782417|8.58298033277325| Zürich, Rehalp|\n", + "| 8591362| 11|47.3859132103277|8.54836824487131|Zürich, Seilbahn ...|\n", + "| 8580432| 21|47.4439106426415|8.57842587428213| Kloten, Bramen|\n", + "| 8590273| 16|47.4263923172796|8.36551616876333|Spreitenbach, Fur...|\n", + "| 8590477| 15|47.3254528526241|8.53104872619107| Adliswil, Moos|\n", + "| 8591053| 22|47.4058952214384|8.53778609082291|Zürich, Bad Allen...|\n", + "| 8591080| 18| 47.347588102752|8.53434554328425|Zürich Wollishofe...|\n", + "+---------------+--------+----------------+----------------+--------------------+\n", + "only showing top 20 rows" ] } ], "source": [ - "stop_times.select(stop_times.route_int, stop_times.route_id, stop_times.route_desc, stop_times.trip_headsign)\\\n", + "from pyspark.sql import functions as F\n", + "\n", + "stop_coordinates = stop_times.select(stop_times.stop_id_general, stop_times.stop_int, stop_times.stop_lat, stop_times.stop_lon, stop_times.stop_name,\n", + " F.first(\"stop_lat\").over(w).alias(\"stop_lat_first\"),\n", + " F.first(\"stop_lon\").over(w).alias(\"stop_lon_first\"),\n", + " F.first(\"stop_name\").over(w).alias(\"stop_name_first\"))\\\n", + ".select(F.col('stop_id_general'), F.col('stop_int'), F.col('stop_lat_first'), F.col(\"stop_lon_first\"), 
F.col(\"stop_name_first\"))\\\n", ".dropDuplicates()\\\n", - ".sort(F.col('route_int').cast('int'))\\\n", - ".show(100)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Verifying a few routes and trips on real data" + "\n", + "stop_coordinates.show()" ] }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 78, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1407" + ] } ], "source": [ - "stop_times = spark.read.csv('data/lgpt_guys/stop_times_final_cyril.csv', header=True)" + "stop_coordinates.count()" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "+------------+---------------+-----------------------+-------+------------+--------------+-------------+------------------------------+----------------+----------------+--------------------------+---------------+------------+--------------------+---------+----------+--------+----------+\n", - "|route_id |stop_id_general|trip_id |stop_id|arrival_time|departure_time|stop_sequence|stop_name |stop_lat |stop_lon |trip_headsign |trip_short_name|direction_id|departure_first_stop|route_int|stop_count|stop_int|route_desc|\n", - 
"+------------+---------------+-----------------------+-------+------------+--------------+-------------+------------------------------+----------------+----------------+--------------------------+---------------+------------+--------------------+---------+----------+--------+----------+\n", - "|26-660-j19-1|8576167 |486.TA.26-660-j19-1.9.R|8576167|07:16:00 |07:16:00 |1 |Nürensdorf, Chrüzstrass |47.4542439013472|8.63462447846447|Winterthur, Archstrasse/HB|2268 |1 |07:16:00 |500 |8 |1198 |Bus |\n", - "|26-660-j19-1|8576168 |486.TA.26-660-j19-1.9.R|8576168|07:17:00 |07:17:00 |2 |Birchwil (Nürensdorf) |47.4577182388675|8.63876571192484|Winterthur, Archstrasse/HB|2268 |1 |07:16:00 |500 |8 |641 |Bus |\n", - "|26-660-j19-1|8576169 |486.TA.26-660-j19-1.9.R|8576169|07:19:00 |07:19:00 |3 |Nürensdorf, Oberwil |47.4641014319557|8.63995148810004|Winterthur, Archstrasse/HB|2268 |1 |07:16:00 |500 |8 |861 |Bus |\n", - "|26-660-j19-1|8576172 |486.TA.26-660-j19-1.9.R|8576172|07:22:00 |07:22:00 |4 |Breite b. 
N'dorf,Grünenwaldstr|47.461927224864 |8.66128647610085|Winterthur, Archstrasse/HB|2268 |1 |07:16:00 |500 |8 |43 |Bus |\n", - "|26-660-j19-1|8576174 |486.TA.26-660-j19-1.9.R|8576174|07:24:00 |07:24:00 |5 |Brütten, Hofacher |47.470969635094 |8.67059302244562|Winterthur, Archstrasse/HB|2268 |1 |07:16:00 |500 |8 |92 |Bus |\n", - "|26-660-j19-1|8506960 |486.TA.26-660-j19-1.9.R|8506960|07:26:00 |07:26:00 |6 |Brütten, Zentrum |47.472038346399 |8.67648597071027|Winterthur, Archstrasse/HB|2268 |1 |07:16:00 |500 |8 |1059 |Bus |\n", - "|26-660-j19-1|8576176 |486.TA.26-660-j19-1.9.R|8576176|07:27:00 |07:27:00 |7 |Brütten, Harossen |47.4714675601105|8.68249569996186|Winterthur, Archstrasse/HB|2268 |1 |07:16:00 |500 |8 |777 |Bus |\n", - "|26-660-j19-1|8591835 |486.TA.26-660-j19-1.9.R|8591835|07:27:00 |07:27:00 |8 |Brütten, Steighof |47.469621570521 |8.68646625351823|Winterthur, Archstrasse/HB|2268 |1 |07:16:00 |500 |8 |1126 |Bus |\n", - "+------------+---------------+-----------------------+-------+------------+--------------+-------------+------------------------------+----------------+----------------+--------------------------+---------------+------------+--------------------+---------+----------+--------+----------+" + "+-----------------+----------+----------------+----------------+--------------------+\n", + "|stop_id_general_2|stop_int_2|stop_lat_first_2|stop_lon_first_2| stop_name_first_2|\n", + "+-----------------+----------+----------------+----------------+--------------------+\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...|\n", + "| 8503078| 1|47.3454760357765| 8.5930234976511| Waldburg|\n", + "| 8503088| 2|47.3774949037101|8.53916949636064| Zürich HB SZU|\n", + "| 8503376| 8|47.4353132339136| 8.7169371079598|Ottikon b. 
Kemptthal|\n", + "| 8587967| 7|47.2955835709855|8.60393802835468|Erlenbach ZH, Im ...|\n", + "| 8589111| 5| 47.260856991692|8.59230484542371|Horgen, Gumelenst...|\n", + "| 8590819| 6|47.2821844204798|8.57300004996529| Thalwil, Mettli|\n", + "| 8591190| 4|47.3694098744442|8.50635403902719| Zürich, Heuried|\n", + "| 8591284| 3|47.3909246273101|8.47396977303017| Zürich, Neeserweg|\n", + "| 8588312| 9|47.4329251530902|8.68904441838401|Effretikon, Kapel...|\n", + "| 8590541| 10|47.4165824252901|8.61689173475348|Dietlikon, Dornen...|\n", + "| 8590804| 14|47.3972672788911|8.44748743845064| Schlieren, Zentrum|\n", + "| 8591149| 12|47.3642868894025|8.50799795599736| Zürich, Friesenberg|\n", + "| 8591315| 13|47.3510268782417|8.58298033277325| Zürich, Rehalp|\n", + "| 8591362| 11|47.3859132103277|8.54836824487131|Zürich, Seilbahn ...|\n", + "| 8580432| 21|47.4439106426415|8.57842587428213| Kloten, Bramen|\n", + "| 8590273| 16|47.4263923172796|8.36551616876333|Spreitenbach, Fur...|\n", + "| 8590477| 15|47.3254528526241|8.53104872619107| Adliswil, Moos|\n", + "| 8591053| 22|47.4058952214384|8.53778609082291|Zürich, Bad Allen...|\n", + "| 8591080| 18| 47.347588102752|8.53434554328425|Zürich Wollishofe...|\n", + "+-----------------+----------+----------------+----------------+--------------------+\n", + "only showing top 20 rows" ] } ], "source": [ - "stop_times.where(stop_times.route_int==500).show(100, 0)" + "stop_coordinates_for_join = stop_coordinates.select(stop_coordinates.stop_id_general.alias('stop_id_general_2'),\n", + " stop_coordinates.stop_int.alias('stop_int_2'),\n", + " stop_coordinates.stop_lat_first.alias('stop_lat_first_2'),\n", + " stop_coordinates.stop_lon_first.alias('stop_lon_first_2'),\n", + " stop_coordinates.stop_name_first.alias('stop_name_first_2'))\n", + "stop_coordinates_for_join.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Validated on sbb.ch" + "We perform a cross-join (every possible combination of row gets created), then 
drop every row where the stop_id is the same." ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "+-----------+---------------+------------------------+-----------+------------+--------------+-------------+------------------+----------------+----------------+-------------+---------------+------------+--------------------+---------+----------+--------+----------+\n", - "|route_id |stop_id_general|trip_id |stop_id |arrival_time|departure_time|stop_sequence|stop_name |stop_lat |stop_lon |trip_headsign|trip_short_name|direction_id|departure_first_stop|route_int|stop_count|stop_int|route_desc|\n", - "+-----------+---------------+------------------------+-----------+------------+--------------+-------------+------------------+----------------+----------------+-------------+---------------+------------+--------------------+---------+----------+--------+----------+\n", - "|26-24-j19-1|8503016 |494.TA.26-24-j19-1.239.H|8503016:0:4|07:02:00 |07:04:00 |7 |Zürich Flughafen |47.4505627968247|8.56238196330552|Zug |20425 |0 |07:04:00 |200 |12 |1218 |S-Bahn |\n", - "|26-24-j19-1|8503006 |494.TA.26-24-j19-1.239.H|8503006:0:3|07:08:00 |07:09:00 |8 |Zürich Oerlikon |47.412017224139 |8.54411023042399|Zug |20425 |0 |07:04:00 |200 |12 |617 |S-Bahn |\n", - "|26-24-j19-1|8503015 |494.TA.26-24-j19-1.239.H|8503015:0:2|07:11:00 |07:11:00 |9 |Zürich Wipkingen |47.3931504194084|8.52935989345669|Zug |20425 |0 |07:04:00 |200 |12 |661 |S-Bahn |\n", - "|26-24-j19-1|8503000 |494.TA.26-24-j19-1.239.H|8503000:0:4|07:16:00 |07:21:00 |10 |Zürich HB |47.3793319609979|8.54019357578468|Zug |20425 |0 |07:04:00 |200 |12 |1176 
|S-Bahn |\n", - "|26-24-j19-1|8503011 |494.TA.26-24-j19-1.239.H|8503011:0:1|07:24:00 |07:24:00 |11 |Zürich Wiedikon |47.3715939887424|8.52345796203921|Zug |20425 |0 |07:04:00 |200 |12 |724 |S-Bahn |\n", - "|26-24-j19-1|8503010 |494.TA.26-24-j19-1.239.H|8503010:0:1|07:26:00 |07:27:00 |12 |Zürich Enge |47.3642199587519|8.53080618106433|Zug |20425 |0 |07:04:00 |200 |12 |1138 |S-Bahn |\n", - "|26-24-j19-1|8503009 |494.TA.26-24-j19-1.239.H|8503009:0:4|07:29:00 |07:29:00 |13 |Zürich Wollishofen|47.3475028913112|8.53359095844549|Zug |20425 |0 |07:04:00 |200 |12 |921 |S-Bahn |\n", - "|26-24-j19-1|8503200 |494.TA.26-24-j19-1.239.H|8503200:0:1|07:32:00 |07:32:00 |14 |Kilchberg |47.3244907694466|8.54799993560477|Zug |20425 |0 |07:04:00 |200 |12 |1009 |S-Bahn |\n", - "|26-24-j19-1|8503201 |494.TA.26-24-j19-1.239.H|8503201:0:1|07:35:00 |07:35:00 |15 |Rüschlikon |47.3071946406515|8.55515052526735|Zug |20425 |0 |07:04:00 |200 |12 |754 |S-Bahn |\n", - "|26-24-j19-1|8503202 |494.TA.26-24-j19-1.239.H|8503202:0:4|07:37:00 |07:38:00 |16 |Thalwil |47.2961562648419|8.56475351565593|Zug |20425 |0 |07:04:00 |200 |12 |415 |S-Bahn |\n", - "|26-24-j19-1|8502209 |494.TA.26-24-j19-1.239.H|8502209:0:2|07:40:00 |07:40:00 |17 |Oberrieden Dorf |47.2767848038458|8.577635356832 |Zug |20425 |0 |07:04:00 |200 |12 |854 |S-Bahn |\n", - "|26-24-j19-1|8502208 |494.TA.26-24-j19-1.239.H|8502208:0:3|07:43:00 |07:43:00 |18 |Horgen Oberdorf |47.2588085210892|8.58979854578067|Zug |20425 |0 |07:04:00 |200 |12 |163 |S-Bahn |\n", - "|26-24-j19-1|8503016 |496.TA.26-24-j19-1.240.H|8503016:0:4|07:02:00 |07:04:00 |7 |Zürich Flughafen |47.4505627968247|8.56238196330552|Zug |20425 |0 |07:04:00 |200 |12 |1218 |S-Bahn |\n", - "|26-24-j19-1|8503006 |496.TA.26-24-j19-1.240.H|8503006:0:3|07:08:00 |07:09:00 |8 |Zürich Oerlikon |47.412017224139 |8.54411023042399|Zug |20425 |0 |07:04:00 |200 |12 |617 |S-Bahn |\n", - "|26-24-j19-1|8503015 |496.TA.26-24-j19-1.240.H|8503015:0:2|07:11:00 |07:11:00 |9 |Zürich Wipkingen 
|47.3931504194084|8.52935989345669|Zug |20425 |0 |07:04:00 |200 |12 |661 |S-Bahn |\n", - "|26-24-j19-1|8503000 |496.TA.26-24-j19-1.240.H|8503000:0:4|07:16:00 |07:21:00 |10 |Zürich HB |47.3793319609979|8.54019357578468|Zug |20425 |0 |07:04:00 |200 |12 |1176 |S-Bahn |\n", - "|26-24-j19-1|8503011 |496.TA.26-24-j19-1.240.H|8503011:0:1|07:24:00 |07:24:00 |11 |Zürich Wiedikon |47.3715939887424|8.52345796203921|Zug |20425 |0 |07:04:00 |200 |12 |724 |S-Bahn |\n", - "|26-24-j19-1|8503010 |496.TA.26-24-j19-1.240.H|8503010:0:1|07:26:00 |07:27:00 |12 |Zürich Enge |47.3642199587519|8.53080618106433|Zug |20425 |0 |07:04:00 |200 |12 |1138 |S-Bahn |\n", - "|26-24-j19-1|8503009 |496.TA.26-24-j19-1.240.H|8503009:0:4|07:29:00 |07:29:00 |13 |Zürich Wollishofen|47.3475028913112|8.53359095844549|Zug |20425 |0 |07:04:00 |200 |12 |921 |S-Bahn |\n", - "|26-24-j19-1|8503200 |496.TA.26-24-j19-1.240.H|8503200:0:1|07:32:00 |07:32:00 |14 |Kilchberg |47.3244907694466|8.54799993560477|Zug |20425 |0 |07:04:00 |200 |12 |1009 |S-Bahn |\n", - "|26-24-j19-1|8503201 |496.TA.26-24-j19-1.240.H|8503201:0:1|07:35:00 |07:35:00 |15 |Rüschlikon |47.3071946406515|8.55515052526735|Zug |20425 |0 |07:04:00 |200 |12 |754 |S-Bahn |\n", - "|26-24-j19-1|8503202 |496.TA.26-24-j19-1.240.H|8503202:0:4|07:38:00 |07:38:00 |16 |Thalwil |47.2961562648419|8.56475351565593|Zug |20425 |0 |07:04:00 |200 |12 |415 |S-Bahn |\n", - "|26-24-j19-1|8502209 |496.TA.26-24-j19-1.240.H|8502209:0:2|07:40:00 |07:40:00 |17 |Oberrieden Dorf |47.2767848038458|8.577635356832 |Zug |20425 |0 |07:04:00 |200 |12 |854 |S-Bahn |\n", - "|26-24-j19-1|8502208 |496.TA.26-24-j19-1.240.H|8502208:0:3|07:44:00 |07:45:00 |18 |Horgen Oberdorf |47.2588085210892|8.58979854578067|Zug |20425 |0 |07:04:00 |200 |12 |163 |S-Bahn |\n", - 
"+-----------+---------------+------------------------+-----------+------------+--------------+-------------+------------------+----------------+----------------+-------------+---------------+------------+--------------------+---------+----------+--------+----------+" + "+---------------+--------+----------------+----------------+--------------------+-----------------+----------+----------------+----------------+--------------------+\n", + "|stop_id_general|stop_int| stop_lat_first| stop_lon_first| stop_name_first|stop_id_general_2|stop_int_2|stop_lat_first_2|stop_lon_first_2| stop_name_first_2|\n", + "+---------------+--------+----------------+----------------+--------------------+-----------------+----------+----------------+----------------+--------------------+\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8503078| 1|47.3454760357765| 8.5930234976511| Waldburg|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8503088| 2|47.3774949037101|8.53916949636064| Zürich HB SZU|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8503376| 8|47.4353132339136| 8.7169371079598|Ottikon b. 
Kemptthal|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8587967| 7|47.2955835709855|8.60393802835468|Erlenbach ZH, Im ...|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8589111| 5| 47.260856991692|8.59230484542371|Horgen, Gumelenst...|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8590819| 6|47.2821844204798|8.57300004996529| Thalwil, Mettli|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8591190| 4|47.3694098744442|8.50635403902719| Zürich, Heuried|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8591284| 3|47.3909246273101|8.47396977303017| Zürich, Neeserweg|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8588312| 9|47.4329251530902|8.68904441838401|Effretikon, Kapel...|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8590541| 10|47.4165824252901|8.61689173475348|Dietlikon, Dornen...|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8590804| 14|47.3972672788911|8.44748743845064| Schlieren, Zentrum|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8591149| 12|47.3642868894025|8.50799795599736| Zürich, Friesenberg|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8591315| 13|47.3510268782417|8.58298033277325| Zürich, Rehalp|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8591362| 11|47.3859132103277|8.54836824487131|Zürich, Seilbahn ...|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8580432| 21|47.4439106426415|8.57842587428213| Kloten, Bramen|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8590273| 16|47.4263923172796|8.36551616876333|Spreitenbach, Fur...|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8590477| 15|47.3254528526241|8.53104872619107| Adliswil, Moos|\n", 
+ "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8591053| 22|47.4058952214384|8.53778609082291|Zürich, Bad Allen...|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8591080| 18| 47.347588102752|8.53434554328425|Zürich Wollishofe...|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8591085| 17|47.4274011616099|8.54614042296638|Zürich, Birch-/Gl...|\n", + "+---------------+--------+----------------+----------------+--------------------+-----------------+----------+----------------+----------------+--------------------+\n", + "only showing top 20 rows" + ] + } + ], + "source": [ + "stop_coordinates_cross = stop_coordinates.crossJoin(stop_coordinates_for_join)\\\n", + ".filter(F.col('stop_id_general')!=F.col('stop_id_general_2'))\n", + "stop_coordinates_cross.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we compute the distance in meters, as well as the time in seconds it takes to walk this distance" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------------+--------+----------------+----------------+--------------------+-----------------+----------+----------------+----------------+--------------------+---------+\n", + "|stop_id_general|stop_int| stop_lat_first| stop_lon_first| stop_name_first|stop_id_general_2|stop_int_2|stop_lat_first_2|stop_lon_first_2| stop_name_first_2| distance|\n", + 
"+---------------+--------+----------------+----------------+--------------------+-----------------+----------+----------------+----------------+--------------------+---------+\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8503078| 1|47.3454760357765| 8.5930234976511| Waldburg|18.017555|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8503088| 2|47.3774949037101|8.53916949636064| Zürich HB SZU|12.902226|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8503376| 8|47.4353132339136| 8.7169371079598|Ottikon b. Kemptthal| 25.65447|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8587967| 7|47.2955835709855|8.60393802835468|Erlenbach ZH, Im ...|21.665598|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8589111| 5| 47.260856991692|8.59230484542371|Horgen, Gumelenst...|23.627174|\n", + "+---------------+--------+----------------+----------------+--------------------+-----------------+----------+----------------+----------------+--------------------+---------+\n", + "only showing top 5 rows" + ] + } + ], + "source": [ + "# adding distance\n", + "stop_coordinates_cross_distance = stop_coordinates_cross.withColumn(\"distance\", \\\n", + " great_circle_udf(struct(stop_coordinates_cross.stop_lat_first, stop_coordinates_cross.stop_lon_first), \\\n", + " struct(stop_coordinates_cross.stop_lat_first_2, stop_coordinates_cross.stop_lon_first_2)))\n", + "stop_coordinates_cross_distance.show(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "+---------------+--------+----------------+----------------+--------------------+-----------------+----------+----------------+----------------+--------------------+----------+------------+\n", + "|stop_id_general|stop_int| stop_lat_first| stop_lon_first| stop_name_first|stop_id_general_2|stop_int_2|stop_lat_first_2|stop_lon_first_2| stop_name_first_2| distance|walking_time|\n", + "+---------------+--------+----------------+----------------+--------------------+-----------------+----------+----------------+----------------+--------------------+----------+------------+\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8503078| 1|47.3454760357765| 8.5930234976511| Waldburg| 18.017555| 21621|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8503088| 2|47.3774949037101|8.53916949636064| Zürich HB SZU| 12.902226| 15482|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8503376| 8|47.4353132339136| 8.7169371079598|Ottikon b. 
Kemptthal| 25.65447| 30785|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8587967| 7|47.2955835709855|8.60393802835468|Erlenbach ZH, Im ...| 21.665598| 25998|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8589111| 5| 47.260856991692|8.59230484542371|Horgen, Gumelenst...| 23.627174| 28352|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8590819| 6|47.2821844204798|8.57300004996529| Thalwil, Mettli| 20.909445| 25091|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8591190| 4|47.3694098744442|8.50635403902719| Zürich, Heuried| 10.988478| 13186|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8591284| 3|47.3909246273101|8.47396977303017| Zürich, Neeserweg| 7.7776713| 9333|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8588312| 9|47.4329251530902|8.68904441838401|Effretikon, Kapel...| 23.541742| 28250|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8590541| 10|47.4165824252901|8.61689173475348|Dietlikon, Dornen...| 18.036524| 21643|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8590804| 14|47.3972672788911|8.44748743845064| Schlieren, Zentrum| 5.663664| 6796|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8591149| 12|47.3642868894025|8.50799795599736| Zürich, Friesenberg| 11.372528| 13647|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8591315| 13|47.3510268782417|8.58298033277325| Zürich, Rehalp| 17.069794| 20483|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8591362| 11|47.3859132103277|8.54836824487131|Zürich, Seilbahn ...|13.2958765| 15955|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8580432| 21|47.4439106426415|8.57842587428213| Kloten, Bramen| 15.46525| 18558|\n", + "| 8502508| 
0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8590273| 16|47.4263923172796|8.36551616876333|Spreitenbach, Fur...| 1.5007852| 1800|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8590477| 15|47.3254528526241|8.53104872619107| Adliswil, Moos| 15.309975| 18371|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8591053| 22|47.4058952214384|8.53778609082291|Zürich, Bad Allen...|12.1317835| 14558|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8591080| 18| 47.347588102752|8.53434554328425|Zürich Wollishofe...| 14.033888| 16840|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8591085| 17|47.4274011616099|8.54614042296638|Zürich, Birch-/Gl...| 12.780608| 15336|\n", + "+---------------+--------+----------------+----------------+--------------------+-----------------+----------+----------------+----------------+--------------------+----------+------------+\n", + "only showing top 20 rows" + ] + } + ], + "source": [ + "stop_coordinates_cross_distance_time = stop_coordinates_cross_distance.withColumn('walking_time', \n", + " (F.col('distance')/0.05*60).cast('int'))\n", + "stop_coordinates_cross_distance_time.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Removing pairs of stops more than 500 meters (0.5 km) away" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"+---------------+--------+----------------+----------------+--------------------+-----------------+----------+----------------+----------------+--------------------+-----------+------------+\n", + "|stop_id_general|stop_int| stop_lat_first| stop_lon_first| stop_name_first|stop_id_general_2|stop_int_2|stop_lat_first_2|stop_lon_first_2| stop_name_first_2| distance|walking_time|\n", + "+---------------+--------+----------------+----------------+--------------------+-----------------+----------+----------------+----------------+--------------------+-----------+------------+\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8590268| 815|47.4142117310803| 8.3795209040447| Spreitenbach, ASP| 0.22296342| 267|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8590270| 1350|47.4179500849385|8.37208285349115| Spreitenbach, Brüel| 0.4742755| 569|\n", + "| 8503078| 1|47.3454760357765| 8.5930234976511| Waldburg| 8591903| 63|47.3483489130904|8.59604183700617|Zollikerberg, Spital| 0.39212397| 470|\n", + "| 8503078| 1|47.3454760357765| 8.5930234976511| Waldburg| 8591023| 242|47.3468211948325|8.59815287792414|Zollikerb., Langä...| 0.41439477| 497|\n", + "| 8503078| 1|47.3454760357765| 8.5930234976511| Waldburg| 8590879| 551|47.3453908209261|8.59330197538922| Waldburg, Station|0.023021813| 27|\n", + "| 8503078| 1|47.3454760357765| 8.5930234976511| Waldburg| 8503077| 705|47.3473202949034|8.59679642184493| Spital Zollikerberg| 0.35051093| 420|\n", + "| 8503078| 1|47.3454760357765| 8.5930234976511| Waldburg| 8576189| 1001|47.3457012457904|8.58848700546567|Zollikon, Rebwies...| 0.34270898| 411|\n", + "| 8503088| 2|47.3774949037101|8.53916949636064| Zürich HB SZU| 8591327| 48|47.3743498590472|8.54323886459827|Zürich, Rudolf-Br...| 0.46496668| 557|\n", + "| 8503088| 2|47.3774949037101|8.53916949636064| Zürich HB SZU| 8588078| 272|47.3768440104406|8.54393955051998| Zürich, Central| 0.36639458| 439|\n", + "| 8503088| 
2|47.3774949037101|8.53916949636064| Zürich HB SZU| 8591316| 373|47.3730662375955|8.53845982728609| Zürich, Rennweg| 0.4953369| 594|\n", + "| 8503088| 2|47.3774949037101|8.53916949636064| Zürich HB SZU| 8591367| 461|47.3760957774096|8.53420181283877| Zürich, Sihlpost/HB| 0.40512198| 486|\n", + "| 8503088| 2|47.3774949037101|8.53916949636064| Zürich HB SZU| 8587349| 602|47.3775618175159|8.54173867807358|Zürich, Bahnhofqu...| 0.19359568| 232|\n", + "| 8503088| 2|47.3774949037101|8.53916949636064| Zürich HB SZU| 8591067| 616|47.3765581015114|8.53994204750509|Zürich, Bahnhofst...| 0.11930997| 143|\n", + "| 8503088| 2|47.3774949037101|8.53916949636064| Zürich HB SZU| 8503446| 785|47.3788453295376|8.54171172861506| Zürich Landesmuseum| 0.24329051| 291|\n", + "| 8503088| 2|47.3774949037101|8.53916949636064| Zürich HB SZU| 8596126| 820|47.3810655490734|8.53730100056942| Zürich, Bus Station| 0.42122737| 505|\n", + "| 8503088| 2|47.3774949037101|8.53916949636064| Zürich HB SZU| 8591368| 880|47.3798733332196|8.53760642776606| Zürich, Sihlquai/HB| 0.28947487| 347|\n", + "| 8503088| 2|47.3774949037101|8.53916949636064| Zürich HB SZU| 8591174| 947|47.3796482690385| 8.5445144723019| Zürich, Haldenegg| 0.46829802| 561|\n", + "| 8503088| 2|47.3774949037101|8.53916949636064| Zürich HB SZU| 8591379| 1033| 47.379958492389|8.54280767326183|Zürich, Stampfenb...| 0.38740817| 464|\n", + "| 8503088| 2|47.3774949037101|8.53916949636064| Zürich HB SZU| 8503000| 1173|47.3782978637762|8.54019357578468| Zürich HB| 0.1179737| 141|\n", + "| 8503088| 2|47.3774949037101|8.53916949636064| Zürich HB SZU| 8587348| 1196|47.3772394138519|8.53934017626465|Zürich, Bahnhofpl...|0.031180955| 37|\n", + "+---------------+--------+----------------+----------------+--------------------+-----------------+----------+----------------+----------------+--------------------+-----------+------------+\n", + "only showing top 20 rows" + ] + } + ], + "source": [ + "stop_coordinates_cross_distance_time_filtered = 
stop_coordinates_cross_distance_time.filter(F.col('distance')<=0.5)\n", + "stop_coordinates_cross_distance_time_filtered.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Sorting by stop_int, then stop_int 2 to sort it as `transfers` in RAPTOR." + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "stop_coordinates_cross_distance_time_filtered_sorted = stop_coordinates_cross_distance_time_filtered.sort(F.col('stop_int').cast('int'), F.col('stop_int_2').cast('int'))\\\n", + ".withColumn('monotonically_increasing_id', F.monotonically_increasing_id())" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "stop_coordinates_cross_distance_time_filtered_sorted.write.csv('data/lgpt_guys/transfers_cyril.csv', header=True, mode='overwrite')" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + 
"stop_coordinates_cross_distance_time_filtered_sorted = spark.read.csv('data/lgpt_guys/transfers_cyril.csv', header=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "6264" + ] + } + ], + "source": [ + "stop_coordinates_cross_distance_time_filtered_sorted.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------------+--------+----------------+----------------+--------------------+-----------------+----------+----------------+----------------+--------------------+----------+------------+---------------------------+\n", + "|stop_id_general|stop_int| stop_lat_first| stop_lon_first| stop_name_first|stop_id_general_2|stop_int_2|stop_lat_first_2|stop_lon_first_2| stop_name_first_2| distance|walking_time|monotonically_increasing_id|\n", + "+---------------+--------+----------------+----------------+--------------------+-----------------+----------+----------------+----------------+--------------------+----------+------------+---------------------------+\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8590268| 815|47.4142117310803| 8.3795209040447| Spreitenbach, 
ASP|0.22296342| 267| 0|\n", + "| 8502508| 0|47.4154457211288|8.37718528430566|Spreitenbach, Rai...| 8590270| 1350|47.4179500849385|8.37208285349115| Spreitenbach, Brüel| 0.4742755| 569| 1|\n", + "| 8503088| 2|47.3774949037101|8.53916949636064| Zürich HB SZU| 8591367| 461|47.3760957774096|8.53420181283877| Zürich, Sihlpost/HB|0.40512198| 486| 10|\n", + "| 8590474| 819|47.3129444045737|8.52782377432064| Adliswil, Krone| 8595717| 1241|47.3129504950936|8.53005159622556|Adliswil, Grundst...|0.16795586| 201| 1005022347264|\n", + "| 8596126| 820|47.3810655490734|8.53730100056942| Zürich, Bus Station| 8503088| 2|47.3774949037101|8.53916949636064| Zürich HB SZU|0.42122737| 505| 1005022347265|\n", + "| 8596126| 820|47.3810655490734|8.53730100056942| Zürich, Bus Station| 8591282| 138|47.3821239221899|8.53493843137185|Zürich, Museum fü...| 0.2132881| 255| 1005022347266|\n", + "| 8596126| 820|47.3810655490734|8.53730100056942| Zürich, Bus Station| 8591071| 543|47.3840824646745|8.54044510406427| Zürich, Beckenhof| 0.4105776| 492| 1005022347267|\n", + "| 8596126| 820|47.3810655490734|8.53730100056942| Zürich, Bus Station| 8503446| 785|47.3788453295376|8.54171172861506| Zürich Landesmuseum| 0.413811| 496| 1005022347268|\n", + "| 8596126| 820|47.3810655490734|8.53730100056942| Zürich, Bus Station| 8591368| 880|47.3798733332196|8.53760642776606| Zürich, Sihlquai/HB|0.13454837| 161| 1005022347269|\n", + "| 8596126| 820|47.3810655490734|8.53730100056942| Zürich, Bus Station| 8591379| 1033| 47.379958492389|8.54280767326183|Zürich, Stampfenb...|0.43250278| 519| 1005022347270|\n", + "| 8596126| 820|47.3810655490734|8.53730100056942| Zürich, Bus Station| 8503000| 1173|47.3782978637762|8.54019357578468| Zürich HB|0.37702298| 452| 1005022347271|\n", + "| 8596126| 820|47.3810655490734|8.53730100056942| Zürich, Bus Station| 8587348| 1196|47.3772394138519|8.53934017626465|Zürich, Bahnhofpl...| 0.4523052| 542| 1005022347272|\n", + "| 8591354| 821|47.4218644344265|8.54977859986757| Zürich, 
Seebach| 8591048| 160|47.4245143567597| 8.548556891081|Zürich, Ausserdor...| 0.3086605| 370| 1005022347273|\n", + "| 8591354| 821|47.4218644344265|8.54977859986757| Zürich, Seebach| 8591355| 273| 47.419864748766|8.54836824487131|Zürich, Seebacher...|0.24637553| 295| 1005022347274|\n", + "| 8591354| 821|47.4218644344265|8.54977859986757| Zürich, Seebach| 8591187| 940|47.4228794433749|8.54349039287786|Zürich, Hertenste...|0.48635942| 583| 1005022347275|\n", + "| 8591354| 821|47.4218644344265|8.54977859986757| Zürich, Seebach| 8591124| 1250|47.4254624617068|8.55363237243698| Zürich, Ettenfeld| 0.4940879| 592| 1005022347276|\n", + "| 8591407| 822|47.4245872885157|8.51184274541392|Zürich, Unteraffo...| 8503008| 211|47.4210317478027|8.50856389462643| Zürich Affoltern|0.46600202| 559| 1005022347277|\n", + "| 8591407| 822|47.4245872885157|8.51184274541392|Zürich, Unteraffo...| 8591153| 465|47.4219616816404|8.51004611484544| Zürich, Fronwald|0.32172477| 386| 1005022347278|\n", + "| 8591062| 824|47.4125400235641|8.54190935797759|Zürich Oerlikon, ...| 8591272| 82|47.4140476021462|8.54133443619567|Zürich, Max-Bill-...| 0.1731274| 207| 1005022347279|\n", + "| 8591062| 824|47.4125400235641|8.54190935797759|Zürich Oerlikon, ...| 8591063| 357| 47.413336370854|8.54584397892258|Zürich Oerlikon, ...| 0.3090261| 370| 1005022347280|\n", + "+---------------+--------+----------------+----------------+--------------------+-----------------+----------+----------------+----------------+--------------------+----------+------------+---------------------------+\n", + "only showing top 20 rows" + ] + } + ], + "source": [ + "stop_coordinates_cross_distance_time_filtered_sorted.sort(F.col('monotonically_increasing_id')).show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Building lookup tables to pretty-print results after RAPTOR:" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [ + { + "data": { + 
"application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "stop_times = spark.read.csv('data/lgpt_guys/stop_times_final_cyril.csv', header=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# lookup table for routes" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------+--------------+-------------+--------------------+\n", + "|route_int| route_id| route_desc| trip_headsign|\n", + "+---------+--------------+-------------+--------------------+\n", + "| 0| 26-10-j19-1| Tram|Zürich Flughafen,...|\n", + "| 1| 26-13-j19-1| Tram| Zürich, Albisgütli|\n", + "| 2| 26-11-A-j19-1| Tram| Zürich, Rehalp|\n", + "| 3| 26-304-j19-1| Bus| Dietikon, Bahnhof|\n", + "| 4| 26-70-A-j19-1| Bus|Zürich, Mittellei...|\n", + "| 5| 26-61-j19-1| Bus|Zürich, Schwamend...|\n", + "| 6| 26-703-j19-1| Bus| Zürich, Klusplatz|\n", + "| 7| 26-652-j19-1| Bus| Effretikon, Bahnhof|\n", + "| 8| 26-83-j19-1| 
Bus|Zürich Altstetten...|\n", + "| 9| 26-749-j19-1| Bus| Dietlikon, Bahnhof|\n", + "| 10| 26-24-j19-1| S-Bahn| Weinfelden|\n", + "| 11| 26-640-j19-1| Bus| Neschwil, Post|\n", + "| 12| 26-725-j19-1| Bus| Volketswil, Dorf|\n", + "| 13| 79-736-j19-1| Bus|Zürich Flughafen,...|\n", + "| 14| 26-24-j19-1| S-Bahn| Weinfelden|\n", + "| 15|79-373-2-j19-1| Schiff| Küsnacht ZSG|\n", + "| 16| 26-658-j19-1| Bus| Effretikon, Bahnhof|\n", + "| 17| 26-449-j19-1| Bus|Buchs-Dällikon, B...|\n", + "| 18| 26-303-j19-1| Bus| Killwangen, Bahnhof|\n", + "| 19| 26-17-j19-1| Tram|Zürich, Bahnhofpl...|\n", + "| 20| 26-7-A-j19-1| S-Bahn| Winterthur|\n", + "| 21| 26-24-j19-1| S-Bahn| Zug|\n", + "| 22| 26-720-j19-1| Bus| Effretikon, Bahnhof|\n", + "| 23| 26-145-j19-1| Bus| Thalwil, Zentrum|\n", + "| 24| 26-185-j19-1| Bus|Zürich Wollishofe...|\n", + "| 25| 26-14-j19-1| S-Bahn| Hinwil|\n", + "| 26| 26-8-A-j19-1| S-Bahn| Pfäffikon SZ|\n", + "| 27| 1-322-j19-1| Bus|Baden, Postautost...|\n", + "| 28|26-962-A-j19-1| Bus|Erlenbach ZH, Bah...|\n", + "| 29| 26-301-j19-1| Bus| Dietikon, Bahnhof|\n", + "| 30| 26-533-j19-1| Taxi|Niederhasli, Nass...|\n", + "| 31| 26-78-j19-1| Bus| Zürich, Bändliweg|\n", + "| 32| 26-4-j19-1| Tram|Zürich Altstetten...|\n", + "| 33| 26-89-j19-1| Bus| Zürich, Heizenholz|\n", + "| 34| 26-15-A-j19-1| Tram|Zürich, Bucheggplatz|\n", + "| 35| 26-25-A-j19-1|Standseilbahn| Zürich, Dolder|\n", + "| 36| 26-919-j19-1| Bus|Zumikon, Dorfzentrum|\n", + "| 37| 26-302-j19-1| Bus| Urdorf, Oberurdorf|\n", + "| 38| 1-444-j19-1| Bus|Bremgarten AG, Ob...|\n", + "| 39| 26-8-C-j19-1| Tram| Zürich, Klusplatz|\n", + "| 40| 79-18-A-j19-1| Bus| Forch, Bahnhof|\n", + "| 41| 26-18-j19-1| S-Bahn|Zürich Stadelhofe...|\n", + "| 42| 26-2-j19-1| S-Bahn| Ziegelbrücke|\n", + "| 43| 26-8-A-j19-1| S-Bahn| Pfäffikon SZ|\n", + "| 44| 26-726-j19-1| Bus|Schwerzenbach ZH,...|\n", + "| 44| 26-721-j19-1| Bus|Schwerzenbach ZH,...|\n", + "| 45| 26-456-j19-1| Bus|Regensdorf-Watt, ...|\n", + "| 46| 26-311-j19-1| Bus| 
Dietikon, Bahnhof|\n", + "| 47| 26-772-j19-1| Bus|Wallisellen, Schu...|\n", + "| 48| 26-17-j19-1| Tram|Zürich Wiedikon, ...|\n", + "| 49| 26-21-j19-1| S-Bahn| Regensdorf-Watt|\n", + "| 50| 26-7-A-j19-1| S-Bahn| Winterthur|\n", + "| 51| 26-845-j19-1| Bus|Gossau ZH, Mittel...|\n", + "| 51| 26-845-j19-1| Bus| Grüningen, Adler|\n", + "| 52| 26-768-j19-1| Bus|Zürich Flughafen,...|\n", + "| 53| 26-13-j19-1| Tram| Zürich, Frankental|\n", + "| 54| 26-2-A-j19-1| Tram|Zürich Tiefenbrun...|\n", + "| 55| 26-36-j19-1| InterRegio| Basel SBB|\n", + "| 55| 26-19-j19-1| S-Bahn| Koblenz|\n", + "| 55| 26-19-j19-1| S-Bahn| Dietikon|\n", + "| 56| 1-445-j19-1| Bus|Zürich Enge, Bahn...|\n", + "| 57| 26-451-j19-1| Bus|Adlikon b. R., Le...|\n", + "| 58| 26-2-A-j19-1| Tram|Zürich Tiefenbrun...|\n", + "| 59| 26-304-j19-1| Bus| Dietikon, Bahnhof|\n", + "| 60| 26-14-A-j19-1| Tram| Zürich, Seebach|\n", + "| 61| 26-703-j19-1| Bus| Zürich, Klusplatz|\n", + "| 62| 26-10-B-j19-1| S-Bahn| Uetliberg|\n", + "| 63| 26-787-j19-1| Bus|Brüttisellen, Ob....|\n", + "| 64| 26-743-j19-1| Bus| Stettbach, Bahnhof|\n", + "| 65| 26-771-j19-1| Bus|Wallisellen, Bahnhof|\n", + "| 66| 26-38-j19-1| Bus| Zürich, Waidspital|\n", + "| 67| 26-14-A-j19-1| Tram| Zürich, Triemli|\n", + "| 68|79-373-2-j19-1| Schiff| Thalwil ZSG|\n", + "| 69| 26-845-j19-1| Bus| Uster, Bahnhof|\n", + "| 70| 26-452-j19-1| Bus|Regensdorf, Moosä...|\n", + "| 71| 26-813-j19-1| Bus| Uster, Bahnhof|\n", + "| 72| 26-31-j19-1| Bus| Zürich, Farbhof|\n", + "| 73| 26-6-A-j19-1| S-Bahn| Baden|\n", + "| 74| 26-14-A-j19-1| Tram| Zürich, Seebach|\n", + "| 75| 26-811-j19-1| Bus| Uster, Bahnhof|\n", + "| 76| 26-652-j19-1| Bus| Effretikon, Bahnhof|\n", + "| 77| 26-726-j19-1| Bus| Volketswil, Dorf|\n", + "| 78| 26-165-j19-1| Bus|Rüschlikon, Park ...|\n", + "| 79| 26-31-j19-1| Bus|Zürich, Kienasten...|\n", + "| 80| 26-75-A-j19-1| Bus|Zürich, Schwamend...|\n", + "| 81| 26-10-B-j19-1| S-Bahn| Zürich Selnau|\n", + "| 82| 79-10-B-j19-1| S-Bahn| Zürich Triemli|\n", + 
"| 83| 26-787-j19-1| Bus|Zürich Oerlikon, ...|\n", + "| 84| 26-35-B-j19-1| Bus| Zürich, Solidapark|\n", + "| 85| 26-89-j19-1| Bus|Zürich Altstetten...|\n", + "| 86| 26-6-B-j19-1| Tram| Zürich, Zoo|\n", + "| 87| 26-40-j19-1| Bus| Zürich, Seebach|\n", + "| 88| 26-19-j19-1| S-Bahn| Zürich HB|\n", + "| 88|80-160-Y-j19-1| RegioExpress| Zürich HB|\n", + "| 89| 26-24-j19-1| S-Bahn| Schaffhausen|\n", + "| 90| 26-24-j19-1| S-Bahn| Schaffhausen|\n", + "| 91| 26-31-j19-1| Bus|Zürich Altstetten...|\n", + "| 92| 26-3-A-j19-1| Tram| Zürich, Klusplatz|\n", + "| 93| 26-40-j19-1| Bus|Zürich, Glaubtens...|\n", + "| 94| 26-24-j19-1| S-Bahn| Zug|\n", + "+---------+--------------+-------------+--------------------+\n", + "only showing top 100 rows" + ] + } + ], + "source": [ + "stop_times.select(stop_times.route_int, stop_times.route_id, stop_times.route_desc, stop_times.trip_headsign)\\\n", + ".dropDuplicates()\\\n", + ".sort(F.col('route_int').cast('int'))\\\n", + ".show(100)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Verifying a few routes and trips on real data" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------+------------+--------------+---------------------------+\n", + "|route_int|arrival_time|departure_time|monotonically_increasing_id|\n", + "+---------+------------+--------------+---------------------------+\n", + "| 0| 07:00:00| 07:00:00| 0|\n", + "| 0| 07:01:00| 07:01:00| 1|\n", + "| 0| 07:02:00| 07:02:00| 2|\n", + "| 0| 07:03:00| 07:03:00| 3|\n", + "| 0| 07:05:00| 07:05:00| 4|\n", + "| 0| 
07:06:00| 07:06:00| 5|\n", + "| 0| 07:08:00| 07:08:00| 6|\n", + "| 0| 07:09:00| 07:09:00| 7|\n", + "| 0| 07:10:00| 07:10:00| 8|\n", + "| 0| 07:11:00| 07:11:00| 9|\n", + "| 0| 07:12:00| 07:12:00| 10|\n", + "| 0| 07:14:00| 07:14:00| 11|\n", + "| 0| 07:16:00| 07:16:00| 12|\n", + "| 0| 07:18:00| 07:18:00| 13|\n", + "| 0| 07:20:00| 07:20:00| 14|\n", + "| 0| 07:22:00| 07:22:00| 15|\n", + "| 0| 07:23:00| 07:23:00| 16|\n", + "| 0| 07:24:00| 07:24:00| 17|\n", + "| 0| 07:25:00| 07:25:00| 18|\n", + "| 0| 07:27:00| 07:27:00| 19|\n", + "+---------+------------+--------------+---------------------------+\n", + "only showing top 20 rows" + ] + } + ], + "source": [ + "stop_times = spark.read.csv('data/lgpt_guys/stop_times_final_cyril.csv', header=True)\n", + "stop_times.sort(stop_times.route_int.cast('int'), \n", + " stop_times.departure_first_stop, \n", + " stop_times.trip_id, \n", + " stop_times.stop_sequence.cast('int'))\\\n", + ".select(stop_times.route_int, stop_times.arrival_time, stop_times.departure_time, stop_times.monotonically_increasing_id)\\\n", + ".show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Casting spark's `monotonically_increasing_id` as `LongType` (and not `IntegerType`) is paramount to make sure the sort happens as expected. Otherwise, there are not enough bytes to represent the index as an integer." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------------+---------------+------------------------+-------+------------+--------------+-------------+------------------------------+----------------+----------------+---------------------------+---------------+------------+--------------------+---------+----------+--------+----------+---------------------------+\n", + "|route_id |stop_id_general|trip_id |stop_id|arrival_time|departure_time|stop_sequence|stop_name |stop_lat |stop_lon |trip_headsign |trip_short_name|direction_id|departure_first_stop|route_int|stop_count|stop_int|route_desc|monotonically_increasing_id|\n", + "+-------------+---------------+------------------------+-------+------------+--------------+-------------+------------------------------+----------------+----------------+---------------------------+---------------+------------+--------------------+---------+----------+--------+----------+---------------------------+\n", + "|26-13-j19-1 |8576240 |2064.TA.26-13-j19-1.24.H|8576240|07:00:00 |07:00:00 |5 |Zürich, Meierhofplatz |47.4020100860391|8.49937412926861|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |1221 |Tram |0 |\n", + "|26-13-j19-1 |8591353 |2064.TA.26-13-j19-1.24.H|8591353|07:01:00 |07:01:00 |6 |Zürich, Schwert |47.3997299435837|8.50461130737576|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |816 |Tram |1 |\n", + "|26-13-j19-1 |8591039 |2064.TA.26-13-j19-1.24.H|8591039|07:02:00 |07:02:00 |7 |Zürich, Alte Trotte |47.3977659017765|8.50725235431143|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |776 |Tram |2 
|\n", + "|26-13-j19-1 |8591121 |2064.TA.26-13-j19-1.24.H|8591121|07:03:00 |07:03:00 |8 |Zürich, Eschergutweg |47.3962700189648|8.51204037477646|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |307 |Tram |3 |\n", + "|26-13-j19-1 |8591417 |2064.TA.26-13-j19-1.24.H|8591417|07:05:00 |07:05:00 |9 |Zürich, Waidfussweg |47.3954977376399|8.51840044698891|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |347 |Tram |4 |\n", + "|26-13-j19-1 |8591437 |2064.TA.26-13-j19-1.24.H|8591437|07:06:00 |07:06:00 |10 |Zürich, Wipkingerplatz |47.3925909395293|8.52357474302616|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |1015 |Tram |5 |\n", + "|26-13-j19-1 |8580522 |2064.TA.26-13-j19-1.24.H|8580522|07:08:00 |07:08:00 |11 |Zürich, Escher-Wyss-Platz |47.3907969150758|8.5223979500038 |Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |455 |Tram |6 |\n", + "|26-13-j19-1 |8591110 |2064.TA.26-13-j19-1.24.H|8591110|07:09:00 |07:09:00 |12 |Zürich, Dammweg |47.3884919601296|8.52639545301869|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |1102 |Tram |7 |\n", + "|26-13-j19-1 |8591306 |2064.TA.26-13-j19-1.24.H|8591306|07:10:00 |07:10:00 |13 |Zürich, Quellenstrasse |47.3867403702341|8.52874903906341|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |786 |Tram |8 |\n", + "|26-13-j19-1 |8591257 |2064.TA.26-13-j19-1.24.H|8591257|07:11:00 |07:11:00 |14 |Zürich, Limmatplatz |47.3845994590919|8.53162364797299|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |388 |Tram |9 |\n", + "|26-13-j19-1 |8591282 |2064.TA.26-13-j19-1.24.H|8591282|07:12:00 |07:12:00 |15 |Zürich, Museum für Gestaltung |47.3821239221899|8.53493843137185|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |138 |Tram |10 |\n", + "|26-13-j19-1 |8591368 |2064.TA.26-13-j19-1.24.H|8591368|07:14:00 |07:14:00 |16 |Zürich, Sihlquai/HB |47.3798733332196|8.53760642776606|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |880 |Tram |11 |\n", + "|26-13-j19-1 |8587349 |2064.TA.26-13-j19-1.24.H|8587349|07:16:00 |07:16:00 |17 |Zürich, Bahnhofquai/HB 
|47.3775618175159|8.54173867807358|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |602 |Tram |12 |\n", + "|26-13-j19-1 |8591067 |2064.TA.26-13-j19-1.24.H|8591067|07:18:00 |07:18:00 |18 |Zürich, Bahnhofstrasse/HB |47.3765581015114|8.53994204750509|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |616 |Tram |13 |\n", + "|26-13-j19-1 |8591316 |2064.TA.26-13-j19-1.24.H|8591316|07:20:00 |07:20:00 |19 |Zürich, Rennweg |47.3730662375955|8.53845982728609|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |373 |Tram |14 |\n", + "|26-13-j19-1 |8591299 |2064.TA.26-13-j19-1.24.H|8591299|07:22:00 |07:22:00 |20 |Zürich, Paradeplatz |47.3693672863583|8.53876525448273|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |1223 |Tram |15 |\n", + "|26-13-j19-1 |8591384 |2064.TA.26-13-j19-1.24.H|8591384|07:23:00 |07:23:00 |21 |Zürich, Stockerstrasse |47.3677002399791|8.53501029659459|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |871 |Tram |16 |\n", + "|26-13-j19-1 |8591404 |2064.TA.26-13-j19-1.24.H|8591404|07:24:00 |07:24:00 |22 |Zürich, Tunnelstrasse |47.3661426599847|8.53253094641008|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |890 |Tram |17 |\n", + "|26-13-j19-1 |8591059 |2064.TA.26-13-j19-1.24.H|8591059|07:25:00 |07:25:00 |23 |Zürich Enge, Bahnhof/Bederstr.|47.3645546111557|8.53045583810347|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |177 |Tram |18 |\n", + "|26-13-j19-1 |8591415 |2064.TA.26-13-j19-1.24.H|8591415|07:27:00 |07:27:00 |24 |Zürich, Waffenplatzstrasse |47.3614818138862|8.52574866601403|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |1265 |Tram |19 |\n", + "|26-13-j19-1 |8591366 |2064.TA.26-13-j19-1.24.H|8591366|07:28:00 |07:28:00 |25 |Zürich, Sihlcity Nord |47.3600640074787|8.52303575385561|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |968 |Tram |20 |\n", + "|26-13-j19-1 |8591329 |2064.TA.26-13-j19-1.24.H|8591329|07:29:00 |07:29:00 |26 |Zürich, Saalsporthalle |47.3578611597087|8.52040369007277|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |1238 |Tram |21 |\n", + "|26-13-j19-1 |8591245 
|2064.TA.26-13-j19-1.24.H|8591245|07:30:00 |07:30:00 |27 |Zürich, Laubegg |47.3587313564196|8.51708890667391|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |1216 |Tram |22 |\n", + "|26-13-j19-1 |8591405 |2064.TA.26-13-j19-1.24.H|8591405|07:32:00 |07:32:00 |28 |Zürich, Uetlihof |47.3567353594536|8.51396276948474|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |847 |Tram |23 |\n", + "|26-13-j19-1 |8591385 |2064.TA.26-13-j19-1.24.H|8591385|07:33:00 |07:33:00 |29 |Zürich, Strassenverkehrsamt |47.3530717783138|8.51171698127413|Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |275 |Tram |24 |\n", + "|26-13-j19-1 |8591034 |2064.TA.26-13-j19-1.24.H|8591034|07:34:00 |07:34:00 |30 |Zürich, Albisgütli |47.3519945640447|8.5077104951064 |Zürich, Albisgütli |1831 |0 |07:00:00 |0 |26 |1352 |Tram |25 |\n", + "|26-11-A-j19-1|8591049 |791.TA.26-11-A-j19-1.3.H|8591049|19:49:00 |19:49:00 |1 |Zürich, Auzelg |47.4166918393693|8.568113214819 |Zürich, Rehalp |363 |0 |19:49:00 |1 |8 |1117 |Tram |26 |\n", + "|26-11-A-j19-1|8591128 |791.TA.26-11-A-j19-1.3.H|8591128|19:51:00 |19:51:00 |2 |Zürich, Fernsehstudio |47.4181749855684|8.56174415945371|Zürich, Rehalp |363 |0 |19:49:00 |1 |8 |156 |Tram |27 |\n", + "|26-11-A-j19-1|8591830 |791.TA.26-11-A-j19-1.3.H|8591830|19:52:00 |19:52:00 |3 |Glattpark, Glattpark |47.4199559214972|8.55716275150406|Zürich, Rehalp |363 |0 |19:49:00 |1 |8 |671 |Tram |28 |\n", + "|26-11-A-j19-1|8591294 |791.TA.26-11-A-j19-1.3.H|8591294|19:53:00 |19:53:00 |4 |Zürich, Oerlikerhus |47.4175853791724|8.5542072942189 |Zürich, Rehalp |363 |0 |19:49:00 |1 |8 |573 |Tram |29 |\n", + "|26-11-A-j19-1|8591256 |791.TA.26-11-A-j19-1.3.H|8591256|19:54:00 |19:54:00 |5 |Zürich, Leutschenbach |47.4146433269471|8.55130573585079|Zürich, Rehalp |363 |0 |19:49:00 |1 |8 |444 |Tram |30 |\n", + "|26-11-A-j19-1|8591273 |791.TA.26-11-A-j19-1.3.H|8591273|19:55:00 |19:55:00 |6 |Zürich, Messe/Hallenstadion |47.4106919651348|8.55068589830466|Zürich, Rehalp |363 |0 |19:49:00 |1 |8 |389 |Tram |31 |\n", + 
"|26-11-A-j19-1|8591382 |791.TA.26-11-A-j19-1.3.H|8591382|19:57:00 |19:57:00 |7 |Zürich, Sternen Oerlikon |47.4100718783688|8.54623025449481|Zürich, Rehalp |363 |0 |19:49:00 |1 |8 |687 |Tram |32 |\n", + "|26-11-A-j19-1|8580449 |791.TA.26-11-A-j19-1.3.H|8580449|19:59:00 |19:59:00 |8 |Zürich Oerlikon, Bahnhof |47.411494419524 |8.54479295004002|Zürich, Rehalp |363 |0 |19:49:00 |1 |8 |766 |Tram |33 |\n", + "|26-304-j19-1 |8591057 |159.TA.26-304-j19-1.4.R |8591057|19:39:00 |19:39:00 |1 |Zürich Altstetten, Bahnhof N |47.392067942097 |8.48990588617267|Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |521 |Bus |34 |\n", + "|26-304-j19-1 |8591402 |159.TA.26-304-j19-1.4.R |8591402|19:41:00 |19:41:00 |2 |Zürich, Tüffenwies |47.3979787271809|8.49434356367684|Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |1000 |Bus |35 |\n", + "|26-304-j19-1 |8591434 |159.TA.26-304-j19-1.4.R |8591434|19:41:00 |19:41:00 |3 |Zürich, Winzerhalde |47.4000582901792|8.4945681424979 |Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |711 |Bus |36 |\n", + "|26-304-j19-1 |8591197 |159.TA.26-304-j19-1.4.R |8591197|19:42:00 |19:42:00 |4 |Zürich, Hohenklingensteig |47.4013473348052|8.49021131336931|Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |1125 |Bus |37 |\n", + "|26-304-j19-1 |8591436 |159.TA.26-304-j19-1.4.R |8591436|19:43:00 |19:43:00 |5 |Zürich, Winzerstrasse Süd |47.403372044054 |8.486123978826 |Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |149 |Bus |38 |\n", + "|26-304-j19-1 |8591136 |159.TA.26-304-j19-1.4.R |8591136|19:46:00 |19:46:00 |6 |Zürich, Frankental |47.4057006674825|8.48137189097235|Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |690 |Bus |39 |\n", + "|26-304-j19-1 |8590725 |159.TA.26-304-j19-1.4.R |8590725|19:47:00 |19:47:00 |7 |Oberengstringen, Eggbühl |47.4055243523393|8.47408655401713|Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |1329 |Bus |40 |\n", + "|26-304-j19-1 |8590726 |159.TA.26-304-j19-1.4.R |8590726|19:48:00 |19:48:00 |8 |Oberengstringen, Lanzrain |47.407342193939 |8.46795106062573|Dietikon, 
Bahnhof |5481 |1 |19:39:00 |2 |17 |763 |Bus |41 |\n", + "|26-304-j19-1 |8590728 |159.TA.26-304-j19-1.4.R |8590728|19:49:00 |19:49:00 |9 |Oberengstringen, Zentrum |47.4091295756792|8.46260608468448|Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |376 |Bus |42 |\n", + "|26-304-j19-1 |8590727 |159.TA.26-304-j19-1.4.R |8590727|19:50:00 |19:50:00 |10 |Oberengstringen, Paradies |47.4104852703573|8.45874332896223|Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |740 |Bus |43 |\n", + "|26-304-j19-1 |8590833 |159.TA.26-304-j19-1.4.R |8590833|19:51:00 |19:51:00 |11 |Unterengstringen, Langacher |47.4122360710415|8.45316479104707|Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |764 |Bus |44 |\n", + "|26-304-j19-1 |8594732 |159.TA.26-304-j19-1.4.R |8594732|19:53:00 |19:53:00 |12 |Unterengstringen, Sennenbüel N|47.4134944230824|8.44931101847766|Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |987 |Bus |45 |\n", + "|26-304-j19-1 |8590831 |159.TA.26-304-j19-1.4.R |8590831|19:53:00 |19:53:00 |13 |Unterengstringen, Aegelsee |47.414977659342 |8.44603216769017|Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |502 |Bus |46 |\n", + "|26-304-j19-1 |8590911 |159.TA.26-304-j19-1.4.R |8590911|19:55:00 |19:55:00 |14 |Weiningen ZH, Ausserdorf |47.4176826342903|8.43953734818508|Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |754 |Bus |47 |\n", + "|26-304-j19-1 |8590913 |159.TA.26-304-j19-1.4.R |8590913|19:56:00 |19:56:00 |15 |Weiningen ZH, Lindenplatz |47.4195547602987|8.43394084396424|Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |934 |Bus |48 |\n", + "|26-304-j19-1 |8590914 |159.TA.26-304-j19-1.4.R |8590914|19:57:00 |19:57:00 |16 |Weiningen ZH, Schulhaus |47.4183512583635|8.42866773324572|Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |622 |Bus |49 |\n", + "|26-304-j19-1 |8590617 |159.TA.26-304-j19-1.4.R |8590617|19:59:00 |19:59:00 |17 |Geroldswil, Welbrig |47.4180716529658|8.41906474285715|Dietikon, Bahnhof |5481 |1 |19:39:00 |2 |17 |859 |Bus |50 |\n", + "|26-61-j19-1 |8591281 |269.TA.26-61-j19-1.1.H |8591281|19:57:00 
|19:57:00 |1 |Zürich, Mühlacker |47.4256326325821|8.49799970688372|Zürich, Schwamendingerplatz|2076 |0 |19:57:00 |3 |2 |212 |Bus |51 |\n", + "|26-61-j19-1 |8591046 |269.TA.26-61-j19-1.1.H |8591046|19:58:00 |19:58:00 |2 |Zürich, Aspholz |47.425085652811 |8.50058685490234|Zürich, Schwamendingerplatz|2076 |0 |19:57:00 |3 |2 |1003 |Bus |52 |\n", + "|26-703-j19-1 |8591825 |179.TA.26-703-j19-1.2.R |8591825|07:10:00 |07:10:00 |1 |Benglen, Bodenacher |47.3611288870976|8.63861299832652|Zürich, Klusplatz |9385 |1 |07:10:00 |4 |9 |580 |Bus |53 |\n", + "|26-703-j19-1 |8590504 |179.TA.26-703-j19-1.2.R |8590504|07:11:00 |07:11:00 |2 |Benglen, Gerlisbrunnen |47.3610862923255|8.63360938219328|Zürich, Klusplatz |9385 |1 |07:10:00 |4 |9 |861 |Bus |54 |\n", + "|26-703-j19-1 |8596005 |179.TA.26-703-j19-1.2.R |8596005|07:14:00 |07:14:00 |3 |Binz bei Maur, Twäracher |47.3608915729295|8.623476385787 |Zürich, Klusplatz |9385 |1 |07:10:00 |4 |9 |1366 |Bus |55 |\n", + "|26-703-j19-1 |8591832 |179.TA.26-703-j19-1.2.R |8591832|07:14:00 |07:14:00 |4 |Pfaffhausen, Müseren |47.3626987847054|8.61754750491098|Zürich, Klusplatz |9385 |1 |07:10:00 |4 |9 |1023 |Bus |56 |\n", + "|26-703-j19-1 |8591147 |179.TA.26-703-j19-1.2.R |8591147|07:16:00 |07:16:00 |5 |Zürich, Friedhof Witikon |47.3613418604422|8.60282411740221|Zürich, Klusplatz |9385 |1 |07:10:00 |4 |9 |1260 |Bus |57 |\n", + "|26-703-j19-1 |8591162 |179.TA.26-703-j19-1.2.R |8591162|07:17:00 |07:17:00 |6 |Zürich, Glockenacker |47.3609767627537|8.59930272148798|Zürich, Klusplatz |9385 |1 |07:10:00 |4 |9 |146 |Bus |58 |\n", + "|26-703-j19-1 |8591261 |179.TA.26-703-j19-1.2.R |8591261|07:18:00 |07:18:00 |7 |Zürich, Loorenstrasse |47.3598631991991|8.59452368417579|Zürich, Klusplatz |9385 |1 |07:10:00 |4 |9 |1197 |Bus |59 |\n", + "|26-703-j19-1 |8591107 |179.TA.26-703-j19-1.2.R |8591107|07:19:00 |07:19:00 |8 |Zürich, Carl-Spitteler-Strasse|47.3583236436636|8.58659156021591|Zürich, Klusplatz |9385 |1 |07:10:00 |4 |9 |1311 |Bus |60 |\n", + "|26-703-j19-1 
|8591233 |179.TA.26-703-j19-1.2.R |8591233|07:25:00 |07:25:00 |9 |Zürich, Klusplatz |47.3640374201824|8.56649624730736|Zürich, Klusplatz |9385 |1 |07:10:00 |4 |9 |1133 |Bus |61 |\n", + "|26-703-j19-1 |8591825 |171.TA.26-703-j19-1.2.R |8591825|07:12:00 |07:12:00 |1 |Benglen, Bodenacher |47.3611288870976|8.63861299832652|Zürich, Klusplatz |9346 |1 |07:12:00 |4 |9 |580 |Bus |62 |\n", + "|26-703-j19-1 |8590504 |171.TA.26-703-j19-1.2.R |8590504|07:13:00 |07:13:00 |2 |Benglen, Gerlisbrunnen |47.3610862923255|8.63360938219328|Zürich, Klusplatz |9346 |1 |07:12:00 |4 |9 |861 |Bus |63 |\n", + "|26-703-j19-1 |8596005 |171.TA.26-703-j19-1.2.R |8596005|07:16:00 |07:16:00 |3 |Binz bei Maur, Twäracher |47.3608915729295|8.623476385787 |Zürich, Klusplatz |9346 |1 |07:12:00 |4 |9 |1366 |Bus |64 |\n", + "|26-703-j19-1 |8591832 |171.TA.26-703-j19-1.2.R |8591832|07:16:00 |07:16:00 |4 |Pfaffhausen, Müseren |47.3626987847054|8.61754750491098|Zürich, Klusplatz |9346 |1 |07:12:00 |4 |9 |1023 |Bus |65 |\n", + "|26-703-j19-1 |8591147 |171.TA.26-703-j19-1.2.R |8591147|07:18:00 |07:18:00 |5 |Zürich, Friedhof Witikon |47.3613418604422|8.60282411740221|Zürich, Klusplatz |9346 |1 |07:12:00 |4 |9 |1260 |Bus |66 |\n", + "|26-703-j19-1 |8591162 |171.TA.26-703-j19-1.2.R |8591162|07:19:00 |07:19:00 |6 |Zürich, Glockenacker |47.3609767627537|8.59930272148798|Zürich, Klusplatz |9346 |1 |07:12:00 |4 |9 |146 |Bus |67 |\n", + "|26-703-j19-1 |8591261 |171.TA.26-703-j19-1.2.R |8591261|07:20:00 |07:20:00 |7 |Zürich, Loorenstrasse |47.3598631991991|8.59452368417579|Zürich, Klusplatz |9346 |1 |07:12:00 |4 |9 |1197 |Bus |68 |\n", + "|26-703-j19-1 |8591107 |171.TA.26-703-j19-1.2.R |8591107|07:21:00 |07:21:00 |8 |Zürich, Carl-Spitteler-Strasse|47.3583236436636|8.58659156021591|Zürich, Klusplatz |9346 |1 |07:12:00 |4 |9 |1311 |Bus |69 |\n", + "|26-703-j19-1 |8591233 |171.TA.26-703-j19-1.2.R |8591233|07:27:00 |07:27:00 |9 |Zürich, Klusplatz |47.3640374201824|8.56649624730736|Zürich, Klusplatz |9346 |1 |07:12:00 |4 
|9 |1133 |Bus |70 |\n", + "|26-703-j19-1 |8591825 |156.TA.26-703-j19-1.2.R |8591825|07:25:00 |07:25:00 |1 |Benglen, Bodenacher |47.3611288870976|8.63861299832652|Zürich, Klusplatz |9288 |1 |07:25:00 |4 |9 |580 |Bus |71 |\n", + "|26-703-j19-1 |8590504 |156.TA.26-703-j19-1.2.R |8590504|07:26:00 |07:26:00 |2 |Benglen, Gerlisbrunnen |47.3610862923255|8.63360938219328|Zürich, Klusplatz |9288 |1 |07:25:00 |4 |9 |861 |Bus |72 |\n", + "|26-703-j19-1 |8596005 |156.TA.26-703-j19-1.2.R |8596005|07:29:00 |07:29:00 |3 |Binz bei Maur, Twäracher |47.3608915729295|8.623476385787 |Zürich, Klusplatz |9288 |1 |07:25:00 |4 |9 |1366 |Bus |73 |\n", + "|26-703-j19-1 |8591832 |156.TA.26-703-j19-1.2.R |8591832|07:29:00 |07:29:00 |4 |Pfaffhausen, Müseren |47.3626987847054|8.61754750491098|Zürich, Klusplatz |9288 |1 |07:25:00 |4 |9 |1023 |Bus |74 |\n", + "|26-703-j19-1 |8591147 |156.TA.26-703-j19-1.2.R |8591147|07:31:00 |07:31:00 |5 |Zürich, Friedhof Witikon |47.3613418604422|8.60282411740221|Zürich, Klusplatz |9288 |1 |07:25:00 |4 |9 |1260 |Bus |75 |\n", + "|26-703-j19-1 |8591162 |156.TA.26-703-j19-1.2.R |8591162|07:32:00 |07:32:00 |6 |Zürich, Glockenacker |47.3609767627537|8.59930272148798|Zürich, Klusplatz |9288 |1 |07:25:00 |4 |9 |146 |Bus |76 |\n", + "|26-703-j19-1 |8591261 |156.TA.26-703-j19-1.2.R |8591261|07:33:00 |07:33:00 |7 |Zürich, Loorenstrasse |47.3598631991991|8.59452368417579|Zürich, Klusplatz |9288 |1 |07:25:00 |4 |9 |1197 |Bus |77 |\n", + "|26-703-j19-1 |8591107 |156.TA.26-703-j19-1.2.R |8591107|07:34:00 |07:34:00 |8 |Zürich, Carl-Spitteler-Strasse|47.3583236436636|8.58659156021591|Zürich, Klusplatz |9288 |1 |07:25:00 |4 |9 |1311 |Bus |78 |\n", + "|26-703-j19-1 |8591233 |156.TA.26-703-j19-1.2.R |8591233|07:40:00 |07:40:00 |9 |Zürich, Klusplatz |47.3640374201824|8.56649624730736|Zürich, Klusplatz |9288 |1 |07:25:00 |4 |9 |1133 |Bus |79 |\n", + "|26-703-j19-1 |8591825 |144.TA.26-703-j19-1.2.R |8591825|07:27:00 |07:27:00 |1 |Benglen, Bodenacher 
|47.3611288870976|8.63861299832652|Zürich, Klusplatz |9231 |1 |07:27:00 |4 |9 |580 |Bus |80 |\n", + "|26-703-j19-1 |8590504 |144.TA.26-703-j19-1.2.R |8590504|07:28:00 |07:28:00 |2 |Benglen, Gerlisbrunnen |47.3610862923255|8.63360938219328|Zürich, Klusplatz |9231 |1 |07:27:00 |4 |9 |861 |Bus |81 |\n", + "|26-703-j19-1 |8596005 |144.TA.26-703-j19-1.2.R |8596005|07:31:00 |07:31:00 |3 |Binz bei Maur, Twäracher |47.3608915729295|8.623476385787 |Zürich, Klusplatz |9231 |1 |07:27:00 |4 |9 |1366 |Bus |82 |\n", + "|26-703-j19-1 |8591832 |144.TA.26-703-j19-1.2.R |8591832|07:31:00 |07:31:00 |4 |Pfaffhausen, Müseren |47.3626987847054|8.61754750491098|Zürich, Klusplatz |9231 |1 |07:27:00 |4 |9 |1023 |Bus |83 |\n", + "|26-703-j19-1 |8591147 |144.TA.26-703-j19-1.2.R |8591147|07:33:00 |07:33:00 |5 |Zürich, Friedhof Witikon |47.3613418604422|8.60282411740221|Zürich, Klusplatz |9231 |1 |07:27:00 |4 |9 |1260 |Bus |84 |\n", + "|26-703-j19-1 |8591162 |144.TA.26-703-j19-1.2.R |8591162|07:34:00 |07:34:00 |6 |Zürich, Glockenacker |47.3609767627537|8.59930272148798|Zürich, Klusplatz |9231 |1 |07:27:00 |4 |9 |146 |Bus |85 |\n", + "|26-703-j19-1 |8591261 |144.TA.26-703-j19-1.2.R |8591261|07:35:00 |07:35:00 |7 |Zürich, Loorenstrasse |47.3598631991991|8.59452368417579|Zürich, Klusplatz |9231 |1 |07:27:00 |4 |9 |1197 |Bus |86 |\n", + "|26-703-j19-1 |8591107 |144.TA.26-703-j19-1.2.R |8591107|07:36:00 |07:36:00 |8 |Zürich, Carl-Spitteler-Strasse|47.3583236436636|8.58659156021591|Zürich, Klusplatz |9231 |1 |07:27:00 |4 |9 |1311 |Bus |87 |\n", + "|26-703-j19-1 |8591233 |144.TA.26-703-j19-1.2.R |8591233|07:42:00 |07:42:00 |9 |Zürich, Klusplatz |47.3640374201824|8.56649624730736|Zürich, Klusplatz |9231 |1 |07:27:00 |4 |9 |1133 |Bus |88 |\n", + "|26-703-j19-1 |8591825 |120.TA.26-703-j19-1.2.R |8591825|07:40:00 |07:40:00 |1 |Benglen, Bodenacher |47.3611288870976|8.63861299832652|Zürich, Klusplatz |9159 |1 |07:40:00 |4 |9 |580 |Bus |89 |\n", + "|26-703-j19-1 |8590504 |120.TA.26-703-j19-1.2.R 
|8590504|07:41:00 |07:41:00 |2 |Benglen, Gerlisbrunnen |47.3610862923255|8.63360938219328|Zürich, Klusplatz |9159 |1 |07:40:00 |4 |9 |861 |Bus |90 |\n", + "|26-703-j19-1 |8596005 |120.TA.26-703-j19-1.2.R |8596005|07:44:00 |07:44:00 |3 |Binz bei Maur, Twäracher |47.3608915729295|8.623476385787 |Zürich, Klusplatz |9159 |1 |07:40:00 |4 |9 |1366 |Bus |91 |\n", + "|26-703-j19-1 |8591832 |120.TA.26-703-j19-1.2.R |8591832|07:44:00 |07:44:00 |4 |Pfaffhausen, Müseren |47.3626987847054|8.61754750491098|Zürich, Klusplatz |9159 |1 |07:40:00 |4 |9 |1023 |Bus |92 |\n", + "|26-703-j19-1 |8591147 |120.TA.26-703-j19-1.2.R |8591147|07:46:00 |07:46:00 |5 |Zürich, Friedhof Witikon |47.3613418604422|8.60282411740221|Zürich, Klusplatz |9159 |1 |07:40:00 |4 |9 |1260 |Bus |93 |\n", + "|26-703-j19-1 |8591162 |120.TA.26-703-j19-1.2.R |8591162|07:47:00 |07:47:00 |6 |Zürich, Glockenacker |47.3609767627537|8.59930272148798|Zürich, Klusplatz |9159 |1 |07:40:00 |4 |9 |146 |Bus |94 |\n", + "|26-703-j19-1 |8591261 |120.TA.26-703-j19-1.2.R |8591261|07:48:00 |07:48:00 |7 |Zürich, Loorenstrasse |47.3598631991991|8.59452368417579|Zürich, Klusplatz |9159 |1 |07:40:00 |4 |9 |1197 |Bus |95 |\n", + "|26-703-j19-1 |8591107 |120.TA.26-703-j19-1.2.R |8591107|07:49:00 |07:49:00 |8 |Zürich, Carl-Spitteler-Strasse|47.3583236436636|8.58659156021591|Zürich, Klusplatz |9159 |1 |07:40:00 |4 |9 |1311 |Bus |96 |\n", + "|26-703-j19-1 |8591233 |120.TA.26-703-j19-1.2.R |8591233|07:55:00 |07:55:00 |9 |Zürich, Klusplatz |47.3640374201824|8.56649624730736|Zürich, Klusplatz |9159 |1 |07:40:00 |4 |9 |1133 |Bus |97 |\n", + "|26-703-j19-1 |8591825 |95.TA.26-703-j19-1.2.R |8591825|07:42:00 |07:42:00 |1 |Benglen, Bodenacher |47.3611288870976|8.63861299832652|Zürich, Klusplatz |9119 |1 |07:42:00 |4 |9 |580 |Bus |98 |\n", + "|26-703-j19-1 |8590504 |95.TA.26-703-j19-1.2.R |8590504|07:43:00 |07:43:00 |2 |Benglen, Gerlisbrunnen |47.3610862923255|8.63360938219328|Zürich, Klusplatz |9119 |1 |07:42:00 |4 |9 |861 |Bus |99 |\n", + 
"+-------------+---------------+------------------------+-------+------------+--------------+-------------+------------------------------+----------------+----------------+---------------------------+---------------+------------+--------------------+---------+----------+--------+----------+---------------------------+\n", + "only showing top 100 rows" + ] + } + ], + "source": [ + "from pyspark.sql.types import LongType\n", + "stop_times = spark.read.csv('data/lgpt_guys/stop_times_final_cyril.csv', header=True)\n", + "stop_times = stop_times.sort(stop_times.monotonically_increasing_id.cast(LongType()))\n", + "\n", + "stop_times.show(100, 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------------+---------------+-----------------------+-------+------------+--------------+-------------+------------------------------+----------------+----------------+--------------------------+---------------+------------+--------------------+---------+----------+--------+----------+---------------------------+\n", + "|route_id |stop_id_general|trip_id |stop_id|arrival_time|departure_time|stop_sequence|stop_name |stop_lat |stop_lon |trip_headsign |trip_short_name|direction_id|departure_first_stop|route_int|stop_count|stop_int|route_desc|monotonically_increasing_id|\n", + 
"+------------+---------------+-----------------------+-------+------------+--------------+-------------+------------------------------+----------------+----------------+--------------------------+---------------+------------+--------------------+---------+----------+--------+----------+---------------------------+\n", + "|26-660-j19-1|8576167 |486.TA.26-660-j19-1.9.R|8576167|07:16:00 |07:16:00 |1 |Nürensdorf, Chrüzstrass |47.4542439013472|8.63462447846447|Winterthur, Archstrasse/HB|2268 |1 |07:16:00 |500 |8 |1200 |Bus |532575945867 |\n", + "|26-660-j19-1|8576168 |486.TA.26-660-j19-1.9.R|8576168|07:17:00 |07:17:00 |2 |Birchwil (Nürensdorf) |47.4577182388675|8.63876571192484|Winterthur, Archstrasse/HB|2268 |1 |07:16:00 |500 |8 |646 |Bus |532575945868 |\n", + "|26-660-j19-1|8576169 |486.TA.26-660-j19-1.9.R|8576169|07:19:00 |07:19:00 |3 |Nürensdorf, Oberwil |47.4641014319557|8.63995148810004|Winterthur, Archstrasse/HB|2268 |1 |07:16:00 |500 |8 |865 |Bus |532575945869 |\n", + "|26-660-j19-1|8576172 |486.TA.26-660-j19-1.9.R|8576172|07:22:00 |07:22:00 |4 |Breite b. 
N'dorf,Grünenwaldstr|47.461927224864 |8.66128647610085|Winterthur, Archstrasse/HB|2268 |1 |07:16:00 |500 |8 |45 |Bus |532575945870 |\n", + "|26-660-j19-1|8576174 |486.TA.26-660-j19-1.9.R|8576174|07:24:00 |07:24:00 |5 |Brütten, Hofacher |47.470969635094 |8.67059302244562|Winterthur, Archstrasse/HB|2268 |1 |07:16:00 |500 |8 |94 |Bus |532575945871 |\n", + "|26-660-j19-1|8506960 |486.TA.26-660-j19-1.9.R|8506960|07:26:00 |07:26:00 |6 |Brütten, Zentrum |47.472038346399 |8.67648597071027|Winterthur, Archstrasse/HB|2268 |1 |07:16:00 |500 |8 |1060 |Bus |532575945872 |\n", + "|26-660-j19-1|8576176 |486.TA.26-660-j19-1.9.R|8576176|07:27:00 |07:27:00 |7 |Brütten, Harossen |47.4714675601105|8.68249569996186|Winterthur, Archstrasse/HB|2268 |1 |07:16:00 |500 |8 |778 |Bus |532575945873 |\n", + "|26-660-j19-1|8591835 |486.TA.26-660-j19-1.9.R|8591835|07:27:00 |07:27:00 |8 |Brütten, Steighof |47.469621570521 |8.68646625351823|Winterthur, Archstrasse/HB|2268 |1 |07:16:00 |500 |8 |1128 |Bus |532575945874 |\n", + "+------------+---------------+-----------------------+-------+------------+--------------+-------------+------------------------------+----------------+----------------+--------------------------+---------------+------------+--------------------+---------+----------+--------+----------+---------------------------+" + ] + } + ], + "source": [ + "stop_times.where(stop_times.route_int==500).show(100, 0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Validated on sbb.ch" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"+-----------+---------------+---------------------+-------+------------+--------------+-------------+--------------------------+----------------+----------------+-----------------+---------------+------------+--------------------+---------+----------+--------+----------+---------------------------+\n", + "|route_id |stop_id_general|trip_id |stop_id|arrival_time|departure_time|stop_sequence|stop_name |stop_lat |stop_lon |trip_headsign |trip_short_name|direction_id|departure_first_stop|route_int|stop_count|stop_int|route_desc|monotonically_increasing_id|\n", + "+-----------+---------------+---------------------+-------+------------+--------------+-------------+--------------------------+----------------+----------------+-----------------+---------------+------------+--------------------+---------+----------+--------+----------+---------------------------+\n", + "|26-46-j19-1|8591328 |55.TA.26-46-j19-1.3.H|8591328|08:35:00 |08:35:00 |1 |Zürich, Rütihof |47.413451870606 |8.47731150588756|Zürich, Hardplatz|1106 |0 |08:35:00 |800 |17 |881 |Bus |970662609829 |\n", + "|26-46-j19-1|8591155 |55.TA.26-46-j19-1.3.H|8591155|08:36:00 |08:36:00 |2 |Zürich, Geeringstrasse |47.4144427264912|8.48043764307674|Zürich, Hardplatz|1106 |0 |08:35:00 |800 |17 |187 |Bus |970662609830 |\n", + "|26-46-j19-1|8576241 |55.TA.26-46-j19-1.3.H|8576241|08:38:00 |08:38:00 |3 |Zürich, Heizenholz |47.4122968616864|8.48390514007392|Zürich, Hardplatz|1106 |0 |08:35:00 |800 |17 |57 |Bus |970662609831 |\n", + "|26-46-j19-1|8591158 |55.TA.26-46-j19-1.3.H|8591158|08:38:00 |08:38:00 |4 |Zürich, Giblenstrasse |47.4107284405996|8.485953298922 |Zürich, Hardplatz|1106 |0 |08:35:00 |800 |17 |704 |Bus |970662609832 |\n", + "|26-46-j19-1|8591358 |55.TA.26-46-j19-1.3.H|8591358|08:40:00 |08:40:00 |5 |Zürich, Segantinistrasse |47.4074455475966|8.48996876824257|Zürich, Hardplatz|1106 |0 |08:35:00 |800 |17 |1025 |Bus |970662609833 |\n", + "|26-46-j19-1|8591371 |55.TA.26-46-j19-1.3.H|8591371|08:40:00 |08:40:00 |6 
|Zürich, Singlistrasse |47.4051109214132|8.49349016415681|Zürich, Hardplatz|1106 |0 |08:35:00 |800 |17 |879 |Bus |970662609834 |\n", + "|26-46-j19-1|8591431 |55.TA.26-46-j19-1.3.H|8591431|08:41:00 |08:41:00 |7 |Zürich, Wieslergasse |47.4040651698812|8.49596053118848|Zürich, Hardplatz|1106 |0 |08:35:00 |800 |17 |583 |Bus |970662609835 |\n", + "|26-46-j19-1|8576240 |55.TA.26-46-j19-1.3.H|8576240|08:43:00 |08:43:00 |8 |Zürich, Meierhofplatz |47.4020100860391|8.49937412926861|Zürich, Hardplatz|1106 |0 |08:35:00 |800 |17 |1221 |Bus |970662609836 |\n", + "|26-46-j19-1|8591353 |55.TA.26-46-j19-1.3.H|8591353|08:44:00 |08:44:00 |9 |Zürich, Schwert |47.3997299435837|8.50461130737576|Zürich, Hardplatz|1106 |0 |08:35:00 |800 |17 |816 |Bus |970662609837 |\n", + "|26-46-j19-1|8591226 |55.TA.26-46-j19-1.3.H|8591226|08:45:00 |08:45:00 |10 |Zürich, Kempfhofsteig |47.3973037636525|8.51002814853975|Zürich, Hardplatz|1106 |0 |08:35:00 |800 |17 |736 |Bus |970662609838 |\n", + "|26-46-j19-1|8591312 |55.TA.26-46-j19-1.3.H|8591312|08:46:00 |08:46:00 |11 |Zürich, Rebbergsteig |47.396902429888 |8.5149688826031 |Zürich, Hardplatz|1106 |0 |08:35:00 |800 |17 |762 |Bus |970662609839 |\n", + "|26-46-j19-1|8591247 |55.TA.26-46-j19-1.3.H|8591247|08:47:00 |08:47:00 |12 |Zürich, Lehenstrasse |47.3962335334876|8.52015216179319|Zürich, Hardplatz|1106 |0 |08:35:00 |800 |17 |1206 |Bus |970662609840 |\n", + "|26-46-j19-1|8591323 |55.TA.26-46-j19-1.3.H|8591323|08:48:00 |08:48:00 |13 |Zürich, Rosengartenstrasse|47.394330172729 |8.52546120512307|Zürich, Hardplatz|1106 |0 |08:35:00 |800 |17 |1039 |Bus |970662609841 |\n", + "|26-46-j19-1|8580522 |55.TA.26-46-j19-1.3.H|8580522|08:49:00 |08:49:00 |14 |Zürich, Escher-Wyss-Platz |47.3907969150758|8.5223979500038 |Zürich, Hardplatz|1106 |0 |08:35:00 |800 |17 |455 |Bus |970662609842 |\n", + "|26-46-j19-1|8594239 |55.TA.26-46-j19-1.3.H|8594239|08:50:00 |08:50:00 |15 |Zürich, Schiffbau |47.3875735990751|8.51944249271863|Zürich, Hardplatz|1106 |0 |08:35:00 |800 |17 
|832 |Bus |970662609843 |\n", + "|26-46-j19-1|8591060 |55.TA.26-46-j19-1.3.H|8591060|08:52:00 |08:52:00 |16 |Zürich Hardbrücke, Bahnhof|47.3849339821896|8.51703500775686|Zürich, Hardplatz|1106 |0 |08:35:00 |800 |17 |850 |Bus |970662609844 |\n", + "|26-46-j19-1|8591177 |55.TA.26-46-j19-1.3.H|8591177|08:52:00 |08:52:00 |17 |Zürich, Hardplatz |47.3823428932854|8.51452870811382|Zürich, Hardplatz|1106 |0 |08:35:00 |800 |17 |266 |Bus |970662609845 |\n", + "|26-46-j19-1|8591328 |54.TA.26-46-j19-1.3.H|8591328|18:35:00 |18:35:00 |1 |Zürich, Rütihof |47.413451870606 |8.47731150588756|Zürich, Hardplatz|531 |0 |18:35:00 |800 |17 |881 |Bus |970662609846 |\n", + "|26-46-j19-1|8591155 |54.TA.26-46-j19-1.3.H|8591155|18:36:00 |18:36:00 |2 |Zürich, Geeringstrasse |47.4144427264912|8.48043764307674|Zürich, Hardplatz|531 |0 |18:35:00 |800 |17 |187 |Bus |970662609847 |\n", + "|26-46-j19-1|8576241 |54.TA.26-46-j19-1.3.H|8576241|18:37:00 |18:37:00 |3 |Zürich, Heizenholz |47.4122968616864|8.48390514007392|Zürich, Hardplatz|531 |0 |18:35:00 |800 |17 |57 |Bus |970662609848 |\n", + "|26-46-j19-1|8591158 |54.TA.26-46-j19-1.3.H|8591158|18:38:00 |18:38:00 |4 |Zürich, Giblenstrasse |47.4107284405996|8.485953298922 |Zürich, Hardplatz|531 |0 |18:35:00 |800 |17 |704 |Bus |970662609849 |\n", + "|26-46-j19-1|8591358 |54.TA.26-46-j19-1.3.H|8591358|18:39:00 |18:39:00 |5 |Zürich, Segantinistrasse |47.4074455475966|8.48996876824257|Zürich, Hardplatz|531 |0 |18:35:00 |800 |17 |1025 |Bus |970662609850 |\n", + "|26-46-j19-1|8591371 |54.TA.26-46-j19-1.3.H|8591371|18:40:00 |18:40:00 |6 |Zürich, Singlistrasse |47.4051109214132|8.49349016415681|Zürich, Hardplatz|531 |0 |18:35:00 |800 |17 |879 |Bus |970662609851 |\n", + "|26-46-j19-1|8591431 |54.TA.26-46-j19-1.3.H|8591431|18:41:00 |18:41:00 |7 |Zürich, Wieslergasse |47.4040651698812|8.49596053118848|Zürich, Hardplatz|531 |0 |18:35:00 |800 |17 |583 |Bus |970662609852 |\n", + "|26-46-j19-1|8576240 |54.TA.26-46-j19-1.3.H|8576240|18:42:00 |18:42:00 |8 |Zürich, 
Meierhofplatz |47.4020100860391|8.49937412926861|Zürich, Hardplatz|531 |0 |18:35:00 |800 |17 |1221 |Bus |970662609853 |\n", + "|26-46-j19-1|8591353 |54.TA.26-46-j19-1.3.H|8591353|18:43:00 |18:43:00 |9 |Zürich, Schwert |47.3997299435837|8.50461130737576|Zürich, Hardplatz|531 |0 |18:35:00 |800 |17 |816 |Bus |970662609854 |\n", + "|26-46-j19-1|8591226 |54.TA.26-46-j19-1.3.H|8591226|18:45:00 |18:45:00 |10 |Zürich, Kempfhofsteig |47.3973037636525|8.51002814853975|Zürich, Hardplatz|531 |0 |18:35:00 |800 |17 |736 |Bus |970662609855 |\n", + "|26-46-j19-1|8591312 |54.TA.26-46-j19-1.3.H|8591312|18:45:00 |18:45:00 |11 |Zürich, Rebbergsteig |47.396902429888 |8.5149688826031 |Zürich, Hardplatz|531 |0 |18:35:00 |800 |17 |762 |Bus |970662609856 |\n", + "|26-46-j19-1|8591247 |54.TA.26-46-j19-1.3.H|8591247|18:47:00 |18:47:00 |12 |Zürich, Lehenstrasse |47.3962335334876|8.52015216179319|Zürich, Hardplatz|531 |0 |18:35:00 |800 |17 |1206 |Bus |970662609857 |\n", + "|26-46-j19-1|8591323 |54.TA.26-46-j19-1.3.H|8591323|18:48:00 |18:48:00 |13 |Zürich, Rosengartenstrasse|47.394330172729 |8.52546120512307|Zürich, Hardplatz|531 |0 |18:35:00 |800 |17 |1039 |Bus |970662609858 |\n", + "|26-46-j19-1|8580522 |54.TA.26-46-j19-1.3.H|8580522|18:49:00 |18:49:00 |14 |Zürich, Escher-Wyss-Platz |47.3907969150758|8.5223979500038 |Zürich, Hardplatz|531 |0 |18:35:00 |800 |17 |455 |Bus |970662609859 |\n", + "|26-46-j19-1|8594239 |54.TA.26-46-j19-1.3.H|8594239|18:50:00 |18:50:00 |15 |Zürich, Schiffbau |47.3875735990751|8.51944249271863|Zürich, Hardplatz|531 |0 |18:35:00 |800 |17 |832 |Bus |970662609860 |\n", + "|26-46-j19-1|8591060 |54.TA.26-46-j19-1.3.H|8591060|18:51:00 |18:51:00 |16 |Zürich Hardbrücke, Bahnhof|47.3849339821896|8.51703500775686|Zürich, Hardplatz|531 |0 |18:35:00 |800 |17 |850 |Bus |970662609861 |\n", + "|26-46-j19-1|8591177 |54.TA.26-46-j19-1.3.H|8591177|18:52:00 |18:52:00 |17 |Zürich, Hardplatz |47.3823428932854|8.51452870811382|Zürich, Hardplatz|531 |0 |18:35:00 |800 |17 |266 |Bus 
|970662609862 |\n", + "|26-46-j19-1|8591328 |53.TA.26-46-j19-1.3.H|8591328|18:42:00 |18:42:00 |1 |Zürich, Rütihof |47.413451870606 |8.47731150588756|Zürich, Hardplatz|525 |0 |18:42:00 |800 |17 |881 |Bus |970662609863 |\n", + "|26-46-j19-1|8591155 |53.TA.26-46-j19-1.3.H|8591155|18:43:00 |18:43:00 |2 |Zürich, Geeringstrasse |47.4144427264912|8.48043764307674|Zürich, Hardplatz|525 |0 |18:42:00 |800 |17 |187 |Bus |970662609864 |\n", + "|26-46-j19-1|8576241 |53.TA.26-46-j19-1.3.H|8576241|18:45:00 |18:45:00 |3 |Zürich, Heizenholz |47.4122968616864|8.48390514007392|Zürich, Hardplatz|525 |0 |18:42:00 |800 |17 |57 |Bus |970662609865 |\n", + "|26-46-j19-1|8591158 |53.TA.26-46-j19-1.3.H|8591158|18:45:00 |18:45:00 |4 |Zürich, Giblenstrasse |47.4107284405996|8.485953298922 |Zürich, Hardplatz|525 |0 |18:42:00 |800 |17 |704 |Bus |970662609866 |\n", + "|26-46-j19-1|8591358 |53.TA.26-46-j19-1.3.H|8591358|18:47:00 |18:47:00 |5 |Zürich, Segantinistrasse |47.4074455475966|8.48996876824257|Zürich, Hardplatz|525 |0 |18:42:00 |800 |17 |1025 |Bus |970662609867 |\n", + "|26-46-j19-1|8591371 |53.TA.26-46-j19-1.3.H|8591371|18:47:00 |18:47:00 |6 |Zürich, Singlistrasse |47.4051109214132|8.49349016415681|Zürich, Hardplatz|525 |0 |18:42:00 |800 |17 |879 |Bus |970662609868 |\n", + "|26-46-j19-1|8591431 |53.TA.26-46-j19-1.3.H|8591431|18:48:00 |18:48:00 |7 |Zürich, Wieslergasse |47.4040651698812|8.49596053118848|Zürich, Hardplatz|525 |0 |18:42:00 |800 |17 |583 |Bus |970662609869 |\n", + "|26-46-j19-1|8576240 |53.TA.26-46-j19-1.3.H|8576240|18:50:00 |18:50:00 |8 |Zürich, Meierhofplatz |47.4020100860391|8.49937412926861|Zürich, Hardplatz|525 |0 |18:42:00 |800 |17 |1221 |Bus |970662609870 |\n", + "|26-46-j19-1|8591353 |53.TA.26-46-j19-1.3.H|8591353|18:51:00 |18:51:00 |9 |Zürich, Schwert |47.3997299435837|8.50461130737576|Zürich, Hardplatz|525 |0 |18:42:00 |800 |17 |816 |Bus |970662609871 |\n", + "|26-46-j19-1|8591226 |53.TA.26-46-j19-1.3.H|8591226|18:52:00 |18:52:00 |10 |Zürich, Kempfhofsteig 
|47.3973037636525|8.51002814853975|Zürich, Hardplatz|525 |0 |18:42:00 |800 |17 |736 |Bus |970662609872 |\n", + "|26-46-j19-1|8591312 |53.TA.26-46-j19-1.3.H|8591312|18:53:00 |18:53:00 |11 |Zürich, Rebbergsteig |47.396902429888 |8.5149688826031 |Zürich, Hardplatz|525 |0 |18:42:00 |800 |17 |762 |Bus |970662609873 |\n", + "|26-46-j19-1|8591247 |53.TA.26-46-j19-1.3.H|8591247|18:54:00 |18:54:00 |12 |Zürich, Lehenstrasse |47.3962335334876|8.52015216179319|Zürich, Hardplatz|525 |0 |18:42:00 |800 |17 |1206 |Bus |970662609874 |\n", + "|26-46-j19-1|8591323 |53.TA.26-46-j19-1.3.H|8591323|18:55:00 |18:55:00 |13 |Zürich, Rosengartenstrasse|47.394330172729 |8.52546120512307|Zürich, Hardplatz|525 |0 |18:42:00 |800 |17 |1039 |Bus |970662609875 |\n", + "|26-46-j19-1|8580522 |53.TA.26-46-j19-1.3.H|8580522|18:56:00 |18:56:00 |14 |Zürich, Escher-Wyss-Platz |47.3907969150758|8.5223979500038 |Zürich, Hardplatz|525 |0 |18:42:00 |800 |17 |455 |Bus |970662609876 |\n", + "|26-46-j19-1|8594239 |53.TA.26-46-j19-1.3.H|8594239|18:57:00 |18:57:00 |15 |Zürich, Schiffbau |47.3875735990751|8.51944249271863|Zürich, Hardplatz|525 |0 |18:42:00 |800 |17 |832 |Bus |970662609877 |\n", + "|26-46-j19-1|8591060 |53.TA.26-46-j19-1.3.H|8591060|18:59:00 |18:59:00 |16 |Zürich Hardbrücke, Bahnhof|47.3849339821896|8.51703500775686|Zürich, Hardplatz|525 |0 |18:42:00 |800 |17 |850 |Bus |970662609878 |\n", + "|26-46-j19-1|8591177 |53.TA.26-46-j19-1.3.H|8591177|18:59:00 |18:59:00 |17 |Zürich, Hardplatz |47.3823428932854|8.51452870811382|Zürich, Hardplatz|525 |0 |18:42:00 |800 |17 |266 |Bus |970662609879 |\n", + "|26-46-j19-1|8591328 |52.TA.26-46-j19-1.3.H|8591328|18:57:00 |18:57:00 |1 |Zürich, Rütihof |47.413451870606 |8.47731150588756|Zürich, Hardplatz|514 |0 |18:57:00 |800 |17 |881 |Bus |970662609880 |\n", + "|26-46-j19-1|8591155 |52.TA.26-46-j19-1.3.H|8591155|18:58:00 |18:58:00 |2 |Zürich, Geeringstrasse |47.4144427264912|8.48043764307674|Zürich, Hardplatz|514 |0 |18:57:00 |800 |17 |187 |Bus |970662609881 |\n", + 
"|26-46-j19-1|8576241 |52.TA.26-46-j19-1.3.H|8576241|19:00:00 |19:00:00 |3 |Zürich, Heizenholz |47.4122968616864|8.48390514007392|Zürich, Hardplatz|514 |0 |18:57:00 |800 |17 |57 |Bus |970662609882 |\n", + "|26-46-j19-1|8591158 |52.TA.26-46-j19-1.3.H|8591158|19:00:00 |19:00:00 |4 |Zürich, Giblenstrasse |47.4107284405996|8.485953298922 |Zürich, Hardplatz|514 |0 |18:57:00 |800 |17 |704 |Bus |970662609883 |\n", + "|26-46-j19-1|8591358 |52.TA.26-46-j19-1.3.H|8591358|19:02:00 |19:02:00 |5 |Zürich, Segantinistrasse |47.4074455475966|8.48996876824257|Zürich, Hardplatz|514 |0 |18:57:00 |800 |17 |1025 |Bus |970662609884 |\n", + "|26-46-j19-1|8591371 |52.TA.26-46-j19-1.3.H|8591371|19:02:00 |19:02:00 |6 |Zürich, Singlistrasse |47.4051109214132|8.49349016415681|Zürich, Hardplatz|514 |0 |18:57:00 |800 |17 |879 |Bus |970662609885 |\n", + "|26-46-j19-1|8591431 |52.TA.26-46-j19-1.3.H|8591431|19:03:00 |19:03:00 |7 |Zürich, Wieslergasse |47.4040651698812|8.49596053118848|Zürich, Hardplatz|514 |0 |18:57:00 |800 |17 |583 |Bus |970662609886 |\n", + "|26-46-j19-1|8576240 |52.TA.26-46-j19-1.3.H|8576240|19:05:00 |19:05:00 |8 |Zürich, Meierhofplatz |47.4020100860391|8.49937412926861|Zürich, Hardplatz|514 |0 |18:57:00 |800 |17 |1221 |Bus |970662609887 |\n", + "|26-46-j19-1|8591353 |52.TA.26-46-j19-1.3.H|8591353|19:06:00 |19:06:00 |9 |Zürich, Schwert |47.3997299435837|8.50461130737576|Zürich, Hardplatz|514 |0 |18:57:00 |800 |17 |816 |Bus |970662609888 |\n", + "|26-46-j19-1|8591226 |52.TA.26-46-j19-1.3.H|8591226|19:07:00 |19:07:00 |10 |Zürich, Kempfhofsteig |47.3973037636525|8.51002814853975|Zürich, Hardplatz|514 |0 |18:57:00 |800 |17 |736 |Bus |970662609889 |\n", + "|26-46-j19-1|8591312 |52.TA.26-46-j19-1.3.H|8591312|19:08:00 |19:08:00 |11 |Zürich, Rebbergsteig |47.396902429888 |8.5149688826031 |Zürich, Hardplatz|514 |0 |18:57:00 |800 |17 |762 |Bus |970662609890 |\n", + "|26-46-j19-1|8591247 |52.TA.26-46-j19-1.3.H|8591247|19:09:00 |19:09:00 |12 |Zürich, Lehenstrasse 
|47.3962335334876|8.52015216179319|Zürich, Hardplatz|514 |0 |18:57:00 |800 |17 |1206 |Bus |970662609891 |\n", + "|26-46-j19-1|8591323 |52.TA.26-46-j19-1.3.H|8591323|19:10:00 |19:10:00 |13 |Zürich, Rosengartenstrasse|47.394330172729 |8.52546120512307|Zürich, Hardplatz|514 |0 |18:57:00 |800 |17 |1039 |Bus |970662609892 |\n", + "|26-46-j19-1|8580522 |52.TA.26-46-j19-1.3.H|8580522|19:11:00 |19:11:00 |14 |Zürich, Escher-Wyss-Platz |47.3907969150758|8.5223979500038 |Zürich, Hardplatz|514 |0 |18:57:00 |800 |17 |455 |Bus |970662609893 |\n", + "|26-46-j19-1|8594239 |52.TA.26-46-j19-1.3.H|8594239|19:12:00 |19:12:00 |15 |Zürich, Schiffbau |47.3875735990751|8.51944249271863|Zürich, Hardplatz|514 |0 |18:57:00 |800 |17 |832 |Bus |970662609894 |\n", + "|26-46-j19-1|8591060 |52.TA.26-46-j19-1.3.H|8591060|19:14:00 |19:14:00 |16 |Zürich Hardbrücke, Bahnhof|47.3849339821896|8.51703500775686|Zürich, Hardplatz|514 |0 |18:57:00 |800 |17 |850 |Bus |970662609895 |\n", + "|26-46-j19-1|8591177 |52.TA.26-46-j19-1.3.H|8591177|19:14:00 |19:14:00 |17 |Zürich, Hardplatz |47.3823428932854|8.51452870811382|Zürich, Hardplatz|514 |0 |18:57:00 |800 |17 |266 |Bus |970662609896 |\n", + "|26-46-j19-1|8591328 |51.TA.26-46-j19-1.3.H|8591328|19:05:00 |19:05:00 |1 |Zürich, Rütihof |47.413451870606 |8.47731150588756|Zürich, Hardplatz|506 |0 |19:05:00 |800 |17 |881 |Bus |970662609897 |\n", + "|26-46-j19-1|8591155 |51.TA.26-46-j19-1.3.H|8591155|19:06:00 |19:06:00 |2 |Zürich, Geeringstrasse |47.4144427264912|8.48043764307674|Zürich, Hardplatz|506 |0 |19:05:00 |800 |17 |187 |Bus |970662609898 |\n", + "|26-46-j19-1|8576241 |51.TA.26-46-j19-1.3.H|8576241|19:07:00 |19:07:00 |3 |Zürich, Heizenholz |47.4122968616864|8.48390514007392|Zürich, Hardplatz|506 |0 |19:05:00 |800 |17 |57 |Bus |970662609899 |\n", + "|26-46-j19-1|8591158 |51.TA.26-46-j19-1.3.H|8591158|19:08:00 |19:08:00 |4 |Zürich, Giblenstrasse |47.4107284405996|8.485953298922 |Zürich, Hardplatz|506 |0 |19:05:00 |800 |17 |704 |Bus |970662609900 |\n", + 
"|26-46-j19-1|8591358 |51.TA.26-46-j19-1.3.H|8591358|19:09:00 |19:09:00 |5 |Zürich, Segantinistrasse |47.4074455475966|8.48996876824257|Zürich, Hardplatz|506 |0 |19:05:00 |800 |17 |1025 |Bus |970662609901 |\n", + "|26-46-j19-1|8591371 |51.TA.26-46-j19-1.3.H|8591371|19:10:00 |19:10:00 |6 |Zürich, Singlistrasse |47.4051109214132|8.49349016415681|Zürich, Hardplatz|506 |0 |19:05:00 |800 |17 |879 |Bus |970662609902 |\n", + "|26-46-j19-1|8591431 |51.TA.26-46-j19-1.3.H|8591431|19:11:00 |19:11:00 |7 |Zürich, Wieslergasse |47.4040651698812|8.49596053118848|Zürich, Hardplatz|506 |0 |19:05:00 |800 |17 |583 |Bus |970662609903 |\n", + "|26-46-j19-1|8576240 |51.TA.26-46-j19-1.3.H|8576240|19:12:00 |19:12:00 |8 |Zürich, Meierhofplatz |47.4020100860391|8.49937412926861|Zürich, Hardplatz|506 |0 |19:05:00 |800 |17 |1221 |Bus |970662609904 |\n", + "|26-46-j19-1|8591353 |51.TA.26-46-j19-1.3.H|8591353|19:13:00 |19:13:00 |9 |Zürich, Schwert |47.3997299435837|8.50461130737576|Zürich, Hardplatz|506 |0 |19:05:00 |800 |17 |816 |Bus |970662609905 |\n", + "|26-46-j19-1|8591226 |51.TA.26-46-j19-1.3.H|8591226|19:15:00 |19:15:00 |10 |Zürich, Kempfhofsteig |47.3973037636525|8.51002814853975|Zürich, Hardplatz|506 |0 |19:05:00 |800 |17 |736 |Bus |970662609906 |\n", + "|26-46-j19-1|8591312 |51.TA.26-46-j19-1.3.H|8591312|19:15:00 |19:15:00 |11 |Zürich, Rebbergsteig |47.396902429888 |8.5149688826031 |Zürich, Hardplatz|506 |0 |19:05:00 |800 |17 |762 |Bus |970662609907 |\n", + "|26-46-j19-1|8591247 |51.TA.26-46-j19-1.3.H|8591247|19:17:00 |19:17:00 |12 |Zürich, Lehenstrasse |47.3962335334876|8.52015216179319|Zürich, Hardplatz|506 |0 |19:05:00 |800 |17 |1206 |Bus |970662609908 |\n", + "|26-46-j19-1|8591323 |51.TA.26-46-j19-1.3.H|8591323|19:18:00 |19:18:00 |13 |Zürich, Rosengartenstrasse|47.394330172729 |8.52546120512307|Zürich, Hardplatz|506 |0 |19:05:00 |800 |17 |1039 |Bus |970662609909 |\n", + "|26-46-j19-1|8580522 |51.TA.26-46-j19-1.3.H|8580522|19:19:00 |19:19:00 |14 |Zürich, Escher-Wyss-Platz 
|47.3907969150758|8.5223979500038 |Zürich, Hardplatz|506 |0 |19:05:00 |800 |17 |455 |Bus |970662609910 |\n", + "|26-46-j19-1|8594239 |51.TA.26-46-j19-1.3.H|8594239|19:20:00 |19:20:00 |15 |Zürich, Schiffbau |47.3875735990751|8.51944249271863|Zürich, Hardplatz|506 |0 |19:05:00 |800 |17 |832 |Bus |970662609911 |\n", + "|26-46-j19-1|8591060 |51.TA.26-46-j19-1.3.H|8591060|19:21:00 |19:21:00 |16 |Zürich Hardbrücke, Bahnhof|47.3849339821896|8.51703500775686|Zürich, Hardplatz|506 |0 |19:05:00 |800 |17 |850 |Bus |970662609912 |\n", + "|26-46-j19-1|8591177 |51.TA.26-46-j19-1.3.H|8591177|19:22:00 |19:22:00 |17 |Zürich, Hardplatz |47.3823428932854|8.51452870811382|Zürich, Hardplatz|506 |0 |19:05:00 |800 |17 |266 |Bus |970662609913 |\n", + "+-----------+---------------+---------------------+-------+------------+--------------+-------------+--------------------------+----------------+----------------+-----------------+---------------+------------+--------------------+---------+----------+--------+----------+---------------------------+" ] } ], "source": [ - "stop_times.where(stop_times.route_int==200).show(100, 0)" + "stop_times.where(stop_times.route_int==800).show(100, 0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Verified on sbb.ch. This one is interesting: the service stops in the middle of the day. That is also what is observed on sbb.ch." 
] } ], "metadata": { "kernelspec": { "display_name": "PySpark", "language": "", "name": "pysparkkernel" }, "language_info": { "codemirror_mode": { "name": "python", "version": 3 }, "mimetype": "text/x-python", "name": "pyspark", "pygments_lexer": "python3" } }, "nbformat": 4, "nbformat_minor": 4 } diff --git a/notebooks/probabilities.ipynb b/notebooks/hdfs_get_distributions.ipynb similarity index 95% rename from notebooks/probabilities.ipynb rename to notebooks/hdfs_get_distributions.ipynb index faf0c2e..892cfea 100644 --- a/notebooks/probabilities.ipynb +++ b/notebooks/hdfs_get_distributions.ipynb @@ -1,2476 +1,2321 @@ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Make distribution tables to calculate probabilities of transfer\n", "\n", "
Any application without a proper name would be promptly killed.
" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "Current session configs: {'conf': {'spark.app.name': 'lgptguys_final'}, 'kind': 'pyspark'}
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "
IDYARN Application IDKindStateSpark UIDriver logCurrent session?
7272application_1589299642358_1768pysparkidleLinkLink
7292application_1589299642358_1788pysparkbusyLinkLink
7326application_1589299642358_1822pysparkidleLinkLink
7369application_1589299642358_1865pysparkidleLinkLink
7388application_1589299642358_1884pysparkidleLinkLink
7393application_1589299642358_1889pysparkidleLinkLink
7398application_1589299642358_1894pysparkidleLinkLink
7407application_1589299642358_1903pysparkidleLinkLink
7412application_1589299642358_1908pysparkbusyLinkLink
7415application_1589299642358_1911pysparkidleLinkLink
7418application_1589299642358_1914pysparkidleLinkLink
7420application_1589299642358_1916pysparkbusyLinkLink
7421application_1589299642358_1917pysparkidleLinkLink
7422application_1589299642358_1918pysparkbusyLinkLink
7423application_1589299642358_1919pysparkidleLinkLink
7424application_1589299642358_1920pysparkidleLinkLink
7426application_1589299642358_1922pysparkidleLinkLink
7427application_1589299642358_1923pysparkidleLinkLink
7428application_1589299642358_1924pysparkbusyLinkLink
7429application_1589299642358_1925pysparkidleLinkLink
7431application_1589299642358_1927pysparkidleLinkLink
7433application_1589299642358_1929pysparkidleLinkLink
7434application_1589299642358_1930pysparkidleLinkLink
7435application_1589299642358_1931pysparkbusyLinkLink
7437application_1589299642358_1933pysparkidleLinkLink
7438application_1589299642358_1934pysparkidleLinkLink
7440application_1589299642358_1936pysparkidleLinkLink
7441application_1589299642358_1937pysparkidleLinkLink
7443application_1589299642358_1939pysparkidleLinkLink
7444application_1589299642358_1940pysparkidleLinkLink
7445application_1589299642358_1941pysparkidleLinkLink
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%%configure\n", "{\"conf\": {\n", " \"spark.app.name\": \"lgptguys_final\"\n", "}}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Start Spark" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Starting Spark application\n" ] }, { "data": { "text/html": [ "\n", "
IDYARN Application IDKindStateSpark UIDriver logCurrent session?
7446application_1589299642358_1942pysparkidleLinkLink
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "SparkSession available as 'spark'.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "An error was encountered:\n", "unknown magic command '%spark'\n", "UnknownMagic: unknown magic command '%spark'\n", "\n" ] } ], "source": [ "# Initialization\n", "%%spark" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "An error was encountered:\n", "Variable named username not found.\n" ] } ], "source": [ "%%send_to_spark -i username -t str -n username" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Import useful libraries " ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from geopy.distance import great_circle\n", "from pyspark.sql.functions import *\n", "from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ 
"### Read TimeTable data for routes / trips " ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-----------+---------------------------+-------+\n", "|stop_id_raw|stop_name |stop_id|\n", "+-----------+---------------------------+-------+\n", "|8500926 |Oetwil a.d.L., Schweizäcker|8500926|\n", "|8502186 |Dietikon Stoffelbach |8502186|\n", "|8502186:0:1|Dietikon Stoffelbach |8502186|\n", "|8502186:0:2|Dietikon Stoffelbach |8502186|\n", "|8502186P |Dietikon Stoffelbach |8502186|\n", "+-----------+---------------------------+-------+\n", "only showing top 5 rows" ] } ], "source": [ "stops_15km = spark.read.csv('data/lgpt_guys/stops_15km.csv', header = True)\n", "\n", "# We use only first 7 characters of stop_id to remove special cases\n", "stops_15km = stops_15km.select(col('stop_id').alias('stop_id_raw'), 'stop_name')\\\n", " .withColumn('stop_id',col('stop_id_raw').substr(1, 7))\n", "stops_15km.show(5, False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Read the [SBB actual data](https://opentransportdata.swiss/en/dataset/istdaten) in ORC format" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "sbb = spark.read.orc('/data/sbb/orc/istdaten')" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": 
{ "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- betriebstag: string (nullable = true)\n", " |-- fahrt_bezeichner: string (nullable = true)\n", " |-- betreiber_id: string (nullable = true)\n", " |-- betreiber_abk: string (nullable = true)\n", " |-- betreiber_name: string (nullable = true)\n", " |-- produkt_id: string (nullable = true)\n", " |-- linien_id: string (nullable = true)\n", " |-- linien_text: string (nullable = true)\n", " |-- umlauf_id: string (nullable = true)\n", " |-- verkehrsmittel_text: string (nullable = true)\n", " |-- zusatzfahrt_tf: string (nullable = true)\n", " |-- faellt_aus_tf: string (nullable = true)\n", " |-- bpuic: string (nullable = true)\n", " |-- haltestellen_name: string (nullable = true)\n", " |-- ankunftszeit: string (nullable = true)\n", " |-- an_prognose: string (nullable = true)\n", " |-- an_prognose_status: string (nullable = true)\n", " |-- abfahrtszeit: string (nullable = true)\n", " |-- ab_prognose: string (nullable = true)\n", " |-- ab_prognose_status: string (nullable = true)\n", " |-- durchfahrt_tf: string (nullable = true)" ] } ], "source": [ "sbb.printSchema()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Subset SBB data\n", "\n", "We keep only the stops within a 15 km range of Zurich HB, using the `stop_id` field from _stops_15km_. We did not restrict the data to `geschaetz` prognosis times only, as there was too little overlap between the _timetable_ and _sbb_ datasets with only `geschaetz` arrival times. 
_To do next : Use only geschaetz when available_" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "10848628\n", "+----------------+-----------------+----------------+----------------+-------------------+-------------------+-------+\n", "|fahrt_bezeichner|haltestellen_name|ankunftszeit |abfahrtszeit |an_prognose |ab_prognose |stop_id|\n", "+----------------+-----------------+----------------+----------------+-------------------+-------------------+-------+\n", "|85:11:10:002 |Zürich HB |03.09.2018 21:51| |03.09.2018 21:53:40| |8503000|\n", "|85:11:11:001 |Zürich HB | |03.09.2018 06:09| |03.09.2018 06:10:22|8503000|\n", "|85:11:12:001 |Zürich HB |03.09.2018 10:51| |03.09.2018 10:51:28| |8503000|\n", "|85:11:1251:003 |Zürich HB |03.09.2018 07:00| |03.09.2018 07:00:01| |8503000|\n", "|85:11:1252:001 |Zürich HB |03.09.2018 21:23|03.09.2018 21:36|03.09.2018 21:24:55|03.09.2018 21:36:57|8503000|\n", "+----------------+-----------------+----------------+----------------+-------------------+-------------------+-------+\n", "only showing top 5 rows" ] } ], "source": [ "# Used to subset sbb table based on stop_id \n", "l1_id = stops_15km.select('stop_id').collect()\n", "l2_id = [item.stop_id for item in l1_id]\n", "\n", "# Used to subset sbb table based on stop_names \n", "l1_name = stops_15km.select('stop_name').collect()\n", "l2_name = [item.stop_name for item in l1_name]\n", "\n", "# Make the subset dataframe\n", "sbb_filt = sbb.filter( ( sbb['bpuic'].isin(l2_id) | sbb['bpuic'].isin(l2_name) ) &\\\n", " ((sbb.an_prognose_status == 'REAL') | \\\n", " (sbb.an_prognose_status == 'GESCHAETZ') | 
\\\n", " (sbb.ab_prognose_status == 'REAL') | \\\n", " (sbb.ab_prognose_status == 'GESCHAETZ') ) ) \\\n", " .select('fahrt_bezeichner','haltestellen_name', \\\n", " 'ankunftszeit', 'abfahrtszeit', \\\n", " 'an_prognose', 'ab_prognose', \\\n", " col('bpuic').alias('stop_id'))\n", "\n", "print sbb_filt.count()\n", "sbb_filt.show(5,False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Write the subset table to HDFS for better performance in later use" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# save\n", "username = 'acoudray'\n", "sbb_filt.write.format(\"orc\").save(\"/user/{}/sbb_filt_forDelays_GeschaetzAndReal.orc\".format(username))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Summary of tables written in /user/{}/ :\n", "- sbb_filt_forDelays_noGaeschetz.orc : table with all dates, < 15km, no GESCHAETZ, used 7-char trimmed stop_id in timetable data\n", "- sbb_filt_forDelays2.orc : table with all dates, < 15km, only GESCHAETZ, used 7-char trimmed stop_id in timetable data\n", "- sbb_filt_forDelays.orc : table with all dates, < 15km, only GESCHAETZ\n", "- sbb_sub_forDelays.ord : old table, to be removed" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Measure Distributions of Delay Times per trip and station\n", "\n", "The goal of this chapter is to pre-compute probabilities for the McRaptor implementation, which will ultimately be used to choose the best trip according to its time __and probability of success__. The goal is to create a distribution of arrival delays for each station / trip_id pair. 
\n", "\n", "We begin with a simple query of trip_id / station_id and build up to the full table generation made from correspondence tables between sbb and timetable trip_ids (they need to be translated first, which is done in `match_datasets.ipynb`).\n", "\n", "#### Simple task : returning the distribution for a given station / trip id\n", "\n", "Let's begin by exploring _sbb_ data and computing a distribution step by step for a given station / trip_id " ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "2133164" ] } ], "source": [ "# Load sbb data \n", "username='acoudray'\n", "sbb = spark.read.orc(\"/user/{}/sbb_filt_forDelays.orc\".format(username))\n", "sbb.count()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-------------------+\n", "| haltestellen_name|\n", "+-------------------+\n", "|Winkel am Zürichsee|\n", "| Zürich Flughafen|\n", "| Kemptthal|\n", "| Urdorf|\n", "| Zürich Wiedikon|\n", "+-------------------+\n", "only showing top 5 rows" ] } ], "source": [ "sbb.select(\"haltestellen_name\").distinct().show(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here we show the first few lines of all unique stations. We pick one of them and show its first associated trip id." 
] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+----------------+\n", "|fahrt_bezeichner|\n", "+----------------+\n", "| 85:11:1507:002|\n", "| 85:11:1509:003|\n", "| 85:11:1510:003|\n", "| 85:11:1511:003|\n", "| 85:11:1512:003|\n", "+----------------+\n", "only showing top 5 rows" ] } ], "source": [ "stop=\"Zürich Flughafen\"\n", "sbb.filter(sbb.haltestellen_name == stop).select(\"fahrt_bezeichner\").show(5)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+----------------+--------------+-------------------+-------------+-------------+-------+\n", "| station| trip_id| arrival_true|DiffInSeconds|DiffInMinutes|weekday|\n", "+----------------+--------------+-------------------+-------------+-------------+-------+\n", "|Zürich Flughafen|85:11:1507:002|2018-05-06 06:49:24| 24| 0.0| Sun|\n", "|Zürich Flughafen|85:11:1507:002|2018-05-05 06:49:15| 15| 0.0| Sat|\n", "|Zürich Flughafen|85:11:1507:002|2018-05-04 06:50:38| 98| 2.0| Fri|\n", "|Zürich Flughafen|85:11:1507:002|2018-05-03 06:50:11| 71| 1.0| Thu|\n", "|Zürich Flughafen|85:11:1507:002|2018-05-02 06:49:30| 30| 1.0| Wed|\n", "|Zürich Flughafen|85:11:1507:002|2018-05-01 06:49:38| 38| 1.0| Tue|\n", "|Zürich Flughafen|85:11:1507:002|2018-04-30 06:49:59| 59| 
1.0| Mon|\n", "|Zürich Flughafen|85:11:1507:002|2018-04-29 06:49:16| 16| 0.0| Sun|\n", "|Zürich Flughafen|85:11:1507:002|2018-04-28 06:49:37| 37| 1.0| Sat|\n", "|Zürich Flughafen|85:11:1507:002|2018-04-27 06:50:00| 60| 1.0| Fri|\n", "|Zürich Flughafen|85:11:1507:002|2018-04-26 06:49:58| 58| 1.0| Thu|\n", "|Zürich Flughafen|85:11:1507:002|2018-04-25 06:49:44| 44| 1.0| Wed|\n", "|Zürich Flughafen|85:11:1507:002|2018-04-24 06:50:10| 70| 1.0| Tue|\n", "|Zürich Flughafen|85:11:1507:002|2018-04-23 06:49:53| 53| 1.0| Mon|\n", "|Zürich Flughafen|85:11:1507:002|2018-04-22 06:49:33| 33| 1.0| Sun|\n", "|Zürich Flughafen|85:11:1507:002|2018-04-21 06:49:00| 0| 0.0| Sat|\n", "|Zürich Flughafen|85:11:1507:002|2018-04-20 06:49:43| 43| 1.0| Fri|\n", "|Zürich Flughafen|85:11:1507:002|2018-04-19 06:49:00| 0| 0.0| Thu|\n", "|Zürich Flughafen|85:11:1507:002|2018-04-18 06:49:39| 39| 1.0| Wed|\n", "|Zürich Flughafen|85:11:1507:002|2018-04-17 06:49:36| 36| 1.0| Tue|\n", "+----------------+--------------+-------------------+-------------+-------------+-------+\n", "only showing top 20 rows" ] } ], "source": [ "trip_id=\"85:11:1507:002\"\n", "\n", "# First filter - filter selected station/trip_id, with define arrival time and GAESCHETZ status\n", "# Select 4 fields of interest, rename \n", "# Convert date-like string to timestamp\n", "# Compute difference between scheduled and actual arrivals times\n", "# reselect to generate weekday\n", "sbb_filt = sbb.filter( (sbb.fahrt_bezeichner == trip_id) & (sbb.haltestellen_name == stop) )\\\n", " .select(col(\"haltestellen_name\").alias(\"station\"), \\\n", " col(\"fahrt_bezeichner\").alias(\"trip_id\"), \\\n", " col(\"an_prognose\").alias(\"arrival_true\"),\\\n", " col(\"ankunftszeit\").alias(\"arrival_expected\"))\\\n", " .withColumn('arrival_true',to_timestamp(col('arrival_true'),\\\n", " format='dd.MM.yyyy HH:mm:ss'))\\\n", " .withColumn('arrival_expected',to_timestamp(col('arrival_expected'),\\\n", " format='dd.MM.yyyy HH:mm'))\\\n", " 
.withColumn('DiffInSeconds',col('arrival_true').cast(LongType()) - col('arrival_expected').cast(LongType()))\\\n", " .withColumn('DiffInMinutes',round(col('DiffInSeconds')/60))\\\n", " .select(\"station\", \"trip_id\", \"arrival_true\", \"DiffInSeconds\", \"DiffInMinutes\",\\\n", " date_format('arrival_expected', 'E').alias('weekday'))\\\n", " .orderBy(\"arrival_true\", ascending=False)\n", "sbb_filt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Given a station name and a trip id, we can get all arrival times (scheduled and actual), and compute all delays in seconds and minutes. As we can see, the expected arrival time `ankunftszeit` is always the same, whereas the actual arrival `an_prognose` (with `an_prognose_status` equal to `GESCHAETZT`) varies.\n", "\n", "We remove Saturdays and Sundays to compute the arrival distribution based on weekdays only " ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-------------+-----+\n", "|DiffInMinutes|count|\n", "+-------------+-----+\n", "| 0.0| 19|\n", "| 1.0| 50|\n", "| 2.0| 14|\n", "| 3.0| 4|\n", "| 4.0| 1|\n", "| 11.0| 1|\n", "| 21.0| 1|\n", "+-------------+-----+" ] } ], "source": [ "sbb_filt.filter( (sbb_filt.weekday != \"Sun\") & (sbb_filt.weekday != \"Sat\") )\\\n", " .groupBy('DiffInMinutes').count()\\\n", " .orderBy(\"DiffInMinutes\").show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In the next steps, we will pivot this kind of table for multiple trip ids at once. 
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Load Table and make distribution from list of stations/trip_id\n", "\n", "Here we compute distribution of delays for a group of stations with all associated trips. The goal is to develop a script able to make a distribution for all stations/trips of interests.\n", "\n", "To train a bit the concept, let's first use all station with _Zurich_ pattern in their name and compute their delay distribution." ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+\n", "| haltestellen_name|\n", "+--------------------+\n", "| Winkel am Zürichsee|\n", "| Zürich Flughafen|\n", "| Zürich Wiedikon|\n", "| Zürich Stadelhofen|\n", "|Zürich Tiefenbrunnen|\n", "+--------------------+\n", "only showing top 5 rows" ] } ], "source": [ "expr = \"Z.rich*\" # regular expression to be used to get all Zurich* stations\n", "sbb.filter(sbb[\"haltestellen_name\"].rlike(expr))\\\n", " .select(\"haltestellen_name\")\\\n", " .distinct().show(5)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+---------------+-------------------+-------------+-------------+-------+\n", "| station| trip_id| 
arrival_true|DiffInSeconds|DiffInMinutes|weekday|\n", "+--------------------+---------------+-------------------+-------------+-------------+-------+\n", "| Zürich HB| 85:11:543:001|2018-05-07 01:12:14| 14| 0.0| Mon|\n", "| Zürich HB|85:11:30797:011|2018-05-07 00:58:45| 225| 4.0| Mon|\n", "| Zürich HB|85:11:19694:001|2018-05-07 00:56:12| -48| -1.0| Mon|\n", "| Zürich HB| 85:11:2294:003|2018-05-07 00:54:06| -54| -1.0| Mon|\n", "| Zürich Hardbrücke|85:11:30797:011|2018-05-07 00:54:03| 123| 2.0| Mon|\n", "| Zürich Stadelhofen|85:11:19694:001|2018-05-07 00:53:40| 40| 1.0| Mon|\n", "| Zürich Hardbrücke|85:11:30794:002|2018-05-07 00:50:26| 86| 1.0| Mon|\n", "| Zürich Oerlikon|85:11:30797:011|2018-05-07 00:49:46| 106| 2.0| Mon|\n", "|Zürich Tiefenbrunnen|85:11:19694:001|2018-05-07 00:49:38| -22| 0.0| Mon|\n", "| Zürich HB|85:11:30794:002|2018-05-07 00:47:16| 136| 2.0| Mon|\n", "| Zürich HB|85:11:30692:007|2018-05-07 00:45:02| 242| 4.0| Mon|\n", "| Zürich HB|85:11:20495:001|2018-05-07 00:45:01| -59| -1.0| Mon|\n", "| Zürich Altstetten|85:11:18594:001|2018-05-07 00:44:53| 53| 1.0| Mon|\n", "| Zürich Stadelhofen|85:11:30794:002|2018-05-07 00:44:27| 147| 2.0| Mon|\n", "| Zürich Stadelhofen|85:11:30692:007|2018-05-07 00:42:37| 277| 5.0| Mon|\n", "| Zürich HB| 85:11:4793:001|2018-05-07 00:42:10| 10| 0.0| Mon|\n", "| Zürich Flughafen| 85:11:2294:003|2018-05-07 00:42:07| -113| -2.0| Mon|\n", "| Zürich Hardbrücke|85:11:18594:001|2018-05-07 00:41:27| 27| 0.0| Mon|\n", "| Zürich HB|85:11:18795:001|2018-05-07 00:41:05| 65| 1.0| Mon|\n", "| Zürich Wipkingen|85:11:20495:001|2018-05-07 00:40:52| 52| 1.0| Mon|\n", "+--------------------+---------------+-------------------+-------------+-------------+-------+\n", "only showing top 20 rows" ] } ], "source": [ "expr = \"Z.rich*\"\n", "\n", "# First filter - Take Zurich-like stations , with define arrival time and GAESCHETZ status\n", "# Select 4 fields of interest, rename \n", "# Convert date-like string to timestamp\n", "# Compute 
difference between scheduled and actual arrivals times\n", "# reselect to generate weekday\n", "sbb_filt = sbb.filter((sbb[\"haltestellen_name\"].rlike(expr)) )\\\n", " .select(col(\"haltestellen_name\").alias(\"station\"), \\\n", " col(\"fahrt_bezeichner\").alias(\"trip_id\"), \\\n", " col(\"an_prognose\").alias(\"arrival_true\"),\\\n", " col(\"ankunftszeit\").alias(\"arrival_expected\"))\\\n", " .withColumn('arrival_true',to_timestamp(col('arrival_true'),\\\n", " format='dd.MM.yyyy HH:mm:ss'))\\\n", " .withColumn('arrival_expected',to_timestamp(col('arrival_expected'),\\\n", " format='dd.MM.yyyy HH:mm'))\\\n", " .withColumn('DiffInSeconds',col('arrival_true').cast(LongType()) - col('arrival_expected').cast(LongType()))\\\n", " .withColumn('DiffInMinutes',round(col('DiffInSeconds')/60))\\\n", " .select(\"station\", \"trip_id\", \"arrival_true\", \"DiffInSeconds\", \"DiffInMinutes\",\\\n", " date_format('arrival_expected', 'E').alias('weekday'))\\\n", " .orderBy(\"arrival_true\", ascending=False)\n", "\n", "# Remove Saturday and Sunday weekdays from table - show\n", "sbb_filt = sbb_filt.filter( (sbb_filt.weekday != \"Sun\") & (sbb_filt.weekday != \"Sat\") )\n", "sbb_filt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To build the distribution, we use groupBy followed by a pivot on the delay time in minutes. We fill null values with 0. No lower/upper bounds for now. Negative column keys mean arrival ahead of schedule." 
] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "textn", "| station| trip_id|null|-28.0|-15.0|-13.0|-12.0|-11.0|-10.0|-9.0|-8.0|-7.0|-6.0|-5.0|-4.0|-3.0|-2.0|-1.0|0.0|1.0|2.0|3.0|4.0|5.0|6.0|7.0|8.0|9.0|10.0|11.0|12.0|13.0|14.0|15.0|16.0|17.0|18.0|19.0|20.0|21.0|22.0|23.0|24.0|25.0|26.0|27.0|28.0|29.0|30.0|31.0|32.0|33.0|34.0|35.0|36.0|37.0|38.0|39.0|40.0|41.0|42.0|43.0|44.0|45.0|46.0|47.0|48.0|49.0|50.0|51.0|52.0|53.0|54.0|55.0|56.0|57.0|59.0|60.0|61.0|62.0|63.0|64.0|65.0|66.0|67.0|68.0|69.0|70.0|71.0|72.0|73.0|76.0|77.0|78.0|79.0|80.0|82.0|85.0|86.0|90.0|96.0|99.0|102.0|111.0|120.0|122.0|127.0|132.0|149.0|150.0|152.0|180.0|210.0|\nn", "|Zürich Tiefenbrunnen|85:11:19639:001| 2| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 65| 10| 5| 2| 2| 2| 1| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n", "| Zürich Enge|85:11:18267:001| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 5| 1| 1| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n", "| Zürich HB|85:11:30992:009| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 9| 3| 1| 0| 0| 0| 0| 0| 1| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 
0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n", "| Zürich Altstetten|85:11:19978:001| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 2| 70| 8| 2| 0| 1| 1| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n", "| Zürich HB|85:11:18873:001| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 20| 22| 24| 7| 7| 1| 0| 0| 1| 0| 0| 0| 1| 0| 0| 0| 1| 0| 1| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\nn", "only showing top 5 rows" ] } ], "source": [ "sbb_filt.groupBy('station', 'trip_id').pivot(\"DiffInMinutes\").count()\\\n", " .na.fill(0)\\\n", " .show(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As an addition to this distribution, we can set up lower / upper bound to constrain the distribution to a specific window of interest. We do not really care about train being ahead, so we put them all in -1 column index, And we look at delays until 30 minutes only." 
] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+------------------+---------------+----+---+---+---+---+---+---+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+\n", "| station| trip_id|-1.0|0.0|1.0|2.0|3.0|4.0|5.0|6.0|7.0|8.0|9.0|10.0|11.0|12.0|13.0|14.0|15.0|16.0|17.0|18.0|19.0|20.0|21.0|22.0|23.0|24.0|25.0|26.0|27.0|28.0|29.0|30.0|\n", "+------------------+---------------+----+---+---+---+---+---+---+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+\n", "| Zürich HB|85:11:30992:009| 9| 3| 1| 0| 0| 0| 0| 0| 1| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n", "| Zürich HB|85:11:18873:001| 0| 20| 22| 24| 7| 7| 1| 0| 0| 1| 0| 0| 0| 1| 0| 0| 0| 1| 0| 1| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n", "| Zürich Oerlikon|85:11:20438:002| 0| 0| 27| 50| 8| 4| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n", "|Zürich Wollishofen|85:11:18822:001| 0| 2| 2| 1| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n", "| Zürich Flughafen| 85:11:2270:001| 9| 46| 30| 2| 1| 1| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n", "+------------------+---------------+----+---+---+---+---+---+---+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+\n", "only showing top 5 rows" ] } ], "source": [ "lower_bound = -1.0\n", "upped_bound = +30.0\n", "\n", "sbb_bounded = 
sbb_filt.withColumn('DiffInMinutes_bounded1',\\\n", " greatest(col('DiffInMinutes'), lit(lower_bound) ))\\\n", " .withColumn('DiffInMinutes_bounded2',\\\n", " least(col('DiffInMinutes_bounded1'), lit(upped_bound) ))\n", "\n", "sbb_bounded.groupBy('station', 'trip_id').pivot(\"DiffInMinutes_bounded2\").count()\\\n", " .na.fill(0)\\\n", " .show(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Work from translation tables \n", "\n", "We will use data generated in `match_datasets.ipynb`. We begin by looking at all trip_ids that are found in both datasets with at least 5 stations in common.\n", "\n", "Our goal is to find a match in the sbb dataset for all _timetable_ trips (and not the other way around). So we will focus on getting this asymmetrical correspondence table. \n", "\n", "When we find a clear one-to-one match, we will mark it as _resolved_, when there is a one-to-many relation, we will call it _partly_resolved_ and if we cannot find an sbb trip that corresponds to a timetable trip_id, we will call it _fail_to_resolve_. \n", "\n", "These labels will be used to differentiate 3 different ways to compute probabilities :\n", "- __One-to-one__ we find a clear match : we use the distribution of delays on weekdays for a given trip/station_id based on all past sbb data. \n", "- __One-to-many__ we find multiple matches :\n", " - First we double check the matches, if we have the same type of transportation for example.\n", " - If they seem to be correct, we can merge the trips from sbb and get the merged distribution of their delays.\n", "- __One-to-none__ we find no match : then we get the distribution of delays for similar transportation types, at a similar hour (in a window), during weekdays of the sbb dataset.\n", " - Alternative : Try to find the best match and use only the closest location/time to infer a given distribution.\n", " - Alternative 2 : use k-nearest neighbors in terms of location/time." 
] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "243152\n", "+------------------------+----------------------+-----+\n", "|trip_id |fahrt_bezeichner |count|\n", "+------------------------+----------------------+-----+\n", "|241.TA.26-14-j19-1.43.H |85:11:19435:001 |13 |\n", "|1419.TA.26-8-C-j19-1.8.R|85:3849:169172-07008-1|23 |\n", "|1015.TA.26-4-j19-1.25.H |85:3849:49891-03002-1 |7 |\n", "|1955.TA.26-13-j19-1.24.H|85:3849:89261-02013-1 |5 |\n", "|1217.TA.26-72-j19-1.6.R |85:849:55624-25033-1 |7 |\n", "+------------------------+----------------------+-----+\n", "only showing top 5 rows" ] } ], "source": [ "joined_trip_atL5 = spark.read.csv('data/lgpt_guys/joined_trip_atL5.csv', header = True)\n", "print joined_trip_atL5.count()\n", "joined_trip_atL5.show(5, False)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "31103" ] } ], "source": [ "joined_trip_atL5.select('fahrt_bezeichner').distinct().count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We also use the subset of sbb data (we use the filtered data `sbb_filt` made at the top of the notebook)." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can begin by assembling sbb data set with translation table `joined_trip_atL5` " ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "10848628\n", "+----------------+-----------------+----------------+----------------+-------------------+-------------------+-------+\n", "|fahrt_bezeichner|haltestellen_name| ankunftszeit| abfahrtszeit| an_prognose| ab_prognose|stop_id|\n", "+----------------+-----------------+----------------+----------------+-------------------+-------------------+-------+\n", "| 85:11:10:002| Zürich HB|12.10.2018 21:51| |12.10.2018 21:51:50| |8503000|\n", "| 85:11:10293:004| Zürich HB| |13.10.2018 00:25| |13.10.2018 00:26:08|8503000|\n", "| 85:11:10293:004| Zürich Flughafen|13.10.2018 00:34|13.10.2018 00:35|13.10.2018 00:35:27|13.10.2018 00:36:44|8503016|\n", "+----------------+-----------------+----------------+----------------+-------------------+-------------------+-------+\n", "only showing top 3 rows" ] } ], "source": [ "username = 'acoudray'\n", "sbb_filt = spark.read.orc(\"/user/{}/sbb_filt_forDelays_GeschaetzAndReal.orc\".format(username))\n", "print(sbb_filt.count())\n", "sbb_filt.show(3)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": 
"stream", "text": [ "16474877\n", "+----------------+-----------------+----------------+----------------+-------------------+-------------------+-------+-------+-----+\n", "|fahrt_bezeichner|haltestellen_name|ankunftszeit |abfahrtszeit |an_prognose |ab_prognose |stop_id|trip_id|count|\n", "+----------------+-----------------+----------------+----------------+-------------------+-------------------+-------+-------+-----+\n", "|85:11:10:002 |Zürich HB |12.10.2018 21:51| |12.10.2018 21:51:50| |8503000|null |null |\n", "|85:11:10293:004 |Zürich HB | |13.10.2018 00:25| |13.10.2018 00:26:08|8503000|null |null |\n", "|85:11:10293:004 |Zürich Flughafen |13.10.2018 00:34|13.10.2018 00:35|13.10.2018 00:35:27|13.10.2018 00:36:44|8503016|null |null |\n", "|85:11:10536:004 |Zürich HB | |12.10.2018 20:03| |12.10.2018 20:04:20|8503000|null |null |\n", "|85:11:10537:006 |Zürich HB |12.10.2018 21:59| |12.10.2018 22:01:43| |8503000|null |null |\n", "|85:11:10538:004 |Zürich HB | |12.10.2018 21:03| |12.10.2018 21:04:42|8503000|null |null |\n", "|85:11:10539:005 |Zürich HB |12.10.2018 22:59| |12.10.2018 23:00:10| |8503000|null |null |\n", "|85:11:10540:004 |Zürich HB | |12.10.2018 22:03| |12.10.2018 22:06:29|8503000|null |null |\n", "|85:11:10734:007 |Zürich Flughafen |12.10.2018 20:16|12.10.2018 20:18|12.10.2018 20:15:27|12.10.2018 20:18:39|8503016|null |null |\n", "|85:11:10734:007 |Zürich HB |12.10.2018 20:27|12.10.2018 20:32|12.10.2018 20:26:44|12.10.2018 20:33:02|8503000|null |null |\n", "+----------------+-----------------+----------------+----------------+-------------------+-------------------+-------+-------+-----+\n", "only showing top 10 rows" ] } ], "source": [ "joined_sbb = sbb_filt.join(joined_trip_atL5, on = ['fahrt_bezeichner'], how = 'left_outer')\n", "\n", "print joined_sbb.count()\n", "joined_sbb.show(10,False)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", 
"version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "46399" ] } ], "source": [ "joined_sbb.select(\"fahrt_bezeichner\", \"trip_id\").distinct().count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The reference table we will use is the `stop_times` tables containing trip_id and stop_id. As a next step, we will put them in the same order raptor will read them." ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------------+------------+------------+\n", "| trip_id|stop_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|hour_departure| route_id|direction_id|\n", "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------------+------------+------------+\n", "|666.TA.26-4-j19-1...|8576182| 07:02:00| 07:02:00| 1| 0| 0| 7.0| 26-4-j19-1| 1|\n", "|243.TA.26-311-j19...|8590834| 07:16:00| 07:16:00| 1| 0| 0| 7.0|26-311-j19-1| 1|\n", "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------------+------------+------------+\n", "only showing top 2 rows" ] } ], "source": [ "stop_times_curated = spark.read.csv('data/lgpt_guys/stop_times_curated.csv', header = True)\n", "stop_times_curated.show(2)" ] }, { "cell_type": "code", "execution_count": 
13, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "250777\n", "+-----------+-----------------------+-------+\n", "|stop_id_raw|trip_id |stop_id|\n", "+-----------+-----------------------+-------+\n", "|8576182 |666.TA.26-4-j19-1.20.R |8576182|\n", "|8590834 |243.TA.26-311-j19-1.3.R|8590834|\n", "|8591349 |406.TA.26-62-j19-1.3.R |8591349|\n", "+-----------+-----------------------+-------+\n", "only showing top 3 rows" ] } ], "source": [ "# We use only first 7 characters of stop_id to remove special cases\n", "stop_times_curated = stop_times_curated.select(col('stop_id').alias('stop_id_raw'), \n", " 'trip_id')\\\n", " .withColumn('stop_id',col('stop_id_raw').substr(1, 7))\n", "\n", "print stop_times_curated.count()\n", "stop_times_curated.show(3, False)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "19800" ] } ], "source": [ "stop_times_curated.select('trip_id').distinct().count()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" 
}, { "name": "stdout", "output_type": "stream", "text": [ "9478785\n", "+-------------------------+-------+-----+----------------+------------+------------+-----------+-----------+\n", "|trip_id |stop_id|count|fahrt_bezeichner|ankunftszeit|abfahrtszeit|an_prognose|ab_prognose|\n", "+-------------------------+-------+-----+----------------+------------+------------+-----------+-----------+\n", "|1.TA.26-89-j19-1.1.R |8591209|null |null |null |null |null |null |\n", "|10.TA.1-305-j19-1.1.R |8587018|null |null |null |null |null |null |\n", "|10.TA.26-69-j19-1.2.H |8591122|null |null |null |null |null |null |\n", "|10.TA.26-845-j19-1.2.H |8580879|null |null |null |null |null |null |\n", "|10.TA.26-918-j19-1.1.R |8590701|null |null |null |null |null |null |\n", "|10.TA.79-485-j19-1.1.R |8590461|null |null |null |null |null |null |\n", "|100.TA.26-748-j19-1.1.R |8590543|null |null |null |null |null |null |\n", "|1001.TA.26-70-A-j19-1.5.H|8591106|null |null |null |null |null |null |\n", "|1005.TA.26-70-A-j19-1.5.H|8591410|null |null |null |null |null |null |\n", "|1008.TA.26-142-j19-1.2.R |8590830|null |null |null |null |null |null |\n", "+-------------------------+-------+-----+----------------+------------+------------+-----------+-----------+\n", "only showing top 10 rows" ] } ], "source": [ "stop_times_join = stop_times_curated.join(joined_sbb, on=['trip_id', 'stop_id'], \n", " how='left_outer')\\\n", " .select('trip_id', 'stop_id', 'count',\n", " 'fahrt_bezeichner', 'ankunftszeit', 'abfahrtszeit',\n", " 'an_prognose', 'ab_prognose')\n", "\n", "print stop_times_join.count()\n", "stop_times_join.show(10, False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We then compute arrival delays as follows: \n", "- delay = arrival_true ( = `an_prognose`) - arrival_expected ( = `ankunftszeit`). Trains that are late have a positive delay, and trains that are ahead of schedule have a negative one." 
] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-------+------------------------+-------------------+-------------+-------------+-------+\n", "|stop_id|trip_id |arrival_true |DiffInSeconds|DiffInMinutes|weekday|\n", "+-------+------------------------+-------------------+-------------+-------------+-------+\n", "|8503006|419.TA.26-2-j19-1.164.H |2018-10-12 06:10:43|43 |0 |Fri |\n", "|8503000|419.TA.26-2-j19-1.164.H |2018-10-12 06:15:56|56 |0 |Fri |\n", "|8503011|419.TA.26-2-j19-1.164.H |2018-10-12 06:19:45|45 |0 |Fri |\n", "|8503010|419.TA.26-2-j19-1.164.H |2018-10-12 06:21:35|-25 |0 |Fri |\n", "|8503202|419.TA.26-2-j19-1.164.H |2018-10-12 06:29:28|-32 |0 |Fri |\n", "|8503204|419.TA.26-2-j19-1.164.H |2018-10-12 06:34:42|-18 |0 |Fri |\n", "|8503204|214.TA.26-24-j19-1.121.R|2018-10-12 06:23:17|17 |0 |Fri |\n", "|8503204|74.TA.26-2-j19-1.9.R |2018-10-12 06:23:17|17 |0 |Fri |\n", "|8503202|214.TA.26-24-j19-1.121.R|2018-10-12 06:27:58|-2 |0 |Fri |\n", "|8503202|74.TA.26-2-j19-1.9.R |2018-10-12 06:27:58|-2 |0 |Fri |\n", "+-------+------------------------+-------------------+-------------+-------------+-------+\n", "only showing top 10 rows" ] } ], "source": [ "stop_times_diff = stop_times_join.select( col(\"an_prognose\").alias(\"arrival_true\"),\\\n", " col(\"ankunftszeit\").alias(\"arrival_expected\"),\\\n", " 'trip_id', 'stop_id')\\\n", " .withColumn('arrival_true',to_timestamp(col('arrival_true'),\\\n", " format='dd.MM.yyyy HH:mm:ss'))\\\n", " .withColumn('arrival_expected',to_timestamp(col('arrival_expected'),\\\n", " format='dd.MM.yyyy HH:mm'))\\\n", " 
.withColumn('DiffInSeconds',col('arrival_true').cast(LongType()) - col('arrival_expected').cast(LongType()))\\\n", " .withColumn('DiffInMinutes',(col('DiffInSeconds')/60).cast('integer'))\\\n", " .select(\"stop_id\", \"trip_id\", \"arrival_true\", \"DiffInSeconds\", \"DiffInMinutes\",\\\n", " date_format('arrival_expected', 'E').alias('weekday'))\n", "\n", "# Remove Saturday and Sunday weekdays from table - show\n", "stop_times_diff = stop_times_diff.filter( (stop_times_diff.weekday != \"Sun\") & (stop_times_diff.weekday != \"Sat\") )\n", "stop_times_diff.show(10, False)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-------+-------------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+\n", "|stop_id|trip_id |-1 |0 |1 |2 |3 |4 |5 |6 |7 |8 |9 |10 |11 |12 |13 |14 |15 |16 |17 |18 |19 |20 |21 |22 |23 |24 |25 |26 |27 |28 |29 |30 |\n", "+-------+-------------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+\n", "|8503020|45.TA.26-7-A-j19-1.12.H |0 |537|107|38 |13 |2 |3 |4 |1 |1 |1 |0 |1 |1 |0 |1 |1 |0 |0 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|8594307|44.TA.1-11-B-j19-1.2.H |0 |1 |4 |1 |2 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|8503125|59.TA.26-5-A-j19-1.28.R |0 |578|179|30 |12 |5 |2 |0 |1 |1 |0 |1 |1 |0 |0 |2 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|8590275|501.TA.1-2-A-j19-1.15.R |0 |23 |28 |9 |4 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 
|0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|8503000|147.TA.26-15-j19-1.41.H |0 |271|114|20 |7 |2 |1 |0 |0 |0 |0 |0 |0 |1 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|8503203|590.TA.26-8-A-j19-1.353.H|0 |463|648|340|106|34 |20 |8 |5 |3 |6 |3 |3 |7 |3 |1 |1 |1 |1 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|8502208|432.TA.26-24-j19-1.220.R |0 |86 |40 |10 |3 |2 |0 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|8503305|201.TA.26-24-j19-1.121.R |0 |184|60 |22 |7 |5 |2 |0 |1 |0 |0 |0 |0 |1 |0 |0 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|8590279|136.TA.1-4-B-j19-1.10.H |0 |4 |4 |3 |2 |2 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|8503003|258.TA.26-16-A-j19-1.93.H|85 |913|359|167|53 |34 |20 |10 |4 |5 |3 |2 |0 |0 |1 |0 |0 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|8503011|571.TA.26-8-A-j19-1.347.H|0 |590|457|116|40 |16 |4 |7 |1 |1 |2 |1 |1 |1 |0 |0 |0 |0 |0 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|8503020|389.TA.26-7-A-j19-1.108.R|0 |243|150|19 |1 |1 |0 |1 |0 |1 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|8517376|197.TA.1-17-A-j19-1.16.R |0 |274|499|212|101|34 |4 |2 |1 |0 |0 |1 |0 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |1 |0 |0 |0 |0 |0 |0 |0 |\n", "|8503006|377.TA.26-7-A-j19-1.108.R|0 |541|223|58 |9 |2 |2 |1 |0 |0 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|8503104|135.TA.26-6-A-j19-1.32.R |0 |394|165|63 |30 |10 |5 |5 |2 |2 |1 |1 |0 |1 |1 |0 |0 |0 |0 |0 |0 |0 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|8502187|187.TA.1-17-A-j19-1.16.R |1 |479|71 |6 |4 |1 |0 |0 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|8503020|105.TA.26-5-A-j19-1.37.R |0 |443|226|98 |35 |14 |5 |2 |0 |2 |1 |0 |2 |1 |2 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|8502221|54.TA.26-5-A-j19-1.28.R |152|492|116|41 |14 |6 |8 |2 |1 |0 |0 |1 |0 |1 |2 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 
|0 |0 |0 |0 |0 |\n", "|8502273|101.TA.1-17-A-j19-1.9.R |2 |507|330|205|95 |25 |4 |2 |1 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|8503052|238.TA.26-10-B-j19-1.10.R|0 |260|82 |47 |21 |12 |6 |3 |3 |1 |1 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "+-------+-------------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+\n", "only showing top 20 rows" ] } ], "source": [ "# we bound distribution to this \n", "lower_bound = -1\n", "upped_bound = +30\n", "\n", "stop_times_bounded = stop_times_diff.withColumn('DiffInMinutes_bounded1',\\\n", " greatest(col('DiffInMinutes'), lit(lower_bound) ))\\\n", " .withColumn('DiffInMinutes_bounded2',\\\n", " least(col('DiffInMinutes_bounded1'), lit(upped_bound) ))\n", "\n", "stop_times_distribution = stop_times_bounded.groupBy('stop_id', 'trip_id')\\\n", " .pivot(\"DiffInMinutes_bounded2\").count()\\\n", " .na.fill(0)\n", "\n", "stop_times_distribution.show(20, False)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "12309" ] } ], "source": [ "stop_times_distribution.count()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ 
"stop_times_distribution.write.csv('data/lgpt_guys/distribution_geschaetzAndReal.csv', \\\n", " header = True, mode=\"overwrite\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Analysing matches found \n" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+------------+------------+--------------+-------------+-----------+-------------+--------------+------------+------------+\n", "| trip_id| stop_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|hour_departure| route_id|direction_id|\n", "+--------------------+------------+------------+--------------+-------------+-----------+-------------+--------------+------------+------------+\n", "|666.TA.26-4-j19-1...| 8576182| 07:02:00| 07:02:00| 1| 0| 0| 7.0| 26-4-j19-1| 1|\n", "|243.TA.26-311-j19...| 8590834| 07:16:00| 07:16:00| 1| 0| 0| 7.0|26-311-j19-1| 1|\n", "|406.TA.26-62-j19-...| 8591349| 07:24:00| 07:24:00| 1| 0| 0| 7.0| 26-62-j19-1| 1|\n", "|62.TA.57-2-Y-j19-...|8503000:0:13| 07:34:00| 07:34:00| 1| 0| 0| 7.0|57-2-Y-j19-1| 0|\n", "|1179.TA.26-5-B-j1...| 8591245| 07:36:00| 07:36:00| 1| 0| 0| 7.0|26-5-B-j19-1| 1|\n", "+--------------------+------------+------------+--------------+-------------+-----------+-------------+--------------+------------+------------+\n", "only showing top 5 rows" ] } ], "source": [ "stop_times_curated = spark.read.csv('data/lgpt_guys/stop_times_curated.csv', header = True)\n", "stop_times_curated.show(5)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": 
"", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "250777\n", "+-----------+-----------------------+-------+\n", "|stop_id_raw|trip_id |stop_id|\n", "+-----------+-----------------------+-------+\n", "|8576182 |666.TA.26-4-j19-1.20.R |8576182|\n", "|8590834 |243.TA.26-311-j19-1.3.R|8590834|\n", "|8591349 |406.TA.26-62-j19-1.3.R |8591349|\n", "+-----------+-----------------------+-------+\n", "only showing top 3 rows" ] } ], "source": [ "# We use only first 7 characters of stop_id to remove special cases\n", "stop_times_curated = stop_times_curated.select(col('stop_id').alias('stop_id_raw'), \n", " 'trip_id')\\\n", " .withColumn('stop_id',col('stop_id_raw').substr(1, 7))\n", "\n", "print stop_times_curated.count()\n", "stop_times_curated.show(3, False)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "12309\n", "+-------+-------------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+\n", "|stop_id|trip_id |-1 |0 |1 |2 |3 |4 |5 |6 |7 |8 |9 |10 |11 |12 |13 |14 |15 |16 |17 |18 |19 |20 |21 |22 |23 |24 |25 |26 |27 |28 |29 |30 |\n", "+-------+-------------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+\n", "|8503003|286.TA.26-11-j19-1.80.H |0 |395|75 |23 |5 |1 |1 |1 |0 |0 |0 |0 |0 
|0 |0 |0 |0 |0 |0 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|8503089|40.TA.26-4-B-j19-1.1.R |0 |378|238|194|70 |46 |30 |11 |9 |8 |6 |4 |2 |2 |1 |0 |2 |2 |1 |0 |1 |1 |0 |0 |0 |0 |0 |1 |0 |0 |0 |0 |\n", "|8503094|166.TA.26-4-B-j19-1.7.H |0 |28 |158|102|50 |23 |21 |15 |7 |7 |1 |0 |1 |1 |2 |0 |0 |0 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|8503203|580.TA.26-8-A-j19-1.347.H|0 |659|504|246|110|58 |32 |13 |3 |1 |0 |5 |6 |3 |2 |2 |1 |1 |0 |2 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|8502223|116.TA.26-14-j19-1.18.R |0 |31 |144|56 |12 |3 |0 |0 |0 |0 |3 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|8503053|55.TA.79-10-B-j19-1.3.H |0 |165|43 |7 |2 |0 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|8503000|37.TA.26-15-j19-1.17.R |8 |220|93 |47 |26 |9 |5 |5 |1 |1 |1 |0 |0 |0 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|8503006|297.TA.26-14-j19-1.41.H |0 |123|197|59 |14 |11 |6 |1 |1 |0 |0 |1 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|8503091|24.TA.26-4-B-j19-1.1.R |0 |184|258|64 |7 |2 |0 |0 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|8503147|150.TA.26-3-j19-1.12.H |5 |470|119|43 |14 |7 |1 |3 |1 |0 |0 |1 |1 |0 |1 |0 |0 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "+-------+-------------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+\n", "only showing top 10 rows" ] } ], "source": [ "stop_times_distrib = spark.read.csv('data/lgpt_guys/distribution_geschaetzAndReal.csv', \\\n", " header = True)\n", "print stop_times_distrib.count()\n", "stop_times_distrib.show(10, False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "How many unique combination of stop_id / trip_id do we have ?" 
] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "12309" ] } ], "source": [ "stop_times_distrib.select(\"stop_id\",\"trip_id\").distinct().count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "How many rows seem to be empty? " ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "4" ] } ], "source": [ "stop_times_distrib.filter( (stop_times_distrib['-1'] == 0) &\\\n", " (stop_times_distrib['0'] == 0) &\\\n", " (stop_times_distrib['1'] == 0) &\\\n", " (stop_times_distrib['2'] == 0) ).count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Actually, no row has all values equal to zero: such a row would not have been produced at the pivot stage. (The filter above only tests the first four bins, -1 to 2, which is why it still matches 4 rows.) 
Now we want to see how many of the `stop_times_curated` lines we can get from this table :" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+\n", "| key| -1| 0| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 11| 12| 13| 14| 15| 16| 17| 18| 19| 20| 21| 22| 23| 24| 25| 26| 27| 28| 29| 30|\n", "+--------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+\n", "|10.TA.1-11-B-j19-...| 0| 2| 2| 1| 3| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n", "|10.TA.1-11-B-j19-...| 0| 3| 2| 2| 3| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n", "|10.TA.1-11-B-j19-...| 0| 0| 4| 4| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n", "|10.TA.1-11-B-j19-...| 0| 1| 5| 3| 1| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 1| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n", "|10.TA.1-11-B-j19-...| 0| 1| 3| 4| 1| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n", "|10.TA.26-912-j19-...| 0| 0| 1| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n", "|10.TA.79-10-B-j19...| 1|129|143| 71| 33| 17| 11| 4| 3| 3| 1| 1| 1| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n", "|10.TA.79-10-B-j19...| 0|333| 40| 22| 9| 5| 3| 4| 1| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n", "|10.TA.79-10-B-j19...| 
1|340| 37| 21| 6| 2| 6| 2| 2| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n", "|10.TA.79-10-B-j19...| 0| 0|177| 23| 7| 0| 1| 0| 1| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n", "|10.TA.79-10-B-j19...| 0|266| 81| 33| 15| 10| 3| 4| 3| 1| 0| 2| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n", "|10.TA.79-10-B-j19...| 0|142|139| 69| 31| 16| 8| 5| 4| 1| 1| 1| 1| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n", "|100.TA.26-6-A-j19...| 0|325| 62| 27| 1| 2| 2| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n", "|100.TA.26-6-A-j19...| 0|310| 84| 15| 5| 4| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n", "|100.TA.26-6-A-j19...| 0|330| 73| 9| 4| 2| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n", "|100.TA.26-6-A-j19...| 0|257|103| 42| 14| 1| 1| 1| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n", "|100.TA.26-6-A-j19...| 0| 69|228| 88| 23| 8| 1| 2| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n", "|100.TA.26-6-A-j19...| 0|300| 86| 22| 8| 1| 0| 2| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n", "|100.TA.26-6-A-j19...| 0|200|158| 38| 19| 2| 0| 1| 1| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n", "|100.TA.26-6-A-j19...| 0|349| 47| 14| 7| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n", "+--------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+\n", "only showing top 20 rows" ] } ], "source": [ "stop_times_final = stop_times_curated.join(stop_times_distrib,\\\n", " on = ['stop_id', 'trip_id'],\\\n", " how = 'inner').drop('stop_id_raw')\\\n", ".orderBy('trip_id', 'stop_id')\\\n", ".withColumn('key2', concat(col('trip_id'), lit('__'), col('stop_id')))\\\n", 
".drop('trip_id').drop('stop_id')\\\n", ".select(col('key2').alias('key'), \"*\")\\\n", ".drop('key2')\n", "\n", "stop_times_final.show(20)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We still have null values. Let's count how many null we have on the full table" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "reference table stop_times number of lines : 250777\n", "distribution table number of lines : 12309\n", "Number of missing keys in distribution : 0" ] } ], "source": [ "print \"reference table stop_times number of lines : {}\".format(stop_times_curated.count())\n", "print \"distribution table number of lines : {}\".format(stop_times_final.count())\n", "print \"Number of missing keys in distribution : {}\".format(stop_times_final.filter(stop_times_final['0'].isNull()).count())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We write two version of the table : one with missing values, and one with missing values filled with '1',allowing development of next steps in the meantime (filling these values with a better approach is discussed in next section _Recovering missing distributions_)" ] }, { "cell_type": "code", "execution_count": 108, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_final_fill1 = stop_times_final.na.fill(1) # not working, not 
IntegerType ..." ] }, { "cell_type": "code", "execution_count": 110, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-----------------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+\n", "|key |-1 |0 |1 |2 |3 |4 |5 |6 |7 |8 |9 |10 |11 |12 |13 |14 |15 |16 |17 |18 |19 |20 |21 |22 |23 |24 |25 |26 |27 |28 |29 |30 |\n", "+-----------------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+\n", "|1.TA.1-231-j19-1.1.H__8502553|1 |1 |1 |1 |1 |1 |1 |1 |1 |1 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|1.TA.1-231-j19-1.1.H__8502879|0 |78 |21 |6 |3 |1 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|1.TA.1-231-j19-1.1.H__8502955|1 |1 |1 |1 |1 |1 |1 |1 |1 |1 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|1.TA.1-231-j19-1.1.H__8503598|0 |170|42 |4 |2 |0 |2 |2 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|1.TA.1-231-j19-1.1.H__8503598|0 |170|42 |4 |2 |0 |2 |2 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|1.TA.1-231-j19-1.1.H__8572600|1 |1 |1 |1 |1 |1 |1 |1 |1 |1 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|1.TA.1-231-j19-1.1.H__8572601|1 |1 |1 |1 |1 |1 |1 |1 |1 |1 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|1.TA.1-231-j19-1.1.H__8572602|1 |1 |1 |1 |1 |1 |1 |1 |1 |1 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", 
"|1.TA.1-231-j19-1.1.H__8572603|2 |94 |9 |2 |2 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "|1.TA.1-231-j19-1.1.H__8572747|1 |1 |1 |1 |1 |1 |1 |1 |1 |1 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |\n", "+-----------------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+\n", "only showing top 10 rows" ] } ], "source": [ "# This contains the list of columns where we apply replace() function\n", "all_column_names = stop_times_final.columns\n", "columns_to_remove = ['key']\n", "columns_for_replacement = [i for i in all_column_names if i not in columns_to_remove]\n", "\n", "# Doing the replacement on all the requisite columns\n", "for i in columns_for_replacement:\n", " stop_times_final_fill1 = stop_times_final_fill1.withColumn(i,when((col(i).isNull()),int(int(i)<=10))\\\n", " .otherwise(col(i).cast(IntegerType())))\n", "stop_times_final_fill1.show(10, False)" ] }, { "cell_type": "code", "execution_count": 111, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_final.write.csv('data/lgpt_guys/distribution_1to1match_wNull.csv', \\\n", " header = True, mode=\"overwrite\")\n", "stop_times_final_fill1.write.csv('data/lgpt_guys/distribution_1to1match_fill1.csv', \\\n", " header = True, mode=\"overwrite\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Use local python to make definitive table with right ordering \n", "\n", "We first use the tables where null values were filled with 1 and 0 " ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "username = 
'acoudray'\n", "stop_times_final_fill1.write.csv(\"/user/{}/distribution_1to1match_fill1.csv\".format(username), \\\n", " header = True, mode = 'overwrite')" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# for geschaetz only\n", "stop_times_final.write.csv(\"/user/{}/distribution_1to1match_geschaetz.csv\".format(username), \\\n", " header = True, mode = 'overwrite')" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# for geschaetz and real only\n", "stop_times_final.write.csv(\"/user/{}/distribution_1to1match_geschaetzAndReal.csv\".format(username), \\\n", " header = True, mode = 'overwrite')" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+-----------+----------+--------------------+------------+\n", "| trip_id|stop_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|stop_int| 
route_id|sequence_1| trip_1| route_int|\n", "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+-----------+----------+--------------------+------------+\n", "|1.TA.1-231-j19-1.1.H|8572747| 09:37:00| 09:37:00| 1| 0| 0| 500|1-231-j19-1| 10.0|1.TA.1-231-j19-1.1.H|592705486850|\n", "|1.TA.1-231-j19-1.1.H|8573721| 09:50:00| 09:50:00| 10| 0| 0| 599|1-231-j19-1| 11.0|1.TA.1-231-j19-1.1.H|592705486850|\n", "|1.TA.1-231-j19-1.1.H|8503598| 09:53:00| 09:53:00| 11| 0| 0| 401|1-231-j19-1| 12.0|1.TA.1-231-j19-1.1.H|592705486850|\n", "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+-----------+----------+--------------------+------------+\n", "only showing top 3 rows" ] } ], "source": [ "username = 'acoudray'\n", "stop_times_curated.write.csv(\"/user/{}/stop_times_curated_sbbCompatible\".format(username), \\\n", " header = True, mode = 'overwrite')\n", "stop_times_curated.show(3)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('10.TA.1-11-B-j19-1.1.R__8590314',\n", " array([0, 2, 2, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])),\n", " ('10.TA.1-11-B-j19-1.1.R__8590317',\n", " array([0, 3, 2, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])),\n", " ('10.TA.1-11-B-j19-1.1.R__8594304',\n", " array([0, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])),\n", " ('10.TA.1-11-B-j19-1.1.R__8594307',\n", " array([0, 1, 5, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])),\n", " ('10.TA.1-11-B-j19-1.1.R__8594310',\n", " array([0, 1, 3, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])),\n", " ('10.TA.26-912-j19-1.2.R__8576195',\n", " array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])),\n", " ('10.TA.79-10-B-j19-1.2.R__8503051',\n", " array([ 1, 129, 143, 71, 33, 17, 11, 4, 3, 3, 1, 1, 1,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0])),\n", " ('10.TA.79-10-B-j19-1.2.R__8503052',\n", " array([ 0, 333, 40, 22, 9, 5, 3, 4, 1, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0])),\n", " ('10.TA.79-10-B-j19-1.2.R__8503053',\n", " array([ 1, 340, 37, 21, 6, 2, 6, 2, 2, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0])),\n", " ('10.TA.79-10-B-j19-1.2.R__8503054',\n", " array([ 0, 0, 177, 23, 7, 0, 1, 0, 1, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0]))]" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%local\n", "\n", "from hdfs3 import HDFileSystem\n", "import pandas as pd\n", "import numpy as np \n", "import pickle \n", "import gzip\n", "from itertools import islice\n", "\n", "hdfs = HDFileSystem(host='hdfs://iccluster044.iccluster.epfl.ch', port=8020, user='ebouille')\n", "\n", "username = 'acoudray'\n", "\n", "# Load distribution file from HDFS and concatenate individual csv\n", "distrib_files = hdfs.glob('/user/{}/distribution_1to1match_geschaetzAndReal.csv/*.csv'.format(username))\n", "distrib = pd.DataFrame()\n", "for file in distrib_files:\n", " with hdfs.open(file) as f:\n", " distrib = distrib.append(pd.read_csv(f))\n", "distrib = distrib.set_index('key')\n", "\n", "# zip index and values to get {key : np.array()} shape \n", "d = dict(zip(distrib.index, np.array(distrib.values)))\n", "\n", "# Write it to local \n", "with gzip.open(\"../data/distributions_geschaetzAndReal.pkl.gz\", \"wb\") as output_file:\n", " pickle.dump(d, output_file)\n", "\n", "# Functon to take a slice from a dictionnary - head equivalent\n", "def take(n, iterable):\n", " \"Return first n items of the iterable as a list\"\n", " return 
list(islice(iterable, n))\n", "\n", "# display a slice of it\n", "take(10, d.items())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "How many RAM does the dictionnary occupy when it is open ? Open pickle and calculate amount of memory occupied using _resource_ lib" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "length of dict : 246968\n", "Data size is: 106968218\n" ] } ], "source": [ "%local \n", "\n", "import pickle \n", "import gzip\n", "import sys\n", "import os\n", "import resource\n", "\n", "with gzip.open(\"../data/distributions.pickle\", \"rb\") as input_file:\n", " d = pickle.load(input_file)\n", " \n", "\n", "d['1290.TA.26-32-j19-1.12.H__8591151']\n", "print('length of dict : ',len(d))\n", "\n", "def getsizeof_r(obj):\n", " total = 0\n", " if isinstance(obj, list):\n", " for i in obj:\n", " total += getsizeof_r(i)\n", " elif isinstance(obj, dict):\n", " for k, v in obj.items():\n", " total += getsizeof_r(k) + getsizeof_r(v)\n", " else:\n", " total += sys.getsizeof(obj)\n", " return total\n", "\n", "print('Data size is: {}'.format(getsizeof_r(d)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "How many time does it take to access elements in the dictionnary ?" 
] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[ 0 1008 405 207 95 39 25 11 5 3 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0]\n", "running time to get value from key when exists : 0.0004305839538574219\n", "\n", "KEY ERROR: .26-32-j19-1.12.H__8591151 not found un distribution dictionnary\n", "running time to get error when key does NOT exists : 0.00010466575622558594\n", "\n" ] } ], "source": [ "%local\n", "\n", "import pickle \n", "import gzip\n", "import time\n", "\n", "def get_distribution(key, dico):\n", " if key in dico:\n", " print(dico[key])\n", " else:\n", " print(\"KEY ERROR: {} not found un distribution dictionnary\".format(key))\n", " \n", "with gzip.open(\"../data/distributions.pickle\", \"rb\") as input_file:\n", " d = pickle.load(input_file)\n", " \n", "this_key = '1290.TA.26-32-j19-1.12.H__8591151'\n", "\n", "start = time.time()\n", "get_distribution(this_key, d)\n", "end = time.time()\n", "print(\"running time to get value from key when exists : {}\\n\".format(end - start))\n", "\n", "start = time.time()\n", "get_distribution(this_key.replace('1290.TA',''), d)\n", "end = time.time()\n", "print(\"running time to get error when key does NOT exists : {}\\n\".format(end - start))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "when key exists we access it in $5\\cdot10^{-4}$ seconds and when it does not exists error message is displayed in $1\\cdot10^{-4}$ seconds. Should be more than enough to be called multiple time when using raptor." ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Make function to compute probabilities from distributions\n", - "\n", - "We make a script that takes a key, a arrival time and a departure time to compute a probability to be at least 2 minutes ahead for transfert. 
We assume that with less than 2 minutes, we miss the transfert.\n", - "\n", - "We will use a Poisson distribution to compute this probability using its cumulative distribution.\n", - "\n", - "A Poisson Process meets the following criteria (in reality many phenomena modeled as Poisson processes don’t meet these exactly):\n", - "- Events are independent of each other. The occurrence of one event does not affect the probability another event will occur.\n", - "- The average rate (events per time period) is constant.\n", - "- Two events cannot occur at the same time.\n", - "\n", - "Bounds for the tail probabilities of a Poisson random variable ${\\displaystyle X\\sim \\operatorname {Pois} (\\lambda )}$ can be derived using a Chernoff bound argument: \n", - "\n", - "$${\\displaystyle P(X\\leq x)\\leq {\\frac {(e\\lambda )^{x}e^{-\\lambda }}{x^{x}}},{\\text{ for }}x<\\lambda .}$$\n", - "\n", - "So in our case all we need to find is $\\lambda$. The positive real number λ is equal to the expected value of X and also to its variance :\n", - "\n", - "$${\\lambda =\\operatorname {E} (X)=\\operatorname {Var} (X)}$$\n", - "\n", - "We can easily find $\\lambda$ by finding the _average number of success per unit time_. We have a distribution going from -1 to +30, therefore we will iterate over it and sum up all successes $x_t$ at each time point $t$, for all our time points. 
\n", - "\n", - "$$ {\\lambda = \\frac{1}{N} \\displaystyle\\sum_{t=-1}^{N=30} x_t \\cdot t}$$" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "lambda (expectation given distribution): 4.0 \n", - "\n", - "Probability of success for transfer time = 5.0 minutes : 0.7851303870304052\n" - ] - }, - { - "data": { - "text/plain": [ - "0.7851303870304052" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%local\n", - "\n", - "import pickle \n", - "import gzip\n", - "import time\n", - "import math \n", - "import datetime\n", - "import time\n", - "from scipy.stats import poisson\n", - "\n", - "with gzip.open(\"../data/distributions.pickle\", \"rb\") as input_file:\n", - " d = pickle.load(input_file)\n", - " \n", - "def get_distrib(key, dico):\n", - " if key in dico:\n", - " return dico[key]\n", - " else:\n", - " raise ValueError(\"KEY ERROR: {} not found un distribution dictionnary\".format(key))\n", - " \n", - "def evaluate_lamda(distrib):\n", - " # First calculate total number of measures N\n", - " N = 0\n", - " for x in distrib:\n", - " N += x\n", - "\n", - " lambda_p = 0 # expectation - we want to calculate it\n", - " t = -1 # time = index - 1\n", - "\n", - " for x in distrib:\n", - " lambda_p += t*x\n", - " t += 1\n", - "\n", - " # calculate lambda which is the expectation of x\n", - " if N > 0:\n", - " lambda_p /= N \n", - " print('lambda (expectation given distribution): ',lambda_p, '\\n')\n", - " return lambda_p\n", - " else : \n", - " raise ValueError(\"ERROR : {} distribution has 0 counts\".format(key))\n", - " #print('Returning 1 to avoid later problem... 
\\n')\n", - " return 1\n", - "\n", - "def process_time(str_time):\n", - " x = time.strptime(str_time,'%H:%M')\n", - " return datetime.timedelta(hours=x.tm_hour,minutes=x.tm_min,seconds=x.tm_sec).total_seconds()\n", - "\n", - "def get_transfer_time(arr_time, dep_time, delta=2.0):\n", - " diff_time_min = ( process_time(dep_time) - process_time(arr_time) ) / 60\n", - " return diff_time_min - delta\n", - "\n", - "def poisson_proba(trip_id, stop_id, arr_time, dep_time, dico):\n", - " # Generate key from trip_id / stop_id \n", - " key = str(trip_id) + '__' + str(stop_id[0:7]) # 7 first char to be sbb-compatible\n", - "\n", - " # Get distribution from dictionnary\n", - " distrib = get_distrib(key, dico)\n", - " \n", - " # Calculate transfer time at disposal \n", - " T = get_transfer_time(arr_time, dep_time)\n", - " \n", - " # Get lambda value to calculate proba\n", - " lambda_p = evaluate_lamda(distrib)\n", - "\n", - " # Get proba\n", - " poisson_p = poisson.cdf(T, lambda_p)\n", - " print('Probability of success for transfer time = {} minutes : '.format(T),poisson_p)\n", - "\n", - " return poisson_p\n", - "\n", - "# 129.TA.90-173-Y-j19-1.1.H__8530643\n", - "# input data :\n", - "trip_id = '129.TA.90-173-Y-j19-1.1.H'\n", - "stop_id = '8530643'\n", - "arrival_time = '07:45'\n", - "departure_time = '07:52'\n", - "Pr = poisson_proba(trip_id, stop_id, arrival_time, departure_time, d)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# trip_id missing \n", - "\n", - "stop_id , time , transport_type -> estimate proba \n", - "\n", - "# Make recovery tables\n", - "1500 x 24 x 5 = 180'000\n", - "\n", - "# Validate recovery table \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { "display_name": "PySpark", "language": "", "name": "pysparkkernel" }, "language_info": { "codemirror_mode": { "name": "python", "version": 3 
}, "mimetype": "text/x-python", "name": "pyspark", "pygments_lexer": "python3" } }, "nbformat": 4, "nbformat_minor": 4 } diff --git a/notebooks/match_datasets.ipynb b/notebooks/hdfs_match_datasets.ipynb similarity index 100% rename from notebooks/match_datasets.ipynb rename to notebooks/hdfs_match_datasets.ipynb diff --git a/notebooks/proba_functions.ipynb b/notebooks/proba_functions.ipynb index 70e7bc6..0b1962b 100644 --- a/notebooks/proba_functions.ipynb +++ b/notebooks/proba_functions.ipynb @@ -1,2682 +1,2668 @@ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ - "## Compute probability of missing a transfer from delays distributions\n", + "## Compute probability of transfer success from delays distributions\n", "\n", - "Let's first have a look at a slice of the dictionnary of distribution" + "To be able to compute the probability of success of a given transfer, we use the arrival delay distribution compared with the next trip departure. To be able to do that, we need delay distributions for each trip arrival to a given station. Whenever we have a clear match, we can use a __cumulative distribution function__ to compute $P(T \\leq t)$ :\n", + "\n", + "$${\\displaystyle F_{T}(t)=\\operatorname {P} (T\\leq t)=\\sum _{t_{i}\\leq t}\\operatorname {P} (T=t_{i})=\\sum _{t_{i}\\leq t}p(t_{i}).}$$\n", + "\n", + "The strategy was to rely entirely on past data we have to compute $p(t_i)$, without the need of building a model which would imply making additional assumptions. 
If we have enough data for a given transfer with known trip_id x stop_id, we use the above-mentioned formula to compute each $p(t_i)$ by simply using :\n", + "\n", + "$$p(t_i) = \\frac{x_i}{\\sum x_i}$$\n", + "\n", + "with $x_i$ being the number of delays at time $t_i$ from SBB dataset.\n", + "\n", + "### Recover missing data \n", + "\n", + "As we are using SBB data to compute delays from timetable trip_id, we may encounter problems with the translation between the two datasets (certain trip_id/stop_id pairs have no correspondence between the two datasets!). We may also encounter faulty data. To recover missing or faulty data, the strategy is the following :\n", + "\n", + "1. If we have more than 100 data points in `real` group, we rely exclusively on its delay distribution to compute probabilities for a given transfer on a `trip_id x stop_id`.\n", + "\n", + "_Note : `real` group corresponds to arrival time with status `geschaetz` or `real`, meaning it comes from actual measurements._\n", + "\n", + "2. If we do not find enough data within `real` group, we use delay distributions in `all` group (contains all delays including `prognose` status), if there are more than 100 data points for a given `trip_id x stop_id`.\n", + "\n", + "3. If `all` group still does not have more than 100 data points, we rely on `recovery tables` to estimate delay distributions. The strategy is the following :\n", + " - As we will always know the `stop_id`, the `time` and the `transport_type`, we rely on arrival delays from aggregated values of similar transfers. \n", + " - First, we compute a table of distribution with all possible combinations of `stop_id`, `time` (round to hours) and `transport_type`, and aggregate all the counts we have to compute cumulative distribution probabilities. 
\n", + " - Is there is less than 100 data points in one of these intersections, we use the last possibilities : a table with `transport_type` x `time` aggregate counts.\n", + " - The last values with no match are given the overall average of cumulative distribution probabilities for each `transport_type` with no limit for the minimum number of data points.\n", + "\n", + "Following this approach, we can find cumulative distribution probabilities for every combination of `trip_id x stop_id` as defined in `stop_times_df`. We will make a table with the same row order so that McRaptor can easily find their indexes. " ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 91, "metadata": {}, "outputs": [], "source": [ "import pickle \n", "import gzip\n", "from itertools import islice\n", "import matplotlib as mlt \n", "import matplotlib.pyplot as plt\n", "import numpy as np \n", "import pandas as pd \n", "import math" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Functon to take a slice from a dictionnary - head equivalent\n", "def take(n, iterable):\n", " \"Return first n items of the iterable as a list\"\n", " return list(islice(iterable, n))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Load dictionnaries of distributions" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "len dict_real : 12309\n", "[('10.TA.1-11-B-j19-1.1.R__8590314', array([0, 2, 2, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])), ('10.TA.1-11-B-j19-1.1.R__8590317', array([0, 3, 2, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])), ('10.TA.1-11-B-j19-1.1.R__8594304', array([0, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])), ('10.TA.1-11-B-j19-1.1.R__8594307', array([0, 1, 5, 3, 1, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])), ('10.TA.1-11-B-j19-1.1.R__8594310', array([0, 1, 3, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))]\n", "len dict_all : 246968\n", "[('1286.TA.26-32-j19-1.12.H__8591182', array([ 0, 1158, 306, 162, 94, 24, 28, 21, 3, 2, 0,\n", " 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])), ('1286.TA.26-32-j19-1.12.H__8591184', array([ 1, 762, 552, 292, 118, 48, 13, 8, 0, 1, 1, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0])), ('1286.TA.26-32-j19-1.12.H__8591195', array([ 0, 1083, 444, 143, 64, 35, 16, 9, 3, 1, 0,\n", " 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])), ('1286.TA.26-32-j19-1.12.H__8591200', array([ 2, 239, 227, 228, 212, 128, 74, 42, 29, 17, 3, 3, 2,\n", " 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 1])), ('1286.TA.26-32-j19-1.12.H__8591209', array([ 0, 1151, 308, 169, 94, 24, 29, 16, 4, 3, 1,\n", " 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))]\n" ] } ], "source": [ "with gzip.open(\"../data/distributions_geschaetzAndReal.pkl.gz\", \"rb\") as input_file:\n", " d_real = pickle.load(input_file)\n", "\n", "with gzip.open(\"../data/distributions.pickle\", \"rb\") as input_file:\n", " d_all = pickle.load(input_file)\n", "\n", "# display a slice of it\n", "print('len dict_real : ', len(d_real))\n", "print(take(5, d_real.items()))\n", "\n", "# display a slice of it\n", "print('len dict_all : ', len(d_all))\n", "print(take(5, d_all.items()))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Probability using cumulative distribution based on frequency of delays \n", "\n", "When we have __enough data__ and no ambiguity about `trip_id` and `stop_id` for a given distribution, then we can compute the probability $P(x \\leq X)$ for every x (delay in minute). 
\n", "\n", "Let's take a __threshold of 100__ sample points (=number of time we could measure a delay) as a minimum number of points to use this approach. \n", "\n", "_How many keys in our distionnary of distribution have at least this number of samples ?_" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def plot_data_points_hist(dico):\n", " list_tot_points = []\n", " for key in dico:\n", " distrib = dico[key]\n", " list_tot_points.append(np.sum(distrib))\n", "\n", " tot_per_key = np.array(list_tot_points)\n", " binwidth = 100\n", " n_keys_less_than_binwidth = np.sum(np.array(tot_per_key < binwidth))\n", " perc_key_to_recover = round(100 * ( n_keys_less_than_binwidth / len(tot_per_key) ), 2)\n", " plt.figure(figsize = (10,5))\n", " plt.hist(tot_per_key, bins = range(min(tot_per_key), max(tot_per_key) + binwidth, binwidth))\n", " plt.title(\"Total number of data points per trip_id / stop_id key. N keys with less than {0} points: {1} ({2}%)\"\\\n", " .format(binwidth, n_keys_less_than_binwidth, perc_key_to_recover))\n", " plt.xlabel('n data points')\n", " plt.ylabel('n keys')\n", " return plt.show()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "plot_data_points_hist(d_all)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "plot_data_points_hist(d_real)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "First we generate a dictionnary with cumulative probability based on frequency of delays, for each keys in our reference dictionnary." ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "def cumul_distri_probas_dict(dico):\n", " list_tot_points = []\n", " for key in dico:\n", " distrib = dico[key]\n", "\n", " # get total number of elements \n", " N = np.sum(distrib)\n", "\n", " # make cumulative distribution probabilities\n", " cdf_distrib = np.empty((len(distrib)), dtype=float)\n", " save_x = 0\n", " for x in range(len(distrib)):\n", " cdf_distrib[x] = float(distrib[x])/float(N) + float(save_x)/float(N)\n", " save_x += distrib[x]\n", "\n", " dico[key] = cdf_distrib\n", " return dico" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('1286.TA.26-32-j19-1.12.H__8591182',\n", " array([0. , 0.64333333, 0.81333333, 0.90333333, 0.95555556,\n", " 0.96888889, 0.98444444, 0.99611111, 0.99777778, 0.99888889,\n", " 0.99888889, 0.99944444, 0.99944444, 0.99944444, 0.99944444,\n", " 0.99944444, 1. , 1. , 1. , 1. ,\n", " 1. , 1. , 1. , 1. , 1. ,\n", " 1. , 1. , 1. , 1. , 1. ,\n", " 1. , 1. 
])),\n", " ('1286.TA.26-32-j19-1.12.H__8591184',\n", " array([5.56483027e-04, 4.24596550e-01, 7.31775181e-01, 8.94268225e-01,\n", " 9.59933222e-01, 9.86644407e-01, 9.93878687e-01, 9.98330551e-01,\n", " 9.98330551e-01, 9.98887034e-01, 9.99443517e-01, 9.99443517e-01,\n", " 9.99443517e-01, 9.99443517e-01, 9.99443517e-01, 9.99443517e-01,\n", " 9.99443517e-01, 9.99443517e-01, 9.99443517e-01, 9.99443517e-01,\n", " 9.99443517e-01, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,\n", " 1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,\n", " 1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00])),\n", " ('1286.TA.26-32-j19-1.12.H__8591195',\n", " array([0. , 0.60166667, 0.84833333, 0.92777778, 0.96333333,\n", " 0.98277778, 0.99166667, 0.99666667, 0.99833333, 0.99888889,\n", " 0.99888889, 0.99888889, 0.99888889, 0.99888889, 0.99888889,\n", " 0.99944444, 0.99944444, 0.99944444, 0.99944444, 0.99944444,\n", " 1. , 1. , 1. , 1. , 1. ,\n", " 1. , 1. , 1. , 1. , 1. ,\n", " 1. , 1. ]))]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d_all_cdp = cumul_distri_probas_dict(d_all)\n", "take(3, d_all_cdp.items())" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('10.TA.1-11-B-j19-1.1.R__8590314',\n", " array([0. , 0.25 , 0.5 , 0.625, 1. , 1. , 1. , 1. , 1. ,\n", " 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. ,\n", " 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. ,\n", " 1. , 1. , 1. , 1. , 1. ])),\n", " ('10.TA.1-11-B-j19-1.1.R__8590317',\n", " array([0. , 0.3, 0.5, 0.7, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. ,\n", " 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. ,\n", " 1. , 1. , 1. , 1. , 1. , 1. ])),\n", " ('10.TA.1-11-B-j19-1.1.R__8594304',\n", " array([0. , 0. , 0.5, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. ,\n", " 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. ,\n", " 1. , 1. , 1. , 1. , 1. , 1. 
]))]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d_real_cdp = cumul_distri_probas_dict(d_real)\n", "take(3, d_real_cdp.items())" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# write dictionnary \n", "with gzip.open(\"../data/distributions_cumulative_real.pkl.gz\", \"wb\") as output_file:\n", " pickle.dump(d_real_cdp, output_file)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'd' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# write dictionnary\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mgzip\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"../data/distributions_cumulative.pickle\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"wb\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0moutput_file\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdump\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0md\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutput_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mNameError\u001b[0m: name 'd' is not defined" ] } ], "source": [ "# write dictionnary \n", "with gzip.open(\"../data/distributions_cumulative.pickle\", \"wb\") as output_file:\n", " pickle.dump(d, output_file)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Construct recovery tables \n", "\n", "First 
approach is to simple sum up similar distribution to get a new distribution we can use. For that, we need to have transport type (`route_desc`), `time` (rounded to hour) and `stop_id` which are valid. We then make all combination of these tree parameters and get the associate distributions" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
route_inttrip_intstop_intstop_sequencearrival_timedeparture_timeroute_idtrip_idstop_idroute_descstop_id_rawsequence_shift_1
00001NaT2020-05-21 07:18:0030-57-Y-j19-14.TA.30-57-Y-j19-1.1.H8502208Bus85022082
100122020-05-21 07:23:002020-05-21 07:23:0030-57-Y-j19-14.TA.30-57-Y-j19-1.1.H8502209Bus85022093
200232020-05-21 07:29:00NaT30-57-Y-j19-14.TA.30-57-Y-j19-1.1.H8503202Bus85032021
30101NaT2020-05-21 07:48:0030-57-Y-j19-15.TA.30-57-Y-j19-1.1.H8502208Bus85022082
401122020-05-21 07:53:002020-05-21 07:53:0030-57-Y-j19-15.TA.30-57-Y-j19-1.1.H8502209Bus85022093
\n", "
" ], "text/plain": [ " route_int trip_int stop_int stop_sequence arrival_time \\\n", "0 0 0 0 1 NaT \n", "1 0 0 1 2 2020-05-21 07:23:00 \n", "2 0 0 2 3 2020-05-21 07:29:00 \n", "3 0 1 0 1 NaT \n", "4 0 1 1 2 2020-05-21 07:53:00 \n", "\n", " departure_time route_id trip_id stop_id \\\n", "0 2020-05-21 07:18:00 30-57-Y-j19-1 4.TA.30-57-Y-j19-1.1.H 8502208 \n", "1 2020-05-21 07:23:00 30-57-Y-j19-1 4.TA.30-57-Y-j19-1.1.H 8502209 \n", "2 NaT 30-57-Y-j19-1 4.TA.30-57-Y-j19-1.1.H 8503202 \n", "3 2020-05-21 07:48:00 30-57-Y-j19-1 5.TA.30-57-Y-j19-1.1.H 8502208 \n", "4 2020-05-21 07:53:00 30-57-Y-j19-1 5.TA.30-57-Y-j19-1.1.H 8502209 \n", "\n", " route_desc stop_id_raw sequence_shift_1 \n", "0 Bus 8502208 2 \n", "1 Bus 8502209 3 \n", "2 Bus 8503202 1 \n", "3 Bus 8502208 2 \n", "4 Bus 8502209 3 " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "with open(\"../data/stop_times_df.pkl\", \"rb\") as input_file:\n", " stoptimes = pickle.load(input_file)\n", " \n", "stoptimes.head()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.0%, 4.07%, 8.14%, 12.21%, 16.28%, 20.35%, 24.42%, 28.49%, 32.55%, 36.62%, 40.69%, 44.76%, 48.83%, 52.9%, 56.97%, 61.04%, 65.11%, 69.18%, 73.25%, 77.32%, 81.39%, 85.46%, 89.53%, 93.6%, 97.66%, " ] } ], "source": [ "# Set same stoptimes index as distribution dict \n", "stoptimes['stop_id'] = stoptimes['stop_id'].astype(str).str[0:7]\n", "stoptimes['key'] = stoptimes['trip_id'] + '__' + stoptimes['stop_id']\n", "stoptimes = stoptimes.set_index('key')\n", "\n", "stoptimes = stoptimes[['trip_id','stop_id', 'route_desc', 'arrival_time', 'departure_time']]\n", "\n", "list_hours = []\n", "size_stop_times = stoptimes.shape[0]\n", "for x in range(size_stop_times):\n", " if (x % 10000) == 0 :\n", " print('{}%'.format(round(100*x/size_stop_times,2)), end = ', ')\n", " \n", " arr_time_hour = 
pd.to_datetime(stoptimes.iloc[x,:]['arrival_time']).hour\n", " if math.isnan(arr_time_hour): # if arrival is NaT, use departure time\n", " arr_time_hour = pd.to_datetime(stoptimes.iloc[x,:]['departure_time']).hour\n", " list_hours.append(int(arr_time_hour))\n", " \n", "stoptimes['hour'] = list_hours\n", "stoptimes = stoptimes.drop(columns=['trip_id', 'arrival_time', 'departure_time'])\n", "\n", "# Write this pickle to avoid re-running this above code all the time\n", "with gzip.open(\"../data/stop_times_wHour.pkl\", \"wb\") as output_file:\n", " pickle.dump(stoptimes, output_file) \n", " " ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(17321, 32)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...22232425262728293031
stop_idhourroute_desc
85009268.0Bus0233444444...4444444444
9.0Bus0122222222...2222222223
10.0Bus0011111222...2222222222
11.0Bus0011122222...2222222222
\n", "

4 rows × 32 columns

\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 9 ... 22 23 \\\n", "stop_id hour route_desc ... \n", "8500926 8.0 Bus 0 2 3 3 4 4 4 4 4 4 ... 4 4 \n", " 9.0 Bus 0 1 2 2 2 2 2 2 2 2 ... 2 2 \n", " 10.0 Bus 0 0 1 1 1 1 1 2 2 2 ... 2 2 \n", " 11.0 Bus 0 0 1 1 1 2 2 2 2 2 ... 2 2 \n", "\n", " 24 25 26 27 28 29 30 31 \n", "stop_id hour route_desc \n", "8500926 8.0 Bus 4 4 4 4 4 4 4 4 \n", " 9.0 Bus 2 2 2 2 2 2 2 3 \n", " 10.0 Bus 2 2 2 2 2 2 2 2 \n", " 11.0 Bus 2 2 2 2 2 2 2 2 \n", "\n", "[4 rows x 32 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "with gzip.open(\"../data/stop_times_wHour.pkl\", \"rb\") as input_file:\n", " stoptimes = pickle.load(input_file)\n", " \n", "distrib_df = pd.DataFrame(d_all).transpose()\n", "distrib_to_rm = np.array(distrib_df.iloc[:,range(11)].sum(axis=1) == 11) # missing trips\n", "distrib_df = distrib_df.iloc[~distrib_to_rm,:]\n", "\n", "stoptimes_df = pd.DataFrame(stoptimes)\n", "\n", "recovery_df = distrib_df.join(stoptimes_df)\n", "list_bins = [x for x in range(32)]\n", "\n", "recovery_df = recovery_df.groupby(['stop_id','hour', 'route_desc'])[list_bins].apply(lambda x : x.astype(float).sum())\n", "recovery_df = recovery_df.astype('int')\n", "print(recovery_df.shape)\n", "recovery_df.head(4)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "def plot_df_missing(df, max_bin = 10000):\n", " tot_per_key = np.array(df.sum(axis=1)).astype('int')\n", " binwidth = 100\n", " n_keys_less_than_binwidth = np.sum(np.array(tot_per_key < binwidth))\n", " perc_key_to_recover = round(100 * ( n_keys_less_than_binwidth / len(tot_per_key) ), 2)\n", " plt.figure(figsize = (10,5))\n", " plt.hist(tot_per_key, bins = range(min(tot_per_key), max_bin + binwidth, binwidth))\n", " plt.title(\"Total number of data points per stop_id / hour key. 
N keys with less than {0} points: {1} ({2}%)\"\\\n", " .format(binwidth, n_keys_less_than_binwidth, perc_key_to_recover))\n", " plt.xlabel('n data points')\n", " plt.ylabel('n keys')\n", " return plt.show()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "plot_df_missing(recovery_df)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...22232425262728293031
stop_idhourroute_desc
85009268.0Bus0233444444...4444444444
9.0Bus0122222222...2222222223
10.0Bus0011111222...2222222222
11.0Bus0011122222...2222222222
12.0Bus0011122222...2222222222
13.0Bus0011122222...2222222222
14.0Bus0011222222...2222222222
15.0Bus0011222222...2222222222
16.0Bus0133333444...4444444444
17.0Bus0133333333...4444444444
18.0Bus0233333333...4444444444
19.0Bus0222233333...3333333333
85021868.0S-Bahn0367777777...8888888888
9.0S-Bahn0477777777...8888888888
10.0S-Bahn0477777777...7777788888
11.0S-Bahn0477777777...7777777778
12.0S-Bahn0377777777...7777777778
13.0S-Bahn0377777777...8888888888
14.0S-Bahn0377777777...8888888888
15.0S-Bahn0377777777...7777777788
\n", "

20 rows × 32 columns

\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 9 ... 22 23 \\\n", "stop_id hour route_desc ... \n", "8500926 8.0 Bus 0 2 3 3 4 4 4 4 4 4 ... 4 4 \n", " 9.0 Bus 0 1 2 2 2 2 2 2 2 2 ... 2 2 \n", " 10.0 Bus 0 0 1 1 1 1 1 2 2 2 ... 2 2 \n", " 11.0 Bus 0 0 1 1 1 2 2 2 2 2 ... 2 2 \n", " 12.0 Bus 0 0 1 1 1 2 2 2 2 2 ... 2 2 \n", " 13.0 Bus 0 0 1 1 1 2 2 2 2 2 ... 2 2 \n", " 14.0 Bus 0 0 1 1 2 2 2 2 2 2 ... 2 2 \n", " 15.0 Bus 0 0 1 1 2 2 2 2 2 2 ... 2 2 \n", " 16.0 Bus 0 1 3 3 3 3 3 4 4 4 ... 4 4 \n", " 17.0 Bus 0 1 3 3 3 3 3 3 3 3 ... 4 4 \n", " 18.0 Bus 0 2 3 3 3 3 3 3 3 3 ... 4 4 \n", " 19.0 Bus 0 2 2 2 2 3 3 3 3 3 ... 3 3 \n", "8502186 8.0 S-Bahn 0 3 6 7 7 7 7 7 7 7 ... 8 8 \n", " 9.0 S-Bahn 0 4 7 7 7 7 7 7 7 7 ... 8 8 \n", " 10.0 S-Bahn 0 4 7 7 7 7 7 7 7 7 ... 7 7 \n", " 11.0 S-Bahn 0 4 7 7 7 7 7 7 7 7 ... 7 7 \n", " 12.0 S-Bahn 0 3 7 7 7 7 7 7 7 7 ... 7 7 \n", " 13.0 S-Bahn 0 3 7 7 7 7 7 7 7 7 ... 8 8 \n", " 14.0 S-Bahn 0 3 7 7 7 7 7 7 7 7 ... 8 8 \n", " 15.0 S-Bahn 0 3 7 7 7 7 7 7 7 7 ... 7 7 \n", "\n", " 24 25 26 27 28 29 30 31 \n", "stop_id hour route_desc \n", "8500926 8.0 Bus 4 4 4 4 4 4 4 4 \n", " 9.0 Bus 2 2 2 2 2 2 2 3 \n", " 10.0 Bus 2 2 2 2 2 2 2 2 \n", " 11.0 Bus 2 2 2 2 2 2 2 2 \n", " 12.0 Bus 2 2 2 2 2 2 2 2 \n", " 13.0 Bus 2 2 2 2 2 2 2 2 \n", " 14.0 Bus 2 2 2 2 2 2 2 2 \n", " 15.0 Bus 2 2 2 2 2 2 2 2 \n", " 16.0 Bus 4 4 4 4 4 4 4 4 \n", " 17.0 Bus 4 4 4 4 4 4 4 4 \n", " 18.0 Bus 4 4 4 4 4 4 4 4 \n", " 19.0 Bus 3 3 3 3 3 3 3 3 \n", "8502186 8.0 S-Bahn 8 8 8 8 8 8 8 8 \n", " 9.0 S-Bahn 8 8 8 8 8 8 8 8 \n", " 10.0 S-Bahn 7 7 7 8 8 8 8 8 \n", " 11.0 S-Bahn 7 7 7 7 7 7 7 8 \n", " 12.0 S-Bahn 7 7 7 7 7 7 7 8 \n", " 13.0 S-Bahn 8 8 8 8 8 8 8 8 \n", " 14.0 S-Bahn 8 8 8 8 8 8 8 8 \n", " 15.0 S-Bahn 7 7 7 7 7 7 8 8 \n", "\n", "[20 rows x 32 columns]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "recovery_df.head(20)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Make second recovery table\n", "\n", 
"Here only taking combination of `transport_type x hour`" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(127, 32)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...22232425262728293031
hourroute_desc
7.0Bus057991010101111...11121212121212121212
InterRegio0000011111...2222222222
Intercity0000011111...2222222222
S-Bahn0356788899...99999910101010
\n", "

4 rows × 32 columns

\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 9 ... 22 23 24 25 \\\n", "hour route_desc ... \n", "7.0 Bus 0 5 7 9 9 10 10 10 11 11 ... 11 12 12 12 \n", " InterRegio 0 0 0 0 0 1 1 1 1 1 ... 2 2 2 2 \n", " Intercity 0 0 0 0 0 1 1 1 1 1 ... 2 2 2 2 \n", " S-Bahn 0 3 5 6 7 8 8 8 9 9 ... 9 9 9 9 \n", "\n", " 26 27 28 29 30 31 \n", "hour route_desc \n", "7.0 Bus 12 12 12 12 12 12 \n", " InterRegio 2 2 2 2 2 2 \n", " Intercity 2 2 2 2 2 2 \n", " S-Bahn 9 9 10 10 10 10 \n", "\n", "[4 rows x 32 columns]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "with gzip.open(\"../data/stop_times_wHour.pkl\", \"rb\") as input_file:\n", " stoptimes = pickle.load(input_file)\n", " \n", "distrib_df = pd.DataFrame(d_all).transpose()\n", "distrib_to_rm = np.array(distrib_df.iloc[:,range(11)].sum(axis=1) == 11) # missing trips\n", "distrib_df = distrib_df.iloc[~distrib_to_rm,:]\n", "\n", "stoptimes_df = pd.DataFrame(stoptimes)\n", "\n", "recovery_df2 = distrib_df.join(stoptimes_df)\n", "list_bins = [x for x in range(32)]\n", "\n", "recovery_df2 = recovery_df2.groupby(['hour', 'route_desc'])[list_bins].apply(lambda x : x.astype(float).sum())\n", "recovery_df2 = recovery_df2.astype('int')\n", "print(recovery_df2.shape)\n", "recovery_df2.head(4)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Last recovery table \n", "\n", "Takes only transport type distribution" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(11, 32)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...22232425262728293031
route_desc
Bus15725921197405115766124269128687131397133346134908136278...137998138003138007138012138014138016138018138021138023138087
Eurocity0001111222...3333333333
InterRegio3374107141174207240273306339...371371371372372372372372372372
Intercity9192939495969798999...109109109109109109109109109109
\n", "

4 rows × 32 columns

\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 \\\n", "route_desc \n", "Bus 1572 59211 97405 115766 124269 128687 131397 133346 \n", "Eurocity 0 0 0 1 1 1 1 2 \n", "InterRegio 33 74 107 141 174 207 240 273 \n", "Intercity 9 19 29 39 49 59 69 79 \n", "\n", " 8 9 ... 22 23 24 25 26 \\\n", "route_desc ... \n", "Bus 134908 136278 ... 137998 138003 138007 138012 138014 \n", "Eurocity 2 2 ... 3 3 3 3 3 \n", "InterRegio 306 339 ... 371 371 371 372 372 \n", "Intercity 89 99 ... 109 109 109 109 109 \n", "\n", " 27 28 29 30 31 \n", "route_desc \n", "Bus 138016 138018 138021 138023 138087 \n", "Eurocity 3 3 3 3 3 \n", "InterRegio 372 372 372 372 372 \n", "Intercity 109 109 109 109 109 \n", "\n", "[4 rows x 32 columns]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "with gzip.open(\"../data/stop_times_wHour.pkl\", \"rb\") as input_file:\n", " stoptimes = pickle.load(input_file)\n", " \n", "distrib_df = pd.DataFrame(d_all).transpose()\n", "distrib_to_rm = np.array(distrib_df.iloc[:,range(11)].sum(axis=1) == 11) # missing trips\n", "distrib_df = distrib_df.iloc[~distrib_to_rm,:]\n", "\n", "stoptimes_df = pd.DataFrame(stoptimes)\n", "\n", "recovery_df3 = distrib_df.join(stoptimes_df)\n", "list_bins = [x for x in range(32)]\n", "\n", "recovery_df3 = recovery_df3.groupby(['route_desc'])[list_bins].apply(lambda x : x.astype(float).sum())\n", "recovery_df3 = recovery_df3.astype('int')\n", "print(recovery_df3.shape)\n", "recovery_df3.head(4)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reconstruct cumulative distribution probabilities from multiple distributions to recover data with few/missing points \n", "\n", - "To recover missing or faulty data, the strategy is the following :\n", - "1. 
If we have more than 100 data points in `real` group, we rely exclusively on it to compute probabilities for a given transfer on a `trip_id x stop_id` \n", - " - `real` group : the delay was calculated with actual arrival time with status `geschaetz` or `real`, meaning it comes from actual measurments.\n", - "2. If we do not find enough data within `real` group, we look at distributions in `all` group (contains all delays including `prognose` status) to compute probabilities, if there is more than 100 data points for a given `trip_id x stop_id`.\n", - "3. If `all` group still does not have more than 100 data points, we rely on `recovery tables` to estimate delay distributions. The strategy is the following :\n", - " - As we will always know the `stop_id`, the `time` and the `transport_type`, we rely on arrival delays from aggregated values of similar transfer. \n", - " - First, we compute a table of distribution with all possible combination of `stop_id`, `time` (round to hours) and `transport_type`, and aggregate all the counts we have to compute cumulative distribution probabilities. \n", - " - Is there is less than 100 data points in one of these intersections, we use the last possibilities : a table with `transport_type` x `time` aggregate counts.\n", - " - The last values with no match are given the overall average of cumulative distribution probabilities for each `transport_type` with no limit for the minimum number of data points.\n", - "\n", - "Following this approach, we can find cumulative distribution probabilities for every combination of `trip_id x stop_id` as defined in `stop_times_df`. We will make a table with the same row order so that McRaptor can easily find their indexes. 
\n", + "At this point, we have 2 dictionaries of distributions and 3 recovery dataframes :\n", "\n", - "In order to do that, we have two dictionnaries of distributions and two recovery dataframes :\n", - " - `df_real` : contains delay distribution for each keys in form `trip_id + __ + stop_id` calculated from delays with status `geschaetz` or `real` in sbb datasets.\n", - " - `df_all` : contains delay distributions for each keys in form `trip_id + __ + stop_id`. No filter was applied on status (contains `geschaetz`, `real` __and__ `prognose` = evaluated delay).\n", + " - `d_real` : contains the delay distribution for each key of the form `trip_id + __ + stop_id`, calculated from delays with status `geschaetz` or `real` in the SBB datasets.\n", + " - `d_all` : contains delay distributions for each key of the form `trip_id + __ + stop_id`. No filter was applied on status (contains `geschaetz`, `real` __and__ `prognose` = evaluated delay).\n", " - `recovery_df` : contains aggregated delay distributions for each combination of `stop_id`, `route_desc` (transport type) and `hour` (time rounded to hour). \n", " - `recovery_df2` : contains aggregated delay distributions for each combination of `route_desc` (transport type) and `hour` (time rounded to hour). \n", - " - `recovery_df3` : contains aggregated delay distributions for `route_desc` (transport type) " + " - `recovery_df3` : contains aggregated delay distributions for `route_desc` (transport type) \n", + " \n", + "We will now use these to reconstruct the final table with $P(T\\leq t_i)$ for each time point between -1 and +30, using a cumulative probability function as mentioned above." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
route_inttrip_intstop_intstop_sequencearrival_timedeparture_timeroute_idtrip_idstop_idroute_descstop_id_rawsequence_shift_1
00001NaT2020-05-21 07:18:0030-57-Y-j19-14.TA.30-57-Y-j19-1.1.H8502208Bus85022082
100122020-05-21 07:23:002020-05-21 07:23:0030-57-Y-j19-14.TA.30-57-Y-j19-1.1.H8502209Bus85022093
200232020-05-21 07:29:00NaT30-57-Y-j19-14.TA.30-57-Y-j19-1.1.H8503202Bus85032021
\n", "
" ], "text/plain": [ " route_int trip_int stop_int stop_sequence arrival_time \\\n", "0 0 0 0 1 NaT \n", "1 0 0 1 2 2020-05-21 07:23:00 \n", "2 0 0 2 3 2020-05-21 07:29:00 \n", "\n", " departure_time route_id trip_id stop_id \\\n", "0 2020-05-21 07:18:00 30-57-Y-j19-1 4.TA.30-57-Y-j19-1.1.H 8502208 \n", "1 2020-05-21 07:23:00 30-57-Y-j19-1 4.TA.30-57-Y-j19-1.1.H 8502209 \n", "2 NaT 30-57-Y-j19-1 4.TA.30-57-Y-j19-1.1.H 8503202 \n", "\n", " route_desc stop_id_raw sequence_shift_1 \n", "0 Bus 8502208 2 \n", "1 Bus 8502209 3 \n", "2 Bus 8503202 1 " ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "###################### MAKE CUMULATIVE PROBABILITY TABLE #######################\n", "\n", "# Load stop_time table, to use its order as a template for our final table \n", "with open(\"../data/stop_times_df.pkl\", \"rb\") as input_file:\n", " stoptimes = pickle.load(input_file)\n", " \n", "stoptimes.head(3)" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.0%, 4.07%, 8.14%, 12.21%, 16.28%, 20.35%, 24.42%, 28.49%, 32.55%, 36.62%, 40.69%, 44.76%, 48.83%, 52.9%, 56.97%, 61.04%, 65.11%, 69.18%, 73.25%, 77.32%, 81.39%, 85.46%, 89.53%, 93.6%, 97.66%, " ] } ], "source": [ "summary_df = pd.DataFrame(columns = ['key', 'key_int', 'trip_id', 'stop_id', 'transport_type', 'hour', 'distribution'])\n", "n_fail = 0\n", "size_stop_times = stoptimes.shape[0]\n", "n_real = 0\n", "n_all = 0\n", "n_recov1 = 0\n", "n_recov2 = 0\n", "n_recov3 = 0\n", "all_distrib = []\n", "all_transport_type = []\n", "all_hours = []\n", "all_keys = []\n", "\n", "i = 0\n", "for index, row in stoptimes.iterrows():\n", " \n", " trip_id = row[7]\n", " stop_id = str(row[8])[:7]\n", " transport_type = row[9]\n", " key = trip_id + '__' + stop_id\n", "\n", " # Compute rounded hour using arrival if possible - recover with departure\n", " hour = 
pd.to_datetime(stoptimes.loc[index]['arrival_time']).hour\n", " if math.isnan(hour): # if arrival is NaT, use departure time\n", " hour = pd.to_datetime(stoptimes.loc[index]['departure_time']).hour\n", " \n", " distrib = np.zeros(31)\n", " keep_trying = True\n", " \n", " # 1) try d_real to get distribution from measured delays\n", " if key in d_real:\n", " distrib = d_real[key]\n", " sum_distrib = np.sum(distrib)\n", " if sum_distrib > 100 :\n", " #summary_df.loc[index, 'distribution'] = distrib\n", " all_distrib.append(distrib)\n", " keep_trying = False \n", " n_real += 1\n", " \n", " # 2) try d_all to get distribution from measured + estimated delays\n", " if keep_trying and key in d_all:\n", " distrib = d_all[key]\n", " sum_distrib = np.sum(distrib)\n", " if sum_distrib > 100 :\n", " #summary_df.loc[index, 'distribution'] = distrib\n", " all_distrib.append(distrib)\n", " keep_trying = False\n", " n_all += 1\n", " \n", " # 3) try first recovery table using stop_id, transport_type and hour\n", " if keep_trying and (stop_id, hour, transport_type) in recovery_df.index:\n", " distrib = np.array(recovery_df.loc[(stop_id, hour, transport_type)])\n", " sum_distrib = np.sum(distrib)\n", " if sum_distrib > 100 :\n", " #summary_df.loc[index, 'distribution'] = distrib\n", " all_distrib.append(distrib)\n", " keep_trying = False \n", " n_recov1 += 1\n", " \n", " # 4) use second recovery table using transport_type and hour \n", " if keep_trying and (hour, transport_type) in recovery_df2.index:\n", " distrib = np.array(recovery_df2.loc[(hour, transport_type)])\n", " sum_distrib = np.sum(distrib)\n", " if sum_distrib > 100 :\n", " #summary_df.loc[index, 'distribution'] = distrib\n", " all_distrib.append(distrib)\n", " keep_trying = False \n", " n_recov2 += 1\n", " \n", " # 5) use third recovery table using transport_type only \n", " if keep_trying and (transport_type) in recovery_df3.index:\n", " distrib = np.array(recovery_df3.loc[(transport_type)])\n", " sum_distrib = 
np.sum(distrib)\n", " #summary_df.loc[index, 'distribution'] = distrib\n", " all_distrib.append(distrib)\n", " keep_trying = False \n", " n_recov3 += 1\n", " \n", " # Record results in summary\n", - " #summary_df.loc[index, 'key'] = key\n", - " #summary_df.loc[index, 'key_int'] = index\n", - " #summary_df.loc[index, 'trip_id'] = trip_id\n", - " #summary_df.loc[index, 'stop_id'] = stop_id\n", - " #summary_df.loc[index, 'transport_type'] = transport_type\n", - " #summary_df.loc[index, 'hour'] = hour\n", " all_keys.append(key)\n", " all_transport_type.append(transport_type)\n", " all_hours.append(hour)\n", "\n", " # save number of failure for recovery\n", " if keep_trying:\n", " print('fail{}'.format(index), end = ', ')\n", " n_fail += 1 \n", " \n", " # print progression \n", " if (index % 10000) == 0 :\n", " print('{}%'.format(round(100*index/size_stop_times,2)), end = ', ')" ] }, { "cell_type": "code", - "execution_count": 68, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "10422\n", - "173225\n", - "37031\n", - "20853\n", - "4207\n" - ] - } - ], - "source": [ - "print(n_real)\n", - "print(n_all)\n", - "print(n_recov1)\n", - "print(n_recov2)\n", - "print(n_recov3)" - ] - }, - { - "cell_type": "code", - "execution_count": 82, + "execution_count": 90, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ - "plt.barh(np.array(['real','prognose','recov_tab1','recov_tab2','recov_tab3'])[::-1],\\\n", + "plt.barh(np.array(['real / geschaetz','prognose','recov_tab1 (stop_id x hour x type)','recov_tab2 (hour x type)','recov_tab3 (type)'])[::-1],\\\n", " np.array([n_real,n_all,n_recov1,n_recov2, n_recov3])[::-1])\n", "plt.title('where the distribution data come from')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
keyhourtransport_typedistribution
04.TA.30-57-Y-j19-1.1.H__85022087Bus[0, 5, 7, 9, 9, 10, 10, 10, 11, 11, 11, 11, 11...
14.TA.30-57-Y-j19-1.1.H__85022097Bus[0, 5, 7, 9, 9, 10, 10, 10, 11, 11, 11, 11, 11...
24.TA.30-57-Y-j19-1.1.H__85032027Bus[0, 5, 7, 9, 9, 10, 10, 10, 11, 11, 11, 11, 11...
35.TA.30-57-Y-j19-1.1.H__85022087Bus[0, 5, 7, 9, 9, 10, 10, 10, 11, 11, 11, 11, 11...
45.TA.30-57-Y-j19-1.1.H__85022097Bus[0, 5, 7, 9, 9, 10, 10, 10, 11, 11, 11, 11, 11...
\n", "
" ], "text/plain": [ " key hour transport_type \\\n", "0 4.TA.30-57-Y-j19-1.1.H__8502208 7 Bus \n", "1 4.TA.30-57-Y-j19-1.1.H__8502209 7 Bus \n", "2 4.TA.30-57-Y-j19-1.1.H__8503202 7 Bus \n", "3 5.TA.30-57-Y-j19-1.1.H__8502208 7 Bus \n", "4 5.TA.30-57-Y-j19-1.1.H__8502209 7 Bus \n", "\n", " distribution \n", "0 [0, 5, 7, 9, 9, 10, 10, 10, 11, 11, 11, 11, 11... \n", "1 [0, 5, 7, 9, 9, 10, 10, 10, 11, 11, 11, 11, 11... \n", "2 [0, 5, 7, 9, 9, 10, 10, 10, 11, 11, 11, 11, 11... \n", "3 [0, 5, 7, 9, 9, 10, 10, 10, 11, 11, 11, 11, 11... \n", "4 [0, 5, 7, 9, 9, 10, 10, 10, 11, 11, 11, 11, 11... " ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "summary_df = pd.DataFrame([all_keys, all_hours, all_transport_type, all_distrib],\\\n", " index = ['key','hour','transport_type','distribution']).transpose()\n", "summary_df.head()" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [], "source": [ "# Load stop_time table, to use its order as a template for our final table \n", "with gzip.open(\"../data/join_distribution_all.pkl.gz\", \"wb\") as out_file:\n", " pickle.dump(summary_df, out_file)" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [], "source": [ "list_all_rows = []\n", "for index, row in summary_df.iterrows():\n", " distrib = np.array(row['distribution'])\n", " \n", " # get total number of elements \n", " N = np.sum(distrib)\n", " \n", " # make cumulative distribution probabilities\n", " cdf_distrib = np.empty((len(distrib)), dtype=float)\n", " save_x = 0\n", " for x in range(len(distrib)):\n", " cdf_distrib[x] = float(distrib[x])/float(N) + float(save_x)/float(N)\n", " save_x += distrib[x]\n", " \n", " list_all_rows.append(cdf_distrib)" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0. 
, 0.01501502, 0.03603604, 0.06306306, 0.09009009,\n", " 0.12012012, 0.15015015, 0.18018018, 0.21321321, 0.24624625,\n", " 0.27927928, 0.31231231, 0.34534535, 0.37837838, 0.41141141,\n", " 0.44444444, 0.47747748, 0.51051051, 0.54354354, 0.57657658,\n", " 0.60960961, 0.64264264, 0.67567568, 0.71171171, 0.74774775,\n", " 0.78378378, 0.81981982, 0.85585586, 0.89189189, 0.92792793,\n", " 0.96396396, 1. ],\n", " [0. , 0.01501502, 0.03603604, 0.06306306, 0.09009009,\n", " 0.12012012, 0.15015015, 0.18018018, 0.21321321, 0.24624625,\n", " 0.27927928, 0.31231231, 0.34534535, 0.37837838, 0.41141141,\n", " 0.44444444, 0.47747748, 0.51051051, 0.54354354, 0.57657658,\n", " 0.60960961, 0.64264264, 0.67567568, 0.71171171, 0.74774775,\n", " 0.78378378, 0.81981982, 0.85585586, 0.89189189, 0.92792793,\n", " 0.96396396, 1. ],\n", " [0. , 0.01501502, 0.03603604, 0.06306306, 0.09009009,\n", " 0.12012012, 0.15015015, 0.18018018, 0.21321321, 0.24624625,\n", " 0.27927928, 0.31231231, 0.34534535, 0.37837838, 0.41141141,\n", " 0.44444444, 0.47747748, 0.51051051, 0.54354354, 0.57657658,\n", " 0.60960961, 0.64264264, 0.67567568, 0.71171171, 0.74774775,\n", " 0.78378378, 0.81981982, 0.85585586, 0.89189189, 0.92792793,\n", " 0.96396396, 1. ],\n", " [0. , 0.01501502, 0.03603604, 0.06306306, 0.09009009,\n", " 0.12012012, 0.15015015, 0.18018018, 0.21321321, 0.24624625,\n", " 0.27927928, 0.31231231, 0.34534535, 0.37837838, 0.41141141,\n", " 0.44444444, 0.47747748, 0.51051051, 0.54354354, 0.57657658,\n", " 0.60960961, 0.64264264, 0.67567568, 0.71171171, 0.74774775,\n", " 0.78378378, 0.81981982, 0.85585586, 0.89189189, 0.92792793,\n", " 0.96396396, 1. ],\n", " [0. 
, 0.01501502, 0.03603604, 0.06306306, 0.09009009,\n", " 0.12012012, 0.15015015, 0.18018018, 0.21321321, 0.24624625,\n", " 0.27927928, 0.31231231, 0.34534535, 0.37837838, 0.41141141,\n", " 0.44444444, 0.47747748, 0.51051051, 0.54354354, 0.57657658,\n", " 0.60960961, 0.64264264, 0.67567568, 0.71171171, 0.74774775,\n", " 0.78378378, 0.81981982, 0.85585586, 0.89189189, 0.92792793,\n", " 0.96396396, 1. ]])" ] }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "final_df = pd.DataFrame(list_all_rows)\n", "final_df.index = summary_df.index\n", "final_np = final_df.to_numpy()\n", "final_np[0:5,:]" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 87, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sum(np.array(final_df.index == stoptimes.index)) == stoptimes.shape[0]" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [], "source": [ "# write recovery table \n", "with gzip.open(\"../data/join_distribution_cumulative_p_2.pkl.gz\", \"wb\") as output_file:\n", " pickle.dump(final_np, output_file)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Poisson cumulative distribution\n", "\n", "The Poisson distribution is popular for modeling the number of times an event occurs in an interval of time or space. We modeled a poisson distribution for delays assuming parameter $k$ is the time in minutes (as it was done [here](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0126137), formulas $(4),(5),(6)$).\n", "\n", "A discrete random variable X is said to have a Poisson distribution with parameter λ > 0, if, for k = 0, 1, 2, ..., the probability mass function of X is given by:\n", "\n", "$${\\displaystyle \\!f(k;\\lambda )=\\Pr(X=k)={\\frac {\\lambda ^{k}e^{-\\lambda }}{k!}},}$$\n", "where\n", "\n", "e is Euler's number (e = 2.71828...)\n", "k! 
is the factorial of k.\n", "The positive real number λ is equal to the expected value of X __and__ to its variance.\n", "\n", "$${\\displaystyle \\lambda =\\operatorname {E} (X)=\\operatorname {Var} (X)}$$\n", "\n", "We can approximate E[𝑋]∼$\\mu_i$ for our data $X_i$, if we assume the sample $X_i$ of size N follows the distribution of $X$, meaning $X_i$∼$X$.\n", "\n", "Poisson-related __assumptions__ :\n", "- $k$ is the __delay time in minutes__ and can take values 0, 1, 2, ... (non-negative and discrete)\n", "- We assume our sampling $X_i$ of $X$ is good enough to approximate E[X] ~ $\\mu_i$\n", "- The occurrence of one event does not affect the probability of others. That is, events occur independently.\n", " - __We assume being late one day does not affect the delay of the day after__ \n", "- The average rate at which events occur is independent of any occurrences. For simplicity, this is usually assumed to be constant, but may in practice vary with time.\n", " - __We assume delays occur at a constant rate over time__\n", "- Two events cannot occur at exactly the same instant\n", "\n", "We made a function `poisson_proba` that takes a `trip_id`, a `stop_id`, an `arrival time` and a `departure time` and a dictionary {key : distribution} to compute the __probability of arriving at least 2 minutes before the departure of the next trip__. \n", "\n", "We make a few __assumptions__ on our side :\n", "- We assume that if we have less than 2 minutes for the transfer, we miss it.\n", "- We assume the next train is on time.\n", "- As $k$ must be non-negative for a Poisson distribution, we assume trains ahead of schedule were on time ($k=0$)\n", "\n", "\n", "_Question we should address :_\n", "- _Is the Poisson distribution a reasonable approximation of the binomial distribution in our case ?_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's first test the Poisson distribution and compare it with our distribution to see how well it fits the data. 
We will compute $Pr(X = k)$ for each value of k and look at the shape of the Poisson distribution compared to the shape of our scaled data. Then, we will compare $\\sum_{k=0}^T Pr(X = k)$ with the cumulative distribution function, which directly gives $Pr(X \\leq T)$" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "An error was encountered:\n", "Invalid status code '404' from http://iccluster044.iccluster.epfl.ch:8998/sessions/6821 with error payload: \"Session '6821' not found.\"\n" ] } ], "source": [ "################################# POISSON FIT TEST #########################################\n", "\n", "# to do .. \n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here are all the functions needed to calculate the probability of success for a given transfer. We need the `trip_id`, `stop_id`, `departure_time`, `arrival_time` and the dictionary `d` (loaded from a pickle at the beginning of the cell) to be able to compute a probability of success with the following function : \n", "\n", "`poisson_proba(trip_id, stop_id, arrival_time, departure_time, d)`" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "lambda (expectation given distribution): 1.0194769059543685 \n", "\n", "Probability of success for transfer time = 13.0 minutes : 0.999999999994185\n" ] } ], "source": [ "%local\n", "################################# POISSON FUNCTIONS ########################################\n", "\n", "import pickle \n", "import gzip\n", "import time\n", "import math \n", "import datetime\n", "import time\n", "from scipy.stats import poisson\n", "\n", "# Load dictionnary\n", "with gzip.open(\"../data/distributions.pickle\", \"rb\") as input_file:\n", " d = pickle.load(input_file)\n", "\n", "# Load dictionnary\n", "with open(\"../data/stop_times_array.pkl\", \"rb\") as input_file:\n", " times = 
pickle.load(input_file)\n", "\n", "# we take two exemple time in format numpy.datetime64\n", "arr_time = times[4][1]\n", "dep_time = times[0][1]\n", "\n", "# Load distribution in dictinonary given a key\n", "def get_distrib(key, dico):\n", " if key in dico:\n", " return dico[key]\n", " else:\n", " raise ValueError(\"KEY ERROR: {} not found un distribution dictionnary\".format(key))\n", " \n", "# Evaluate lambda parameter assuming it is equal to average \n", "def evaluate_lamda(distrib):\n", " # First calculate total number of measures N\n", " N = 0 # by starting at -1 we ignore trains ahead of schedule\n", " for x in distrib:\n", " N += x\n", "\n", " lambda_p = 0 # expectation - we want to calculate it\n", " t = -1 # time = index - 1\n", "\n", " for x in distrib:\n", " if t>0:\n", " lambda_p += t*x\n", " t += 1\n", "\n", " # calculate lambda - the expectation of x\n", " if N > 0:\n", " lambda_p /= N \n", " print('lambda (expectation given distribution): ',lambda_p, '\\n')\n", " return lambda_p\n", " else : \n", " raise ValueError(\"ERROR : {} distribution has 0 counts\".format(key))\n", " #print('Returning 1 to avoid later problem... 
\\n')\n", " return 1\n", "\n", "# process time given as string in format 'hh:mm' - not needed\n", "def process_time_str(str_time):\n", " x = time.strptime(str_time,'%H:%M')\n", " return datetime.timedelta(hours=x.tm_hour,minutes=x.tm_min,seconds=x.tm_sec).total_seconds()\n", "\n", "# Calculate transfer time given two times in string format 'hh:mm'\n", "def get_transfer_time(arr_time, dep_time, delta=2.0):\n", " diff_time_min = (arr_time - dep_time).astype('timedelta64[m]') / np.timedelta64(1, 'm')\n", " return diff_time_min - delta\n", "\n", "# Calculate poisson probability of success for a given transfert \n", "# for a given trip_id, stop_id, arrival/departure times and dict\n", "def poisson_proba(trip_id, stop_id, arr_time, dep_time, dico):\n", " # Generate key from trip_id / stop_id \n", " key = str(trip_id) + '__' + str(stop_id[0:7]) # 7 first char to be sbb-compatible\n", "\n", " # Get distribution from dictionnary\n", " distrib = get_distrib(key, dico)\n", " \n", " # Calculate transfer time at disposal \n", " T = get_transfer_time(arr_time, dep_time)\n", " \n", " # Get lambda value to calculate proba\n", " lambda_p = evaluate_lamda(distrib)\n", "\n", " # Get proba\n", " if T > 2:\n", " poisson_p = poisson.cdf(T, lambda_p)\n", " else : \n", " poisson_p = 0.0 # if we have less than 2 minutes, we miss it\n", " \n", " print('Probability of success for transfer time = {} minutes : '.format(T),poisson_p)\n", " return poisson_p\n", "\n", "# Mock exemple of probability calculations with given inputs\n", "trip_id = '1286.TA.26-32-j19-1.12.H'\n", "stop_id = '8591184'\n", "\n", "# we take two exemple time from stop_times_array in format numpy.datetime64\n", "arr_time = times[3][1]\n", "dep_time = times[0][1]\n", "\n", "Pr = poisson_proba(trip_id, stop_id, arr_time, dep_time, d)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": 
"python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 } diff --git a/notebooks/transfer_to_local.ipynb b/notebooks/transfer_to_local.ipynb index c9649d5..8196c42 100644 --- a/notebooks/transfer_to_local.ipynb +++ b/notebooks/transfer_to_local.ipynb @@ -1,261 +1,261 @@ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## transfer files from HDFS to local\n", "\n", "
Any application without a proper name would be promptly killed.
" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "Current session configs: {'conf': {'spark.app.name': 'lgptguys_final'}, 'kind': 'pyspark'}
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", - "
IDYARN Application IDKindStateSpark UIDriver logCurrent session?
7704application_1589299642358_2200pysparkidleLinkLink
7735application_1589299642358_2231pysparkidleLinkLink
7737application_1589299642358_2233pysparkidleLinkLink
7739application_1589299642358_2235pysparkdeadLinkLink
7743application_1589299642358_2239pysparkidleLinkLink
7745application_1589299642358_2241pysparkidleLinkLink
7750application_1589299642358_2246pysparkbusyLinkLink
7753application_1589299642358_2249pysparkidleLinkLink
7756application_1589299642358_2252pysparkidleLinkLink
7759application_1589299642358_2255pysparkbusyLinkLink
7760application_1589299642358_2256pysparkidleLinkLink
7761application_1589299642358_2257pysparkidleLinkLink
7762application_1589299642358_2258pysparkidleLinkLink
7764application_1589299642358_2260pysparkidleLinkLink
7767application_1589299642358_2263pysparkidleLinkLink
7768application_1589299642358_2264pysparkidleLinkLink
7770application_1589299642358_2266pysparkidleLinkLink
7772application_1589299642358_2268pysparkidleLinkLink
7773application_1589299642358_2269pysparkidleLinkLink
7774application_1589299642358_2270pysparkidleLinkLink
7775application_1589299642358_2272pysparkidleLinkLink
7776application_1589299642358_2273pysparkidleLinkLink
7777application_1589299642358_2274pysparkidleLinkLink
7778application_1589299642358_2275pysparkidleLinkLink
7779application_1589299642358_2276pysparkbusyLinkLink
7780application_1589299642358_2277pysparkidleLinkLink
7781application_1589299642358_2278pysparkbusyLinkLink
" + "IDYARN Application IDKindStateSpark UIDriver logCurrent session?7932application_1589299642358_2450pysparkidleLinkLink7933application_1589299642358_2451pysparkidleLinkLink7935application_1589299642358_2453pysparkidleLinkLink7939application_1589299642358_2457pysparkidleLinkLink7940application_1589299642358_2458pysparkidleLinkLink7941application_1589299642358_2459pysparkidleLinkLink7942application_1589299642358_2460pysparkidleLinkLink7944application_1589299642358_2462pysparkidleLinkLink7945application_1589299642358_2463pysparkdeadLinkLink7946application_1589299642358_2464pysparkidleLinkLink7947application_1589299642358_2465pysparkidleLinkLink7948application_1589299642358_2466pysparkidleLinkLink7949application_1589299642358_2467pysparkidleLinkLink" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%%configure\n", "{\"conf\": {\n", " \"spark.app.name\": \"lgptguys_final\"\n", "}}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Start Spark" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Starting Spark application\n" ] }, { "data": { "text/html": [ "\n", - "
IDYARN Application IDKindStateSpark UIDriver logCurrent session?
7782application_1589299642358_2279pysparkidleLinkLink
" + "IDYARN Application IDKindStateSpark UIDriver logCurrent session?7950application_1589299642358_2468pysparkidleLinkLink✔" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "SparkSession available as 'spark'.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "An error was encountered:\n", "unknown magic command '%spark'\n", "UnknownMagic: unknown magic command '%spark'\n", "\n" ] } ], "source": [ "# Initialization\n", "%%spark" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Transfert and save tables from hdfs to local \n", "\n", "Here we describe the process of loading a table on hdfs, saving it to a proper place so that we can load it in local and then save it.\n", "\n", "First, we load the data that is in an otherwise not accessible place in hdfs :" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "username = 'tturner'\n", "\n", "my_files = ['stop_times_curated.csv',\n", " 'stops_15km.csv', 'transfers.csv', 
'stop_times_final_cyril.csv']\n", "\n", "for file in my_files:\n", " this_file = spark.read.csv('data/lgpt_guys/{}'.format(file), \\\n", " header = True) \n", " this_file.write.csv(\"/user/{0}/{1}\".format(username, file.replace('.csv','')), \\\n", " header = True, mode = 'overwrite')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/work/final_project/notebooks\n", "stop_times_curated\n", "stops_15km\n", "transfers\n", "stop_times_final_cyril\n" ] } ], "source": [ "%local\n", "\n", "from hdfs3 import HDFileSystem\n", "import pandas as pd\n", "import numpy as np \n", "import os\n", "print(os.getcwd())\n", "\n", "hdfs = HDFileSystem(host='hdfs://iccluster044.iccluster.epfl.ch', port=8020, user='ebouille')\n", "\n", "username = 'tturner'\n", "\n", "my_folders = ['stop_times_curated', \n", " 'stops_15km', 'transfers', 'stop_times_final_cyril']\n", "\n", "for folder in my_folders:\n", " print(folder)\n", " array_files = hdfs.glob('/user/{0}/{1}/*.csv'.format(username, folder))\n", " array = pd.DataFrame()\n", " for file in array_files:\n", " with hdfs.open(file) as f:\n", " array = array.append(pd.read_csv(f))\n", "\n", " array.to_csv('../data/{}.csv'.format(folder), header=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "PySpark", "language": "", "name": "pysparkkernel" }, "language_info": { "codemirror_mode": { "name": "python", "version": 3 }, "mimetype": "text/x-python", "name": "pyspark", "pygments_lexer": "python3" } }, "nbformat": 4, "nbformat_minor": 4 }