diff --git a/.gitattributes b/.gitattributes index 101dcef..699bf1a 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,37 +1,38 @@ data/distributions.pickle filter=lfs diff=lfs merge=lfs -text data/stop_times_array_version2.csv filter=lfs diff=lfs merge=lfs -text data/transfer_array_version2.csv filter=lfs diff=lfs merge=lfs -text data/routes_array_version2.csv filter=lfs diff=lfs merge=lfs -text data/route_stops_array_version2.csv filter=lfs diff=lfs merge=lfs -text data/stop_routes_array_version3.csv filter=lfs diff=lfs merge=lfs -text data/stops_array_version2.csv filter=lfs diff=lfs merge=lfs -text object.data filter=lfs diff=lfs merge=lfs -text *.pkl filter=lfs diff=lfs merge=lfs -text data/*.pkl filter=lfs diff=lfs merge=lfs -text data/dere.pkl filter=lfs diff=lfs merge=lfs -text data/transfer_array.pkl filter=lfs diff=lfs merge=lfs -text data/stops_array.pkl filter=lfs diff=lfs merge=lfs -text data/stop_times_array.pkl filter=lfs diff=lfs merge=lfs -text data/stop_routes_array.pkl filter=lfs diff=lfs merge=lfs -text data/routes_array.pkl filter=lfs diff=lfs merge=lfs -text data/route_stops_array.pkl filter=lfs diff=lfs merge=lfs -text data/route_stops_df.pkl filter=lfs diff=lfs merge=lfs -text data/routes_array_df.pkl filter=lfs diff=lfs merge=lfs -text data/stop_routes_df.pkl filter=lfs diff=lfs merge=lfs -text data/stop_times_df.pkl filter=lfs diff=lfs merge=lfs -text data/stops_df.pkl filter=lfs diff=lfs merge=lfs -text data/transfer_df.pkl filter=lfs diff=lfs merge=lfs -text data/distrib_recov_tab_stopID_hour.pkl.gz filter=lfs diff=lfs merge=lfs -text data/join_distribution_all.pkl.gz filter=lfs diff=lfs merge=lfs -text data/join_distribution_cumulative_p.pkl.gz filter=lfs diff=lfs merge=lfs -text data/join_distribution_cumulative_p_2.pkl.gz filter=lfs diff=lfs merge=lfs -text data/route_stops_array_cyril.pkl filter=lfs diff=lfs merge=lfs -text data/route_stops_df_cyril.pkl filter=lfs diff=lfs merge=lfs -text data/routes_array_cyril.pkl filter=lfs diff=lfs merge=lfs -text data/routes_array_df_cyril.pkl filter=lfs diff=lfs merge=lfs -text data/stop_routes_array_cyril.pkl filter=lfs diff=lfs merge=lfs -text data/stop_routes_df_cyril.pkl filter=lfs diff=lfs merge=lfs -text data/stop_times_array_cyril.pkl filter=lfs diff=lfs merge=lfs -text data/transfer_array_cyril.pkl filter=lfs diff=lfs merge=lfs -text data/transfer_df_cyril.pkl filter=lfs diff=lfs merge=lfs -text data/stops_array_cyril.pkl filter=lfs diff=lfs merge=lfs -text +data/stop_times_df_cyril.pkl filter=lfs diff=lfs merge=lfs -text diff --git a/data/route_stops_df_cyril.pkl b/data/route_stops_df_cyril.pkl index d22734d..70461ea 100644 --- a/data/route_stops_df_cyril.pkl +++ b/data/route_stops_df_cyril.pkl @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c62d0046e831eee39e5ee0605aa0eac60d6bad85c0c5d386d91ee81380216265 +oid sha256:3f86289015b10f212f8e2415b29cf5bd584a575fdd860d701cef87a46edb0bb8 size 369113 diff --git a/data/stop_routes_df_cyril.pkl b/data/stop_routes_df_cyril.pkl index 960efe5..484e287 100644 --- a/data/stop_routes_df_cyril.pkl +++ b/data/stop_routes_df_cyril.pkl @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b563d73903bf423b3c49fa4987691f28ee05261baad5595f19c46b56cf2a382f +oid sha256:53fe1e7c5b985f8d14528085832799ded6cf202c28872d6ff1e63953c2c20716 size 538099 diff --git a/data/stop_times_array_cyril.pkl b/data/stop_times_array_cyril.pkl index 8e5c056..524a974 100644 --- a/data/stop_times_array_cyril.pkl +++ b/data/stop_times_array_cyril.pkl @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3c38ce272796cd8dcc9f560ba57840e43ce167893d43f4840087be1652bd25a4 +oid sha256:279ef5c4d19dc2f4d4194553ac95b57364efa7fdf19636831fcdc39d2b2a127d size 4167529 diff --git a/data/stop_times_df_cyril.pkl b/data/stop_times_df_cyril.pkl new file mode 100644 index 0000000..daea263 --- /dev/null +++ b/data/stop_times_df_cyril.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b44372accdcd34651a05e4ce97ff137a448fc5f8d87fb5ea97ffe1fd13a1ad0e +size 36173268 diff --git a/notebooks/Arrays_Cyrill_data.ipynb b/notebooks/Arrays_Cyrill_data.ipynb index c7cfa64..9cef39b 100644 --- a/notebooks/Arrays_Cyrill_data.ipynb +++ b/notebooks/Arrays_Cyrill_data.ipynb @@ -1,5236 +1,5236 @@ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Preprocessing part 2: preparing the arrays\n", "In this notebook we take 2 datasets prepared in spark: stop_times and transfers, and prepare them into the array format needed to run RAPTOR" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Outline\n", "In this notebook the following actions are performed:\n", "- create array" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Import packages" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import pickle\n", "import itertools" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read files\n", "Before running make sure the .csv files are in /data . If not run notebook \"transfer_to_local\"" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0route_idstop_id_generaltrip_idstop_idarrival_timedeparture_timestop_sequencestop_namestop_latstop_lontrip_headsigntrip_short_namedirection_iddeparture_first_stoproute_intstop_countstop_introute_desc
0026-66-j19-1859120517.TA.26-66-j19-1.1.H859120517:00:0017:00:003Zürich, Hürlimannplatz47.3650668.526539Zürich, Neubühl3870016:55:001225121317Bus
1126-66-j19-1859141517.TA.26-66-j19-1.1.H859141517:02:0017:02:004Zürich, Waffenplatzstrasse47.3614828.525749Zürich, Neubühl3870016:55:001225121267Bus
2226-66-j19-1859120417.TA.26-66-j19-1.1.H859120417:03:0017:03:005Zürich, Hügelstrasse47.3585438.526997Zürich, Neubühl3870016:55:0012251267Bus
3326-66-j19-1859109817.TA.26-66-j19-1.1.H859109817:04:0017:04:006Zürich, Brunau/Mutschellenstr.47.3551478.527141Zürich, Neubühl3870016:55:00122512512Bus
4426-66-j19-1859139217.TA.26-66-j19-1.1.H859139217:05:0017:05:007Zürich, Thujastrasse47.3501878.527806Zürich, Neubühl3870016:55:00122512403Bus
\n", "
" ], "text/plain": [ " Unnamed: 0 route_id stop_id_general trip_id stop_id \\\n", "0 0 26-66-j19-1 8591205 17.TA.26-66-j19-1.1.H 8591205 \n", "1 1 26-66-j19-1 8591415 17.TA.26-66-j19-1.1.H 8591415 \n", "2 2 26-66-j19-1 8591204 17.TA.26-66-j19-1.1.H 8591204 \n", "3 3 26-66-j19-1 8591098 17.TA.26-66-j19-1.1.H 8591098 \n", "4 4 26-66-j19-1 8591392 17.TA.26-66-j19-1.1.H 8591392 \n", "\n", " arrival_time departure_time stop_sequence stop_name \\\n", "0 17:00:00 17:00:00 3 Zürich, Hürlimannplatz \n", "1 17:02:00 17:02:00 4 Zürich, Waffenplatzstrasse \n", "2 17:03:00 17:03:00 5 Zürich, Hügelstrasse \n", "3 17:04:00 17:04:00 6 Zürich, Brunau/Mutschellenstr. \n", "4 17:05:00 17:05:00 7 Zürich, Thujastrasse \n", "\n", " stop_lat stop_lon trip_headsign trip_short_name direction_id \\\n", "0 47.365066 8.526539 Zürich, Neubühl 3870 0 \n", "1 47.361482 8.525749 Zürich, Neubühl 3870 0 \n", "2 47.358543 8.526997 Zürich, Neubühl 3870 0 \n", "3 47.355147 8.527141 Zürich, Neubühl 3870 0 \n", "4 47.350187 8.527806 Zürich, Neubühl 3870 0 \n", "\n", " departure_first_stop route_int stop_count stop_int route_desc \n", "0 16:55:00 1225 12 1317 Bus \n", "1 16:55:00 1225 12 1267 Bus \n", "2 16:55:00 1225 12 67 Bus \n", "3 16:55:00 1225 12 512 Bus \n", "4 16:55:00 1225 12 403 Bus " ] }, - "execution_count": 2, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#stop_times\n", "stop_times_curated = pd.read_csv(\"../data/stop_times_final_cyril.csv\")\n", "stop_times_curated.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We drop columns not useful to us" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "stop_times_curated = stop_times_curated.drop(columns=[\"Unnamed: 0\"])" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0stop_idstop_id2distanceTransfer_time_secstop_namestop_name2
00850092685906160.122430146Oetwil a.d.L., SchweizäckerGeroldswil, Schweizäcker
11850092685907370.300175360Oetwil a.d.L., SchweizäckerOetwil an der Limmat, Halde
2285021868502186:0:10.0067628Dietikon StoffelbachDietikon Stoffelbach
3385021868502186:0:20.01352416Dietikon StoffelbachDietikon Stoffelbach
4485021868502186P0.0000000Dietikon StoffelbachDietikon Stoffelbach
\n", "
" ], "text/plain": [ " Unnamed: 0 stop_id stop_id2 distance Transfer_time_sec \\\n", "0 0 8500926 8590616 0.122430 146 \n", "1 1 8500926 8590737 0.300175 360 \n", "2 2 8502186 8502186:0:1 0.006762 8 \n", "3 3 8502186 8502186:0:2 0.013524 16 \n", "4 4 8502186 8502186P 0.000000 0 \n", "\n", " stop_name stop_name2 \n", "0 Oetwil a.d.L., Schweizäcker Geroldswil, Schweizäcker \n", "1 Oetwil a.d.L., Schweizäcker Oetwil an der Limmat, Halde \n", "2 Dietikon Stoffelbach Dietikon Stoffelbach \n", "3 Dietikon Stoffelbach Dietikon Stoffelbach \n", "4 Dietikon Stoffelbach Dietikon Stoffelbach " ] }, - "execution_count": 4, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#transfers\n", "transfers = pd.read_csv(\"../data/transfers.csv\")\n", "transfers.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Transfer: delete transfer to same stop & get stop_int & stop_int2\n" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "12564" ] }, - "execution_count": 5, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#check number stops transfers\n", "transfers.stop_id.count()" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0stop_idstop_id2distanceTransfer_time_secstop_namestop_name2
00850092685906160.122430146Oetwil a.d.L., SchweizäckerGeroldswil, Schweizäcker
11850092685907370.300175360Oetwil a.d.L., SchweizäckerOetwil an der Limmat, Halde
2285021868502186:0:10.0067628Dietikon StoffelbachDietikon Stoffelbach
3385021868502186:0:20.01352416Dietikon StoffelbachDietikon Stoffelbach
4485021868502186P0.0000000Dietikon StoffelbachDietikon Stoffelbach
\n", "
" ], "text/plain": [ " Unnamed: 0 stop_id stop_id2 distance Transfer_time_sec \\\n", "0 0 8500926 8590616 0.122430 146 \n", "1 1 8500926 8590737 0.300175 360 \n", "2 2 8502186 8502186:0:1 0.006762 8 \n", "3 3 8502186 8502186:0:2 0.013524 16 \n", "4 4 8502186 8502186P 0.000000 0 \n", "\n", " stop_name stop_name2 \n", "0 Oetwil a.d.L., Schweizäcker Geroldswil, Schweizäcker \n", "1 Oetwil a.d.L., Schweizäcker Oetwil an der Limmat, Halde \n", "2 Dietikon Stoffelbach Dietikon Stoffelbach \n", "3 Dietikon Stoffelbach Dietikon Stoffelbach \n", "4 Dietikon Stoffelbach Dietikon Stoffelbach " ] }, - "execution_count": 6, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transfers.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We delete transfers to the same stop" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "transfers_df = transfers[transfers['stop_id'] != transfers['stop_id2']]" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "12564" ] }, - "execution_count": 8, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transfers_df.stop_id.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We create the stop_int column in transfers. This action eliminates stops not in stop_times" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
route_idstop_id_generaltrip_idstop_idarrival_timedeparture_timestop_sequencestop_namestop_latstop_lontrip_headsigntrip_short_namedirection_iddeparture_first_stoproute_intstop_countstop_introute_desc
026-66-j19-1859120517.TA.26-66-j19-1.1.H859120517:00:0017:00:003Zürich, Hürlimannplatz47.3650668.526539Zürich, Neubühl3870016:55:001225121317Bus
126-66-j19-1859141517.TA.26-66-j19-1.1.H859141517:02:0017:02:004Zürich, Waffenplatzstrasse47.3614828.525749Zürich, Neubühl3870016:55:001225121267Bus
226-66-j19-1859120417.TA.26-66-j19-1.1.H859120417:03:0017:03:005Zürich, Hügelstrasse47.3585438.526997Zürich, Neubühl3870016:55:0012251267Bus
326-66-j19-1859109817.TA.26-66-j19-1.1.H859109817:04:0017:04:006Zürich, Brunau/Mutschellenstr.47.3551478.527141Zürich, Neubühl3870016:55:00122512512Bus
426-66-j19-1859139217.TA.26-66-j19-1.1.H859139217:05:0017:05:007Zürich, Thujastrasse47.3501878.527806Zürich, Neubühl3870016:55:00122512403Bus
\n", "
" ], "text/plain": [ " route_id stop_id_general trip_id stop_id arrival_time \\\n", "0 26-66-j19-1 8591205 17.TA.26-66-j19-1.1.H 8591205 17:00:00 \n", "1 26-66-j19-1 8591415 17.TA.26-66-j19-1.1.H 8591415 17:02:00 \n", "2 26-66-j19-1 8591204 17.TA.26-66-j19-1.1.H 8591204 17:03:00 \n", "3 26-66-j19-1 8591098 17.TA.26-66-j19-1.1.H 8591098 17:04:00 \n", "4 26-66-j19-1 8591392 17.TA.26-66-j19-1.1.H 8591392 17:05:00 \n", "\n", " departure_time stop_sequence stop_name stop_lat \\\n", "0 17:00:00 3 Zürich, Hürlimannplatz 47.365066 \n", "1 17:02:00 4 Zürich, Waffenplatzstrasse 47.361482 \n", "2 17:03:00 5 Zürich, Hügelstrasse 47.358543 \n", "3 17:04:00 6 Zürich, Brunau/Mutschellenstr. 47.355147 \n", "4 17:05:00 7 Zürich, Thujastrasse 47.350187 \n", "\n", " stop_lon trip_headsign trip_short_name direction_id \\\n", "0 8.526539 Zürich, Neubühl 3870 0 \n", "1 8.525749 Zürich, Neubühl 3870 0 \n", "2 8.526997 Zürich, Neubühl 3870 0 \n", "3 8.527141 Zürich, Neubühl 3870 0 \n", "4 8.527806 Zürich, Neubühl 3870 0 \n", "\n", " departure_first_stop route_int stop_count stop_int route_desc \n", "0 16:55:00 1225 12 1317 Bus \n", "1 16:55:00 1225 12 1267 Bus \n", "2 16:55:00 1225 12 67 Bus \n", "3 16:55:00 1225 12 512 Bus \n", "4 16:55:00 1225 12 403 Bus " ] }, - "execution_count": 9, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_int = stop_times_curated\n", "stop_times_int.head(5)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "transfers_df = transfers_df.merge(stop_times_int[[\"stop_id\", \"stop_int\"]].set_index(\"stop_id\"), how=\"inner\", on = \"stop_id\").drop_duplicates()" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "10707" ] }, - "execution_count": 11, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transfers_df.stop_id.count()" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0stop_idstop_id2distanceTransfer_time_secstop_namestop_name2stop_int
00850092685906160.122430146Oetwil a.d.L., SchweizäckerGeroldswil, Schweizäcker1392
381850092685907370.300175360Oetwil a.d.L., SchweizäckerOetwil an der Limmat, Halde1392
7698502186:0:185021860.0067628Dietikon StoffelbachDietikon Stoffelbach1394
128108502186:0:18502186:0:20.0067628Dietikon StoffelbachDietikon Stoffelbach1394
180118502186:0:18502186P0.0067628Dietikon StoffelbachDietikon Stoffelbach1394
\n", "
" ], "text/plain": [ " Unnamed: 0 stop_id stop_id2 distance Transfer_time_sec \\\n", "0 0 8500926 8590616 0.122430 146 \n", "38 1 8500926 8590737 0.300175 360 \n", "76 9 8502186:0:1 8502186 0.006762 8 \n", "128 10 8502186:0:1 8502186:0:2 0.006762 8 \n", "180 11 8502186:0:1 8502186P 0.006762 8 \n", "\n", " stop_name stop_name2 stop_int \n", "0 Oetwil a.d.L., Schweizäcker Geroldswil, Schweizäcker 1392 \n", "38 Oetwil a.d.L., Schweizäcker Oetwil an der Limmat, Halde 1392 \n", "76 Dietikon Stoffelbach Dietikon Stoffelbach 1394 \n", "128 Dietikon Stoffelbach Dietikon Stoffelbach 1394 \n", "180 Dietikon Stoffelbach Dietikon Stoffelbach 1394 " ] }, - "execution_count": 12, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transfers_df.head(5)" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stop_id2stop_int_2
085912051317
185914151267
2859120467
38591098512
48591392403
\n", "
" ], "text/plain": [ " stop_id2 stop_int_2\n", "0 8591205 1317\n", "1 8591415 1267\n", "2 8591204 67\n", "3 8591098 512\n", "4 8591392 403" ] }, - "execution_count": 13, + "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#create dataframe with stops\n", "df_stop_int2 = stop_times_int[[\"stop_id\", \"stop_int\"]].rename(columns={\"stop_id\": \"stop_id2\", \"stop_int\" : \"stop_int_2\"})\n", "df_stop_int2.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We add the the stop id for the arrival destination, stop_int2" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "transfers_df_int = transfers_df.merge(df_stop_int2.set_index(\"stop_id2\"), how=\"inner\", on = \"stop_id2\").drop_duplicates()" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0stop_idstop_id2distanceTransfer_time_secstop_namestop_name2stop_intstop_int_2
00850092685906160.122430146Oetwil a.d.L., SchweizäckerGeroldswil, Schweizäcker13921310
378193859061885906160.412676495Geroldswil, ZentrumGeroldswil, Schweizäcker5901310
748821859073785906160.422521507Oetwil an der Limmat, HaldeGeroldswil, Schweizäcker9011310
1111850092685907370.300175360Oetwil a.d.L., SchweizäckerOetwil an der Limmat, Halde1392901
1868189859061685907370.422521507Geroldswil, SchweizäckerOetwil an der Limmat, Halde1310901
\n", "
" ], "text/plain": [ " Unnamed: 0 stop_id stop_id2 distance Transfer_time_sec \\\n", "0 0 8500926 8590616 0.122430 146 \n", "37 8193 8590618 8590616 0.412676 495 \n", "74 8821 8590737 8590616 0.422521 507 \n", "111 1 8500926 8590737 0.300175 360 \n", "186 8189 8590616 8590737 0.422521 507 \n", "\n", " stop_name stop_name2 stop_int \\\n", "0 Oetwil a.d.L., Schweizäcker Geroldswil, Schweizäcker 1392 \n", "37 Geroldswil, Zentrum Geroldswil, Schweizäcker 590 \n", "74 Oetwil an der Limmat, Halde Geroldswil, Schweizäcker 901 \n", "111 Oetwil a.d.L., Schweizäcker Oetwil an der Limmat, Halde 1392 \n", "186 Geroldswil, Schweizäcker Oetwil an der Limmat, Halde 1310 \n", "\n", " stop_int_2 \n", "0 1310 \n", "37 1310 \n", "74 1310 \n", "111 901 \n", "186 901 " ] }, - "execution_count": 15, + "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transfers_df_int.head(5)" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "9434" ] }, - "execution_count": 16, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transfers_df_int.stop_id.count()" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "transfers = transfers_df_int" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1503" ] }, - "execution_count": 18, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#check number unique stops2 in transfers\n", "transfers.stop_id2.nunique()" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1503" ] }, - "execution_count": 19, + "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transfers.stop_id.nunique()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
route_idstop_id_generaltrip_idstop_idarrival_timedeparture_timestop_sequencestop_namestop_latstop_lontrip_headsigntrip_short_namedirection_iddeparture_first_stoproute_intstop_countstop_introute_desc
026-66-j19-1859120517.TA.26-66-j19-1.1.H859120517:00:0017:00:003Zürich, Hürlimannplatz47.3650668.526539Zürich, Neubühl3870016:55:001225121317Bus
126-66-j19-1859141517.TA.26-66-j19-1.1.H859141517:02:0017:02:004Zürich, Waffenplatzstrasse47.3614828.525749Zürich, Neubühl3870016:55:001225121267Bus
226-66-j19-1859120417.TA.26-66-j19-1.1.H859120417:03:0017:03:005Zürich, Hügelstrasse47.3585438.526997Zürich, Neubühl3870016:55:0012251267Bus
326-66-j19-1859109817.TA.26-66-j19-1.1.H859109817:04:0017:04:006Zürich, Brunau/Mutschellenstr.47.3551478.527141Zürich, Neubühl3870016:55:00122512512Bus
426-66-j19-1859139217.TA.26-66-j19-1.1.H859139217:05:0017:05:007Zürich, Thujastrasse47.3501878.527806Zürich, Neubühl3870016:55:00122512403Bus
\n", "
" ], "text/plain": [ " route_id stop_id_general trip_id stop_id arrival_time \\\n", "0 26-66-j19-1 8591205 17.TA.26-66-j19-1.1.H 8591205 17:00:00 \n", "1 26-66-j19-1 8591415 17.TA.26-66-j19-1.1.H 8591415 17:02:00 \n", "2 26-66-j19-1 8591204 17.TA.26-66-j19-1.1.H 8591204 17:03:00 \n", "3 26-66-j19-1 8591098 17.TA.26-66-j19-1.1.H 8591098 17:04:00 \n", "4 26-66-j19-1 8591392 17.TA.26-66-j19-1.1.H 8591392 17:05:00 \n", "\n", " departure_time stop_sequence stop_name stop_lat \\\n", "0 17:00:00 3 Zürich, Hürlimannplatz 47.365066 \n", "1 17:02:00 4 Zürich, Waffenplatzstrasse 47.361482 \n", "2 17:03:00 5 Zürich, Hügelstrasse 47.358543 \n", "3 17:04:00 6 Zürich, Brunau/Mutschellenstr. 47.355147 \n", "4 17:05:00 7 Zürich, Thujastrasse 47.350187 \n", "\n", " stop_lon trip_headsign trip_short_name direction_id \\\n", "0 8.526539 Zürich, Neubühl 3870 0 \n", "1 8.525749 Zürich, Neubühl 3870 0 \n", "2 8.526997 Zürich, Neubühl 3870 0 \n", "3 8.527141 Zürich, Neubühl 3870 0 \n", "4 8.527806 Zürich, Neubühl 3870 0 \n", "\n", " departure_first_stop route_int stop_count stop_int route_desc \n", "0 16:55:00 1225 12 1317 Bus \n", "1 16:55:00 1225 12 1267 Bus \n", "2 16:55:00 1225 12 67 Bus \n", "3 16:55:00 1225 12 512 Bus \n", "4 16:55:00 1225 12 403 Bus " ] }, - "execution_count": 20, + "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_ordered = stop_times_curated\n", "stop_times_ordered.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We start by making sure the order is correct" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 54, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
route_idstop_id_generaltrip_idstop_idarrival_timedeparture_timestop_sequencestop_namestop_latstop_lontrip_headsigntrip_short_namedirection_iddeparture_first_stoproute_intstop_countstop_introute_desc
9311126-10-j19-185732051672.TA.26-10-j19-1.11.R857320507:00:0007:01:0027Zürich Flughafen, Bahnhof47.4504418.563729Zürich Flughafen, Fracht4096107:01:0002298Tram
9311226-10-j19-185885531672.TA.26-10-j19-1.11.R858855307:02:0007:02:0028Zürich Flughafen, Fracht47.4524948.572057Zürich Flughafen, Fracht4096107:01:00021295Tram
9311326-13-j19-185762402064.TA.26-13-j19-1.24.H857624007:00:0007:00:005Zürich, Meierhofplatz47.4020108.499374Zürich, Albisgütli1831007:00:001261222Tram
9311426-13-j19-185913532064.TA.26-13-j19-1.24.H859135307:01:0007:01:006Zürich, Schwert47.3997308.504611Zürich, Albisgütli1831007:00:00126816Tram
9311526-13-j19-185910392064.TA.26-13-j19-1.24.H859103907:02:0007:02:007Zürich, Alte Trotte47.3977668.507252Zürich, Albisgütli1831007:00:00126778Tram
\n", "
" ], "text/plain": [ " route_id stop_id_general trip_id stop_id \\\n", "93111 26-10-j19-1 8573205 1672.TA.26-10-j19-1.11.R 8573205 \n", "93112 26-10-j19-1 8588553 1672.TA.26-10-j19-1.11.R 8588553 \n", "93113 26-13-j19-1 8576240 2064.TA.26-13-j19-1.24.H 8576240 \n", "93114 26-13-j19-1 8591353 2064.TA.26-13-j19-1.24.H 8591353 \n", "93115 26-13-j19-1 8591039 2064.TA.26-13-j19-1.24.H 8591039 \n", "\n", " arrival_time departure_time stop_sequence stop_name \\\n", "93111 07:00:00 07:01:00 27 Zürich Flughafen, Bahnhof \n", "93112 07:02:00 07:02:00 28 Zürich Flughafen, Fracht \n", "93113 07:00:00 07:00:00 5 Zürich, Meierhofplatz \n", "93114 07:01:00 07:01:00 6 Zürich, Schwert \n", "93115 07:02:00 07:02:00 7 Zürich, Alte Trotte \n", "\n", " stop_lat stop_lon trip_headsign trip_short_name \\\n", "93111 47.450441 8.563729 Zürich Flughafen, Fracht 4096 \n", "93112 47.452494 8.572057 Zürich Flughafen, Fracht 4096 \n", "93113 47.402010 8.499374 Zürich, Albisgütli 1831 \n", "93114 47.399730 8.504611 Zürich, Albisgütli 1831 \n", "93115 47.397766 8.507252 Zürich, Albisgütli 1831 \n", "\n", " direction_id departure_first_stop route_int stop_count stop_int \\\n", "93111 1 07:01:00 0 2 298 \n", "93112 1 07:01:00 0 2 1295 \n", "93113 0 07:00:00 1 26 1222 \n", "93114 0 07:00:00 1 26 816 \n", "93115 0 07:00:00 1 26 778 \n", "\n", " route_desc \n", "93111 Tram \n", "93112 Tram \n", "93113 Tram \n", "93114 Tram \n", "93115 Tram " ] }, - "execution_count": 23, + "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "stop_times_ordered = stop_times_int.sort_values(by=[\"route_int\", \"departure_first_stop\", \"stop_sequence\"])\n", + "stop_times_ordered = stop_times_int.sort_values(by=[\"route_int\", \"departure_first_stop\", \"departure_time\"])\n", "stop_times_ordered.head(5)" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
arrival_timedeparture_time
9311107:00:0007:01:00
9311207:02:0007:02:00
9311307:00:0007:00:00
9311407:01:0007:01:00
9311507:02:0007:02:00
\n", "
" ], "text/plain": [ " arrival_time departure_time\n", "93111 07:00:00 07:01:00\n", "93112 07:02:00 07:02:00\n", "93113 07:00:00 07:00:00\n", "93114 07:01:00 07:01:00\n", "93115 07:02:00 07:02:00" ] }, - "execution_count": 24, + "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_ordered[[\"arrival_time\", \"departure_time\"]].head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We add None to first arrival time and last departure time." ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
route_idstop_id_generaltrip_idstop_idarrival_timedeparture_timestop_sequencestop_namestop_latstop_lontrip_headsigntrip_short_namedirection_iddeparture_first_stoproute_intstop_countstop_introute_descsequence_shift_1departure_first_shift_1
9311126-10-j19-185732051672.TA.26-10-j19-1.11.R857320507:00:0007:01:0027Zürich Flughafen, Bahnhof47.4504418.563729Zürich Flughafen, Fracht4096107:01:0002298Tram2807:01:00
9311226-10-j19-185885531672.TA.26-10-j19-1.11.R858855307:02:0007:02:0028Zürich Flughafen, Fracht47.4524948.572057Zürich Flughafen, Fracht4096107:01:00021295Tram507:00:00
9311326-13-j19-185762402064.TA.26-13-j19-1.24.H857624007:00:0007:00:005Zürich, Meierhofplatz47.4020108.499374Zürich, Albisgütli1831007:00:001261222Tram607:00:00
9311426-13-j19-185913532064.TA.26-13-j19-1.24.H859135307:01:0007:01:006Zürich, Schwert47.3997308.504611Zürich, Albisgütli1831007:00:00126816Tram707:00:00
9311526-13-j19-185910392064.TA.26-13-j19-1.24.H859103907:02:0007:02:007Zürich, Alte Trotte47.3977668.507252Zürich, Albisgütli1831007:00:00126778Tram807:00:00
\n", "
" ], "text/plain": [ " route_id stop_id_general trip_id stop_id \\\n", "93111 26-10-j19-1 8573205 1672.TA.26-10-j19-1.11.R 8573205 \n", "93112 26-10-j19-1 8588553 1672.TA.26-10-j19-1.11.R 8588553 \n", "93113 26-13-j19-1 8576240 2064.TA.26-13-j19-1.24.H 8576240 \n", "93114 26-13-j19-1 8591353 2064.TA.26-13-j19-1.24.H 8591353 \n", "93115 26-13-j19-1 8591039 2064.TA.26-13-j19-1.24.H 8591039 \n", "\n", " arrival_time departure_time stop_sequence stop_name \\\n", "93111 07:00:00 07:01:00 27 Zürich Flughafen, Bahnhof \n", "93112 07:02:00 07:02:00 28 Zürich Flughafen, Fracht \n", "93113 07:00:00 07:00:00 5 Zürich, Meierhofplatz \n", "93114 07:01:00 07:01:00 6 Zürich, Schwert \n", "93115 07:02:00 07:02:00 7 Zürich, Alte Trotte \n", "\n", " stop_lat stop_lon trip_headsign trip_short_name \\\n", "93111 47.450441 8.563729 Zürich Flughafen, Fracht 4096 \n", "93112 47.452494 8.572057 Zürich Flughafen, Fracht 4096 \n", "93113 47.402010 8.499374 Zürich, Albisgütli 1831 \n", "93114 47.399730 8.504611 Zürich, Albisgütli 1831 \n", "93115 47.397766 8.507252 Zürich, Albisgütli 1831 \n", "\n", " direction_id departure_first_stop route_int stop_count stop_int \\\n", "93111 1 07:01:00 0 2 298 \n", "93112 1 07:01:00 0 2 1295 \n", "93113 0 07:00:00 1 26 1222 \n", "93114 0 07:00:00 1 26 816 \n", "93115 0 07:00:00 1 26 778 \n", "\n", - " route_desc sequence_shift_1 \n", - "93111 Tram 28 \n", - "93112 Tram 5 \n", - "93113 Tram 6 \n", - "93114 Tram 7 \n", - "93115 Tram 8 " + " route_desc departure_first_shift_1 \n", + "93111 Tram 07:01:00 \n", + "93112 Tram 07:00:00 \n", + "93113 Tram 07:00:00 \n", + "93114 Tram 07:00:00 \n", + "93115 Tram 07:00:00 " ] }, - "execution_count": 25, + "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#adding a shift\n", - "stop_times_ordered[\"sequence_shift_1\"] = stop_times_ordered[\"stop_sequence\"].shift(-1, fill_value=0)\n", + "stop_times_ordered[\"departure_first_shift_1\"] = stop_times_ordered[\"departure_first_stop\"].shift(-1, fill_value=0)\n", "stop_times_ordered.head(5)" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 57, "metadata": {}, "outputs": [], "source": [ - "stop_times_ordered['departure_time'] = np.where((stop_times_ordered[\"stop_sequence\"] > stop_times_ordered[\"sequence_shift_1\"]), None, stop_times_ordered['departure_time'])" + "stop_times_ordered['departure_time'] = np.where((stop_times_ordered[\"departure_first_stop\"] != stop_times_ordered[\"departure_first_shift_1\"]), None, stop_times_ordered['departure_time'])" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 58, "metadata": {}, "outputs": [], "source": [ - "stop_times_ordered[\"arrival_time\"] = np.where((stop_times_ordered[\"stop_sequence\"] == 1), None, stop_times_ordered['arrival_time'])" + "stop_times_ordered[\"arrival_time\"] = np.where((stop_times_ordered[\"departure_first_stop\"] == stop_times_ordered[\"departure_time\"]), None, stop_times_ordered['arrival_time'])" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", + " \n", " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", " \n", " \n", "
arrival_timedeparture_timestop_sequencesequence_shift_1
9311107:00:00None07:01:002728
9311207:02:00None285
93113None07:00:0007:00:0056
9311407:01:0007:01:0067
9311507:02:0007:02:0078
\n", "
" ], "text/plain": [ - " arrival_time departure_time stop_sequence sequence_shift_1\n", - "93111 07:00:00 07:01:00 27 28\n", - "93112 07:02:00 None 28 5\n", - "93113 07:00:00 07:00:00 5 6\n", - "93114 07:01:00 07:01:00 6 7\n", - "93115 07:02:00 07:02:00 7 8" + " arrival_time departure_time\n", + "93111 None 07:01:00\n", + "93112 07:02:00 None\n", + "93113 None 07:00:00\n", + "93114 07:01:00 07:01:00\n", + "93115 07:02:00 07:02:00" ] }, - "execution_count": 28, + "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "stop_times_ordered[[\"arrival_time\",\"departure_time\", \"stop_sequence\", \"sequence_shift_1\"]].head(5)" + "stop_times_ordered[[\"arrival_time\",\"departure_time\"]].head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Array structure preparation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### StopTimes: \n", "[[departure_route0_trip0_stop0, arrival_route0_trip0_stop_0], [departure_route0_trip0_stop1, arrival_route0_trip0_stop_1], …], [[departure_route0_trip1_stop0, arrival_route0_trip1_stop_0], …], ….], [[[departure_route1_trip0_stop0, arrival_route1_trip0_stop_0], …], [[departure_route1_trip1_stop0, arrival_route0_trip1_stop_0], …], ….], …]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We transform it in datetime as required by the raptor algorithm" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 62, "metadata": {}, "outputs": [], "source": [ "stop_times_ordered['arrival_time'] = pd.to_datetime(stop_times_ordered['arrival_time'])\n", "stop_times_ordered['departure_time'] = pd.to_datetime(stop_times_ordered['departure_time'])" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", "
arrival_timedeparture_time
931112020-05-22 07:00:002020-05-22 07:01:00NaT2020-05-23 07:01:00
931122020-05-22 07:02:002020-05-23 07:02:00NaT
931132020-05-22 07:00:002020-05-22 07:00:00NaT2020-05-23 07:00:00
931142020-05-22 07:01:002020-05-22 07:01:002020-05-23 07:01:002020-05-23 07:01:00
931152020-05-22 07:02:002020-05-22 07:02:002020-05-23 07:02:002020-05-23 07:02:00
\n", "
" ], "text/plain": [ " arrival_time departure_time\n", - "93111 2020-05-22 07:00:00 2020-05-22 07:01:00\n", - "93112 2020-05-22 07:02:00 NaT\n", - "93113 2020-05-22 07:00:00 2020-05-22 07:00:00\n", - "93114 2020-05-22 07:01:00 2020-05-22 07:01:00\n", - "93115 2020-05-22 07:02:00 2020-05-22 07:02:00" + "93111 NaT 2020-05-23 07:01:00\n", + "93112 2020-05-23 07:02:00 NaT\n", + "93113 NaT 2020-05-23 07:00:00\n", + "93114 2020-05-23 07:01:00 2020-05-23 07:01:00\n", + "93115 2020-05-23 07:02:00 2020-05-23 07:02:00" ] }, - "execution_count": 30, + "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_ordered[[\"arrival_time\", \"departure_time\"]].head(5)" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 129, "metadata": {}, "outputs": [], "source": [ - "with open('../data/stop_times_df.pkl','wb') as f: pickle.dump(stop_times_ordered, f)" + "with open('../data/stop_times_df_cyril.pkl','wb') as f: pickle.dump(stop_times_ordered, f)" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 66, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
route_idstop_id_generaltrip_idstop_idarrival_timedeparture_timestop_sequencestop_namestop_latstop_lontrip_headsigntrip_short_namedirection_iddeparture_first_stoproute_intstop_countstop_introute_descsequence_shift_1departure_first_shift_1
9311126-10-j19-185732051672.TA.26-10-j19-1.11.R85732052020-05-22 07:00:002020-05-22 07:01:00NaT2020-05-23 07:01:0027Zürich Flughafen, Bahnhof47.4504418.563729Zürich Flughafen, Fracht4096107:01:0002298Tram2807:01:00
9311226-10-j19-185885531672.TA.26-10-j19-1.11.R85885532020-05-22 07:02:002020-05-23 07:02:00NaT28Zürich Flughafen, Fracht47.4524948.572057Zürich Flughafen, Fracht4096107:01:00021295Tram507:00:00
9311326-13-j19-185762402064.TA.26-13-j19-1.24.H85762402020-05-22 07:00:002020-05-22 07:00:00NaT2020-05-23 07:00:005Zürich, Meierhofplatz47.4020108.499374Zürich, Albisgütli1831007:00:001261222Tram607:00:00
9311426-13-j19-185913532064.TA.26-13-j19-1.24.H85913532020-05-22 07:01:002020-05-22 07:01:002020-05-23 07:01:002020-05-23 07:01:006Zürich, Schwert47.3997308.504611Zürich, Albisgütli1831007:00:00126816Tram707:00:00
9311526-13-j19-185910392064.TA.26-13-j19-1.24.H85910392020-05-22 07:02:002020-05-22 07:02:002020-05-23 07:02:002020-05-23 07:02:007Zürich, Alte Trotte47.3977668.507252Zürich, Albisgütli1831007:00:00126778Tram807:00:00
\n", "
" ], "text/plain": [ " route_id stop_id_general trip_id stop_id \\\n", "93111 26-10-j19-1 8573205 1672.TA.26-10-j19-1.11.R 8573205 \n", "93112 26-10-j19-1 8588553 1672.TA.26-10-j19-1.11.R 8588553 \n", "93113 26-13-j19-1 8576240 2064.TA.26-13-j19-1.24.H 8576240 \n", "93114 26-13-j19-1 8591353 2064.TA.26-13-j19-1.24.H 8591353 \n", "93115 26-13-j19-1 8591039 2064.TA.26-13-j19-1.24.H 8591039 \n", "\n", " arrival_time departure_time stop_sequence \\\n", - "93111 2020-05-22 07:00:00 2020-05-22 07:01:00 27 \n", - "93112 2020-05-22 07:02:00 NaT 28 \n", - "93113 2020-05-22 07:00:00 2020-05-22 07:00:00 5 \n", - "93114 2020-05-22 07:01:00 2020-05-22 07:01:00 6 \n", - "93115 2020-05-22 07:02:00 2020-05-22 07:02:00 7 \n", + "93111 NaT 2020-05-23 07:01:00 27 \n", + "93112 2020-05-23 07:02:00 NaT 28 \n", + "93113 NaT 2020-05-23 07:00:00 5 \n", + "93114 2020-05-23 07:01:00 2020-05-23 07:01:00 6 \n", + "93115 2020-05-23 07:02:00 2020-05-23 07:02:00 7 \n", "\n", " stop_name stop_lat stop_lon \\\n", "93111 Zürich Flughafen, Bahnhof 47.450441 8.563729 \n", "93112 Zürich Flughafen, Fracht 47.452494 8.572057 \n", "93113 Zürich, Meierhofplatz 47.402010 8.499374 \n", "93114 Zürich, Schwert 47.399730 8.504611 \n", "93115 Zürich, Alte Trotte 47.397766 8.507252 \n", "\n", " trip_headsign trip_short_name direction_id \\\n", "93111 Zürich Flughafen, Fracht 4096 1 \n", "93112 Zürich Flughafen, Fracht 4096 1 \n", "93113 Zürich, Albisgütli 1831 0 \n", "93114 Zürich, Albisgütli 1831 0 \n", "93115 Zürich, Albisgütli 1831 0 \n", "\n", " departure_first_stop route_int stop_count stop_int route_desc \\\n", "93111 07:01:00 0 2 298 Tram \n", "93112 07:01:00 0 2 1295 Tram \n", "93113 07:00:00 1 26 1222 Tram \n", "93114 07:00:00 1 26 816 Tram \n", "93115 07:00:00 1 26 778 Tram \n", "\n", - " sequence_shift_1 \n", - "93111 28 \n", - "93112 5 \n", - "93113 6 \n", - "93114 7 \n", - "93115 8 " + " departure_first_shift_1 \n", + "93111 07:01:00 \n", + "93112 07:00:00 \n", + "93113 07:00:00 \n", + "93114 07:00:00 \n", + "93115 07:00:00 " ] }, - "execution_count": 33, + "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_ordered = stop_times_ordered.sort_values(by=[\"route_int\", \"departure_first_stop\", \"stop_sequence\"])\n", "stop_times_ordered.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "And we transform it to array, ready ti be used by raptor" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 67, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([['2020-05-22T07:00:00.000000000', '2020-05-22T07:01:00.000000000'],\n", - " ['2020-05-22T07:02:00.000000000', 'NaT'],\n", - " ['2020-05-22T07:00:00.000000000', '2020-05-22T07:00:00.000000000'],\n", + "array([[ 'NaT', '2020-05-23T07:01:00.000000000'],\n", + " ['2020-05-23T07:02:00.000000000', 'NaT'],\n", + " [ 'NaT', '2020-05-23T07:00:00.000000000'],\n", " ...,\n", - " ['2020-05-22T07:35:00.000000000', '2020-05-22T07:35:00.000000000'],\n", - " ['2020-05-22T07:36:00.000000000', '2020-05-22T07:36:00.000000000'],\n", - " ['2020-05-22T07:37:00.000000000', 'NaT']],\n", + " ['2020-05-23T07:35:00.000000000', '2020-05-23T07:35:00.000000000'],\n", + " ['2020-05-23T07:36:00.000000000', '2020-05-23T07:36:00.000000000'],\n", + " ['2020-05-23T07:37:00.000000000', 'NaT']],\n", " dtype='datetime64[ns]')" ] }, - "execution_count": 34, + "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_array = stop_times_ordered[[\"arrival_time\", \"departure_time\"]].to_numpy()\n", "stop_times_array" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 68, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "260459" ] }, - "execution_count": 35, + "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.size(stop_times_array,0)" ] }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 69, "metadata": {}, "outputs": [], "source": [ "with open('../data/stop_times_array_cyril.pkl','wb') as f: pickle.dump(stop_times_array, f)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Routes: \n", "[[route0_nr.Trips, route0_nr. Stops, route0_pointerRoutes, route0_pointerStops_times],[route1_nr.Trips, route1_nr. Stops,, route1_pointerRoutes, route1_pointerStops_times],…]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We start by getting the number of trips and stops there is for each route" ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 70, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
n_Tripsn_stops
route_int
012
1126
218
3117
415
\n", "
" ], "text/plain": [ " n_Trips n_stops\n", "route_int \n", "0 1 2\n", "1 1 26\n", "2 1 8\n", "3 1 17\n", "4 1 5" ] }, - "execution_count": 40, + "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct_trips_stops = stop_times_ordered.groupby([\"route_int\"]).nunique()[[\"trip_id\",\"stop_int\"]].sort_index().rename(columns={\"trip_id\": \"n_Trips\", \"stop_int\": \"n_stops\"})\n", "distinct_trips_stops.head(5)" ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 71, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1461, 2)" ] }, - "execution_count": 41, + "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct_trips_stops.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We create the pointer for the route stops, by adding the unique stops for each route" ] }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 72, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
n_Tripsn_stopspointer_routes_stops
route_int
0120
11262
21828
311736
41553
\n", "
" ], "text/plain": [ " n_Trips n_stops pointer_routes_stops\n", "route_int \n", "0 1 2 0\n", "1 1 26 2\n", "2 1 8 28\n", "3 1 17 36\n", "4 1 5 53" ] }, - "execution_count": 42, + "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct_trips_stops['pointer_routes_stops'] = distinct_trips_stops.n_stops.cumsum().shift(1, fill_value=0)\n", "distinct_trips_stops.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We create the pointer for stop_times by adding the number of stops in each route, counting duplicates (due to several trips)" ] }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 73, "metadata": {}, "outputs": [], "source": [ "distinct_trips_stops[\"pointer_stop_times\"] = (stop_times_ordered.groupby([\"route_int\"]).count().stop_id).cumsum().shift(1, fill_value=0)" ] }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 74, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
n_Tripsn_stopspointer_routes_stopspointer_stop_timespointer_routes_stops_shiftpointer_stop_times_shift
route_int
0120022
1126222828
21828283636
311736365353
41553535858
\n", "
" ], "text/plain": [ " n_Trips n_stops pointer_routes_stops pointer_stop_times \\\n", "route_int \n", "0 1 2 0 0 \n", "1 1 26 2 2 \n", "2 1 8 28 28 \n", "3 1 17 36 36 \n", "4 1 5 53 53 \n", "\n", " pointer_routes_stops_shift pointer_stop_times_shift \n", "route_int \n", "0 2 2 \n", "1 28 28 \n", "2 36 36 \n", "3 53 53 \n", "4 58 58 " ] }, - "execution_count": 44, + "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct_trips_stops[\"pointer_routes_stops_shift\"] = distinct_trips_stops['pointer_routes_stops'].shift(-1, fill_value=0)\n", "distinct_trips_stops[\"pointer_stop_times_shift\"] = distinct_trips_stops['pointer_stop_times'].shift(-1, fill_value=0)\n", "distinct_trips_stops.head(5)" ] }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 75, "metadata": {}, "outputs": [], "source": [ "distinct_trips_stops['pointer_routes_stops'] = np.where((distinct_trips_stops[\"pointer_routes_stops\"] == distinct_trips_stops[\"pointer_routes_stops_shift\"]), None, distinct_trips_stops['pointer_routes_stops'])\n", "distinct_trips_stops['pointer_stop_times'] = np.where((distinct_trips_stops[\"pointer_stop_times\"] == distinct_trips_stops[\"pointer_stop_times_shift\"]), None, distinct_trips_stops['pointer_stop_times'])\n" ] }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 76, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "n_Trips False\n", "n_stops False\n", "pointer_routes_stops False\n", "pointer_stop_times False\n", "pointer_routes_stops_shift False\n", "pointer_stop_times_shift False\n", "dtype: bool" ] }, - "execution_count": 46, + "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct_trips_stops.isna().any()" ] }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 77, "metadata": {}, "outputs": [], "source": [ "with open('../data/routes_array_df_cyril.pkl','wb') as f: pickle.dump(distinct_trips_stops[['n_Trips', 'n_stops', 'pointer_routes_stops', 'pointer_stop_times']], f)" ] }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 78, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 1461 entries, 0 to 1460\n", "Data columns (total 6 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 n_Trips 1461 non-null int64 \n", " 1 n_stops 1461 non-null int64 \n", " 2 pointer_routes_stops 1461 non-null object\n", " 3 pointer_stop_times 1461 non-null object\n", " 4 pointer_routes_stops_shift 1461 non-null int64 \n", " 5 pointer_stop_times_shift 1461 non-null int64 \n", "dtypes: int64(4), object(2)\n", "memory usage: 79.9+ KB\n" ] } ], "source": [ "distinct_trips_stops.info()" ] }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 79, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[1, 2, 0, 0],\n", " [1, 26, 2, 2],\n", " [1, 8, 28, 28],\n", " ...,\n", " [1, 3, 15297, 260396],\n", " [2, 16, 15300, 260399],\n", " [1, 28, 15316, 260431]], dtype=object)" ] }, - "execution_count": 49, + "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "routes_array = distinct_trips_stops[['n_Trips', 'n_stops', 'pointer_routes_stops', 'pointer_stop_times']].to_numpy()\n", "routes_array" ] }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 80, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1461" ] }, - "execution_count": 50, + "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.size(routes_array, 0)" ] }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 81, "metadata": {}, "outputs": [], "source": [ "with open('../data/routes_array_cyril.pkl','wb') as f: pickle.dump(routes_array, f)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "RouteStops: [route0_stop0, route0_stop1,…, route1_stop0, route1_stop1,…, …]\n" ] }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 82, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexroute_intstop_int
0931110298
19311201295
29311311222
3931141816
4931151778
\n", "
" ], "text/plain": [ " index route_int stop_int\n", "0 93111 0 298\n", "1 93112 0 1295\n", "2 93113 1 1222\n", "3 93114 1 816\n", "4 93115 1 778" ] }, - "execution_count": 52, + "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "route_stops = stop_times_ordered.sort_values([\"route_int\", \"stop_sequence\"])\n", "route_stops = route_stops[['route_int', 'stop_int']].drop_duplicates().reset_index()\n", "route_stops.head(5)" ] }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 83, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 15344 entries, 0 to 15343\n", "Data columns (total 3 columns):\n", " # Column Non-Null Count Dtype\n", "--- ------ -------------- -----\n", " 0 index 15344 non-null int64\n", " 1 route_int 15344 non-null int64\n", " 2 stop_int 15344 non-null int64\n", "dtypes: int64(3)\n", "memory usage: 359.8 KB\n" ] } ], "source": [ "route_stops.info()" ] }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 84, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1461" ] }, - "execution_count": 54, + "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "route_stops.route_int.nunique()" ] }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 85, "metadata": {}, "outputs": [], "source": [ "with open('../data/route_stops_df_cyril.pkl','wb') as f: pickle.dump(route_stops, f)" ] }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 86, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 298, 1295, 1222, ..., 1349, 1042, 549])" ] }, - "execution_count": 56, + "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "route_stops_array = route_stops.stop_int.to_numpy()\n", "route_stops_array" ] }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 87, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1407" ] }, - "execution_count": 57, + "execution_count": 87, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.size(np.unique(route_stops_array))" ] }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 88, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "15344" ] }, - "execution_count": 58, + "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.size(route_stops_array, 0)" ] }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 89, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(15344,)" ] }, - "execution_count": 59, + "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "route_stops_array.shape" ] }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 90, "metadata": {}, "outputs": [], "source": [ "with open('../data/route_stops_array_cyril.pkl','wb') as f: pickle.dump(route_stops_array, f)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check if pointers are correct\n", "It is fundamental that the indexes, that serve as pointers, in Routes are correct" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We start by looking at where the indexes for stop_times and route_stops diverge. This will allow us to change. We can see that Route stops should have a new route at 3 while stop_times should have it at 78, so we try with that" ] }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 91, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
n_Tripsn_stopspointer_routes_stopspointer_stop_timespointer_routes_stops_shiftpointer_stop_times_shift
route_int
0120022
1126222828
21828283636
311736365353
41553535858
\n", "
" ], "text/plain": [ " n_Trips n_stops pointer_routes_stops pointer_stop_times \\\n", "route_int \n", "0 1 2 0 0 \n", "1 1 26 2 2 \n", "2 1 8 28 28 \n", "3 1 17 36 36 \n", "4 1 5 53 53 \n", "\n", " pointer_routes_stops_shift pointer_stop_times_shift \n", "route_int \n", "0 2 2 \n", "1 28 28 \n", "2 36 36 \n", "3 53 53 \n", "4 58 58 " ] }, - "execution_count": 61, + "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct_trips_stops.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can check if the pointer indicates the routes index number. At the pointer_routes should indicate the first stop of a new route. We try with 3 to see if route_stops has a new route at this index. It does so it works" ] }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 92, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexroute_intstop_int
0931110298
19311201295
29311311222
3931141816
4931151778
\n", "
" ], "text/plain": [ " index route_int stop_int\n", "0 93111 0 298\n", "1 93112 0 1295\n", "2 93113 1 1222\n", "3 93114 1 816\n", "4 93115 1 778" ] }, - "execution_count": 62, + "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ "route_stops.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We go and see if stop_times has a new route at 78. It does, so it works" ] }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 93, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
route_idstop_id_generaltrip_idstop_idarrival_timedeparture_timestop_sequencestop_namestop_latstop_lontrip_headsigntrip_short_namedirection_iddeparture_first_stoproute_intstop_countstop_introute_descsequence_shift_1departure_first_shift_1
7526-66-j19-185910988.TA.26-66-j19-1.1.H85910982020-05-22 18:04:002020-05-22 18:04:002020-05-23 18:04:002020-05-23 18:04:006Zürich, Brunau/Mutschellenstr.47.3551478.527141Zürich, Neubühl3762017:55:00122512512Bus717:55:00
7626-66-j19-185913928.TA.26-66-j19-1.1.H85913922020-05-22 18:05:002020-05-22 18:05:002020-05-23 18:05:002020-05-23 18:05:007Zürich, Thujastrasse47.3501878.527806Zürich, Neubühl3762017:55:00122512403Bus817:55:00
7726-66-j19-185912168.TA.26-66-j19-1.1.H85912162020-05-22 18:06:002020-05-22 18:06:002020-05-23 18:06:002020-05-23 18:06:008Zürich, Jugendherberge47.3480028.528210Zürich, Neubühl3762017:55:001225121375Bus917:55:00
7826-66-j19-185912798.TA.26-66-j19-1.1.H85912792020-05-22 18:08:002020-05-22 18:08:002020-05-23 18:08:002020-05-23 18:08:009Zürich, Morgental47.3439488.530141Zürich, Neubühl3762017:55:001225121349Bus1017:55:00
7926-66-j19-185912178.TA.26-66-j19-1.1.H85912172020-05-22 18:09:002020-05-22 18:09:002020-05-23 18:09:002020-05-23 18:09:0010Zürich, Kalchbühlweg47.3418188.531049Zürich, Neubühl3762017:55:001225121303Bus1117:55:00
\n", "
" ], "text/plain": [ " route_id stop_id_general trip_id stop_id \\\n", "75 26-66-j19-1 8591098 8.TA.26-66-j19-1.1.H 8591098 \n", "76 26-66-j19-1 8591392 8.TA.26-66-j19-1.1.H 8591392 \n", "77 26-66-j19-1 8591216 8.TA.26-66-j19-1.1.H 8591216 \n", "78 26-66-j19-1 8591279 8.TA.26-66-j19-1.1.H 8591279 \n", "79 26-66-j19-1 8591217 8.TA.26-66-j19-1.1.H 8591217 \n", "\n", " arrival_time departure_time stop_sequence \\\n", - "75 2020-05-22 18:04:00 2020-05-22 18:04:00 6 \n", - "76 2020-05-22 18:05:00 2020-05-22 18:05:00 7 \n", - "77 2020-05-22 18:06:00 2020-05-22 18:06:00 8 \n", - "78 2020-05-22 18:08:00 2020-05-22 18:08:00 9 \n", - "79 2020-05-22 18:09:00 2020-05-22 18:09:00 10 \n", + "75 2020-05-23 18:04:00 2020-05-23 18:04:00 6 \n", + "76 2020-05-23 18:05:00 2020-05-23 18:05:00 7 \n", + "77 2020-05-23 18:06:00 2020-05-23 18:06:00 8 \n", + "78 2020-05-23 18:08:00 2020-05-23 18:08:00 9 \n", + "79 2020-05-23 18:09:00 2020-05-23 18:09:00 10 \n", "\n", " stop_name stop_lat stop_lon trip_headsign \\\n", "75 Zürich, Brunau/Mutschellenstr. 47.355147 8.527141 Zürich, Neubühl \n", "76 Zürich, Thujastrasse 47.350187 8.527806 Zürich, Neubühl \n", "77 Zürich, Jugendherberge 47.348002 8.528210 Zürich, Neubühl \n", "78 Zürich, Morgental 47.343948 8.530141 Zürich, Neubühl \n", "79 Zürich, Kalchbühlweg 47.341818 8.531049 Zürich, Neubühl \n", "\n", " trip_short_name direction_id departure_first_stop route_int stop_count \\\n", "75 3762 0 17:55:00 1225 12 \n", "76 3762 0 17:55:00 1225 12 \n", "77 3762 0 17:55:00 1225 12 \n", "78 3762 0 17:55:00 1225 12 \n", "79 3762 0 17:55:00 1225 12 \n", "\n", - " stop_int route_desc sequence_shift_1 \n", - "75 512 Bus 7 \n", - "76 403 Bus 8 \n", - "77 1375 Bus 9 \n", - "78 1349 Bus 10 \n", - "79 1303 Bus 11 " + " stop_int route_desc departure_first_shift_1 \n", + "75 512 Bus 17:55:00 \n", + "76 403 Bus 17:55:00 \n", + "77 1375 Bus 17:55:00 \n", + "78 1349 Bus 17:55:00 \n", + "79 1303 Bus 17:55:00 " ] }, - "execution_count": 63, + "execution_count": 93, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_ordered.loc[75:80].head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Stops: [[stop0_pointerRoutes, stop0_pointerTransfer], [stop1_pointerRoutes, stop1_pointerTransfer], …]" ] }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 94, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexroute_intstop_intUnnamed: 0stop_idstop_id2distanceTransfer_time_secstop_namestop_name2stop_int_2
09311102984536.085732058503016:0:20.101546121.0Zürich Flughafen, BahnhofZürich Flughafen1218.0
09311102984558.08573205:0:A8503016:0:20.118159141.0Zürich Flughafen, BahnhofZürich Flughafen1218.0
09311102984580.08573205:0:B8503016:0:20.104861125.0Zürich Flughafen, BahnhofZürich Flughafen1218.0
09311102984624.08573205:0:D8503016:0:20.103327123.0Zürich Flughafen, BahnhofZürich Flughafen1218.0
09311102984646.08573205:0:E8503016:0:20.101546121.0Zürich Flughafen, BahnhofZürich Flughafen1218.0
\n", "
" ], "text/plain": [ " index route_int stop_int Unnamed: 0 stop_id stop_id2 distance \\\n", "0 93111 0 298 4536.0 8573205 8503016:0:2 0.101546 \n", "0 93111 0 298 4558.0 8573205:0:A 8503016:0:2 0.118159 \n", "0 93111 0 298 4580.0 8573205:0:B 8503016:0:2 0.104861 \n", "0 93111 0 298 4624.0 8573205:0:D 8503016:0:2 0.103327 \n", "0 93111 0 298 4646.0 8573205:0:E 8503016:0:2 0.101546 \n", "\n", " Transfer_time_sec stop_name stop_name2 stop_int_2 \n", "0 121.0 Zürich Flughafen, Bahnhof Zürich Flughafen 1218.0 \n", "0 141.0 Zürich Flughafen, Bahnhof Zürich Flughafen 1218.0 \n", "0 125.0 Zürich Flughafen, Bahnhof Zürich Flughafen 1218.0 \n", "0 123.0 Zürich Flughafen, Bahnhof Zürich Flughafen 1218.0 \n", "0 121.0 Zürich Flughafen, Bahnhof Zürich Flughafen 1218.0 " ] }, - "execution_count": 64, + "execution_count": 94, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stops_join = route_stops.join(transfers.set_index(\"stop_int\"), how=\"left\", on=\"stop_int\").drop_duplicates()\n", "stops_join.head(5)" ] }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 95, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1407" ] }, - "execution_count": 65, + "execution_count": 95, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stops_join.stop_int.nunique()" ] }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 96, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
n_Routesn_Transfers
stop_int
01816
1112
2234
366
460
\n", "
" ], "text/plain": [ " n_Routes n_Transfers\n", "stop_int \n", "0 18 16\n", "1 11 2\n", "2 23 4\n", "3 6 6\n", "4 6 0" ] }, - "execution_count": 66, + "execution_count": 96, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct_route_transfers = stops_join.sort_values(\"stop_int\").groupby([\"stop_int\"]).nunique().rename(columns={\"route_int\": \"n_Routes\", \"stop_int_2\": \"n_Transfers\"})\n", "distinct_route_transfers = distinct_route_transfers[[\"n_Routes\", \"n_Transfers\"]].sort_index()\n", "distinct_route_transfers.head(5)" ] }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 97, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
n_Routesn_Transferspointer_stop_routespointer_transfers
stop_int
0181600
11121816
22342918
3665222
4605828
\n", "
" ], "text/plain": [ " n_Routes n_Transfers pointer_stop_routes pointer_transfers\n", "stop_int \n", "0 18 16 0 0\n", "1 11 2 18 16\n", "2 23 4 29 18\n", "3 6 6 52 22\n", "4 6 0 58 28" ] }, - "execution_count": 67, + "execution_count": 97, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct_route_transfers['pointer_stop_routes'] = distinct_route_transfers.n_Routes.cumsum().shift(1, fill_value=0)\n", "distinct_route_transfers['pointer_transfers'] = distinct_route_transfers.n_Transfers.cumsum().shift(1, fill_value=0)\n", "distinct_route_transfers.head(5)" ] }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 98, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
n_Routesn_Transferspointer_stop_routespointer_transferspointer_stop_routes_shiftpointer_transfers_shift
stop_int
01816001816
111218162918
223429185222
36652225828
46058286428
\n", "
" ], "text/plain": [ " n_Routes n_Transfers pointer_stop_routes pointer_transfers \\\n", "stop_int \n", "0 18 16 0 0 \n", "1 11 2 18 16 \n", "2 23 4 29 18 \n", "3 6 6 52 22 \n", "4 6 0 58 28 \n", "\n", " pointer_stop_routes_shift pointer_transfers_shift \n", "stop_int \n", "0 18 16 \n", "1 29 18 \n", "2 52 22 \n", "3 58 28 \n", "4 64 28 " ] }, - "execution_count": 68, + "execution_count": 98, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct_route_transfers[\"pointer_stop_routes_shift\"] = distinct_route_transfers['pointer_stop_routes'].shift(-1, fill_value=0)\n", "distinct_route_transfers[\"pointer_transfers_shift\"] = distinct_route_transfers['pointer_transfers'].shift(-1, fill_value=0)\n", "distinct_route_transfers.head(5)" ] }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 99, "metadata": {}, "outputs": [], "source": [ "distinct_route_transfers['pointer_stop_routes'] = np.where((distinct_route_transfers[\"pointer_stop_routes\"] == distinct_route_transfers[\"pointer_stop_routes_shift\"]), None, distinct_route_transfers['pointer_stop_routes'])\n", "distinct_route_transfers['pointer_transfers'] = np.where((distinct_route_transfers[\"pointer_transfers\"] == distinct_route_transfers[\"pointer_transfers_shift\"]), None, distinct_route_transfers['pointer_transfers'])\n" ] }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 100, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "n_Routes False\n", "n_Transfers False\n", "pointer_stop_routes False\n", "pointer_transfers True\n", "pointer_stop_routes_shift False\n", "pointer_transfers_shift False\n", "dtype: bool" ] }, - "execution_count": 70, + "execution_count": 100, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct_route_transfers.isna().any()" ] }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 101, "metadata": {}, "outputs": [], "source": [ "stops_df = distinct_route_transfers[['pointer_stop_routes', 'pointer_transfers']]" ] }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 102, "metadata": {}, "outputs": [], "source": [ "with open('../data/stops_df.pkl','wb') as f: pickle.dump(stops_df, f)" ] }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 103, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0, 0],\n", " [18, 16],\n", " [29, 18],\n", " ...,\n", " [15329, 6322],\n", " [15334, 6329],\n", " [15339, 6334]], dtype=object)" ] }, - "execution_count": 73, + "execution_count": 103, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stops_array = stops_df.to_numpy()\n", "stops_array" ] }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 104, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1407" ] }, - "execution_count": 74, + "execution_count": 104, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.size(stops_array, 0)" ] }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 105, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1407, 2)" ] }, - "execution_count": 75, + "execution_count": 105, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stops_array.shape" ] }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 106, "metadata": {}, "outputs": [], "source": [ "with open('../data/stops_array_cyril.pkl','wb') as f: pickle.dump(stops_array, f)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "StopRoutes: [stop0_route1, stop0_route3, stop1_route1, stop2_route1, stop1_route4, …]" ] }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 107, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexroute_intstop_intstop_id
0871638208503088:0:22
19554312908503088:0:21
212933218708503088:0:21
37384821108503088:0:22
414728525108503088:0:21
\n", "
" ], "text/plain": [ " index route_int stop_int stop_id\n", "0 87163 82 0 8503088:0:22\n", "1 95543 129 0 8503088:0:21\n", "2 129332 187 0 8503088:0:21\n", "3 73848 211 0 8503088:0:22\n", "4 147285 251 0 8503088:0:21" ] }, - "execution_count": 77, + "execution_count": 107, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_routes = stop_times_ordered[[\"route_int\", \"stop_int\", \"stop_id\"]].drop_duplicates().sort_values([\"stop_int\", \"route_int\"])\n", "stop_routes = stop_routes.reset_index()\n", "stop_routes.head(5)" ] }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 108, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(15486, 4)" ] }, - "execution_count": 78, + "execution_count": 108, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_routes.shape" ] }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 109, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "249" ] }, - "execution_count": 79, + "execution_count": 109, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_curated.route_id.nunique()" ] }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 110, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1461" ] }, - "execution_count": 80, + "execution_count": 110, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_routes.route_int.nunique()" ] }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 111, "metadata": {}, "outputs": [], "source": [ "with open('../data/stop_routes_df_cyril.pkl','wb') as f: pickle.dump(stop_routes, f)" ] }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 112, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 82, 129, 187, ..., 855, 977, 1087])" ] }, - "execution_count": 82, + "execution_count": 112, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_routes_array = stop_routes[\"route_int\"].to_numpy()\n", "stop_routes_array" ] }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 113, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "15486" ] }, - "execution_count": 83, + "execution_count": 113, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.size(stop_routes_array, 0)" ] }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 114, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(15486,)" ] }, - "execution_count": 84, + "execution_count": 114, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_routes_array.shape" ] }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 115, "metadata": {}, "outputs": [], "source": [ "with open('../data/stop_routes_array_cyril.pkl','wb') as f: pickle.dump(stop_routes_array, f)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Transfer: [[[stop0_nameTargetStop1, transferTime1], [stop0_nameTargetStop2, transferTime2],….], [stop1_nameTargetStop1, transferTime1], [stop1_nameTargetStop2, transferTime2],….],…]" ] }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 116, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "9434" ] }, - "execution_count": 86, + "execution_count": 116, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transfers.stop_id.count()" ] }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 117, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stop_intstop_int_2Transfer_time_sec
0008
1051564
20274441
30375594
40462489
\n", "
" ], "text/plain": [ " stop_int stop_int_2 Transfer_time_sec\n", "0 0 0 8\n", "1 0 51 564\n", "2 0 274 441\n", "3 0 375 594\n", "4 0 462 489" ] }, - "execution_count": 89, + "execution_count": 117, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transfer_pandas = transfers[[\"stop_int\",\"stop_int_2\", \"Transfer_time_sec\"]].sort_values([\"stop_int\", \"stop_int_2\"]).drop_duplicates([\"stop_int\", \"stop_int_2\"])\n", "transfer_pandas = transfer_pandas.reset_index(drop=True)\n", "transfer_pandas.head()" ] }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 118, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1342" ] }, - "execution_count": 90, + "execution_count": 118, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transfer_pandas.stop_int_2.nunique()" ] }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 119, "metadata": {}, "outputs": [], "source": [ "with open('../data/transfer_df_cyril.pkl','wb') as f: pickle.dump(transfers.sort_values(\"stop_id\"), f)" ] }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 120, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0, 8],\n", " [ 51, 564],\n", " [ 274, 441],\n", " ...,\n", " [1120, 345],\n", " [1266, 561],\n", " [1406, 8]])" ] }, - "execution_count": 92, + "execution_count": 120, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transfer_array = transfer_pandas[[\"stop_int_2\", \"Transfer_time_sec\"]].to_numpy()\n", "transfer_array" ] }, { "cell_type": "code", - "execution_count": 93, + "execution_count": 121, "metadata": {}, "outputs": [], "source": [ "with open('../data/transfer_array_cyril.pkl','wb') as f: pickle.dump(transfer_array, f)" ] }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 122, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "6342" ] }, - "execution_count": 94, + "execution_count": 122, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.size(transfer_array, 0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Check if indexes in stops is correct" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We see first the pointers" ] }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 123, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pointer_stop_routespointer_transfers
stop_int
000
11816
22918
35222
458None
\n", "
" ], "text/plain": [ " pointer_stop_routes pointer_transfers\n", "stop_int \n", "0 0 0\n", "1 18 16\n", "2 29 18\n", "3 52 22\n", "4 58 None" ] }, - "execution_count": 95, + "execution_count": 123, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stops_df.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We see that at the index 16 there should be a new stop. we check and it is false" ] }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 124, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stop_intstop_int_2Transfer_time_sec
1501289460
161814267
1711350569
18238346
1921062413
\n", "
" ], "text/plain": [ " stop_int stop_int_2 Transfer_time_sec\n", "15 0 1289 460\n", "16 1 814 267\n", "17 1 1350 569\n", "18 2 38 346\n", "19 2 1062 413" ] }, - "execution_count": 97, + "execution_count": 124, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transfer_pandas.loc[15:20].head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We see that at index 18 we should have a new stop. we check and it true" ] }, { "cell_type": "code", - "execution_count": 99, + "execution_count": 125, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexroute_intstop_intstop_id
15240179103908503088:0:21
16150919107808503088:0:21
1726670131608503088:0:21
18938571818502508
1923650811418502508
\n", "
" ], "text/plain": [ " index route_int stop_int stop_id\n", "15 240179 1039 0 8503088:0:21\n", "16 150919 1078 0 8503088:0:21\n", "17 26670 1316 0 8503088:0:21\n", "18 93857 18 1 8502508\n", "19 236508 114 1 8502508" ] }, - "execution_count": 99, + "execution_count": 125, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_routes.loc[15:20].head(5)" ] }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 126, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexroute_intstop_intstop_id
70241089033826178503006:0:5
82041089013827248503011:0:2
1259910890038211388503010:0:2
1294010890238211768503000:0:33
1359010890438212188503016:0:3
\n", "
" ], "text/plain": [ " index route_int stop_int stop_id\n", "7024 108903 382 617 8503006:0:5\n", "8204 108901 382 724 8503011:0:2\n", "12599 108900 382 1138 8503010:0:2\n", "12940 108902 382 1176 8503000:0:33\n", "13590 108904 382 1218 8503016:0:3" ] }, - "execution_count": 106, + "execution_count": 126, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_routes.loc[stop_routes['route_int'] == 382]" ] }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 127, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexroute_intstop_int
40241089003821138
4025108901382724
40261089023821176
4027108903382617
40281089043821218
\n", "
" ], "text/plain": [ " index route_int stop_int\n", "4024 108900 382 1138\n", "4025 108901 382 724\n", "4026 108902 382 1176\n", "4027 108903 382 617\n", "4028 108904 382 1218" ] }, - "execution_count": 108, + "execution_count": 127, "metadata": {}, "output_type": "execute_result" } ], "source": [ "route_stops.loc[route_stops['route_int'] == 382]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "read files as pickles" ] }, { "cell_type": "code", - "execution_count": 102, + "execution_count": 128, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '../data/stop_times_array.pkl'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'../data/stop_times_array.pkl'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'rb'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0marrayname1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../data/stop_times_array.pkl'" + ] + } + ], "source": [ - "with open('../data/stop_times_array.pkl','rb') as f: arrayname1 = pickle.load(f)" + "with open('../data/stop_times_array_cyril.pkl','rb') as f: arrayname1 = pickle.load(f)" ] }, { "cell_type": "code", "execution_count": 103, "metadata": {}, "outputs": [ { "ename": "UnpicklingError", "evalue": "invalid load key, 'v'.", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mUnpicklingError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'../data/routes_array.pkl'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'rb'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0marrayname2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mUnpicklingError\u001b[0m: invalid load key, 'v'." ] } ], "source": [ - "with open('../data/routes_array.pkl','rb') as f: arrayname2 = pickle.load(f)" + "with open('../data/routes_array_cyril.pkl','rb') as f: arrayname2 = pickle.load(f)" ] }, { "cell_type": "code", "execution_count": 104, "metadata": {}, "outputs": [ { "ename": "UnpicklingError", "evalue": "invalid load key, 'v'.", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mUnpicklingError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'../data/route_stops_array.pkl'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'rb'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0marrayname3\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mUnpicklingError\u001b[0m: invalid load key, 'v'." ] } ], "source": [ - "with open('../data/route_stops_array.pkl','rb') as f: arrayname3 = pickle.load(f)" + "with open('../data/route_stops_array_cyril.pkl','rb') as f: arrayname3 = pickle.load(f)" ] }, { "cell_type": "code", "execution_count": 105, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([['2020-05-22T07:00:00.000000000', '2020-05-22T07:01:00.000000000'],\n", " ['2020-05-22T07:02:00.000000000', 'NaT'],\n", " ['2020-05-22T07:00:00.000000000', '2020-05-22T07:00:00.000000000'],\n", " ...,\n", " ['2020-05-22T07:35:00.000000000', '2020-05-22T07:35:00.000000000'],\n", " ['2020-05-22T07:36:00.000000000', '2020-05-22T07:36:00.000000000'],\n", " ['2020-05-22T07:37:00.000000000', 'NaT']],\n", " dtype='datetime64[ns]')" ] }, "execution_count": 105, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arrayname1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "arrayname2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "arrayname3" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 } diff --git a/notebooks/transfer_to_local.ipynb b/notebooks/transfer_to_local.ipynb index c9649d5..8196c42 100644 --- a/notebooks/transfer_to_local.ipynb +++ b/notebooks/transfer_to_local.ipynb @@ -1,261 +1,261 @@ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## transfer files from HDFS to local\n", "\n", "
Any application without a proper name would be promptly killed.
" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "Current session configs: {'conf': {'spark.app.name': 'lgptguys_final'}, 'kind': 'pyspark'}
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", - "
IDYARN Application IDKindStateSpark UIDriver logCurrent session?
7704application_1589299642358_2200pysparkidleLinkLink
7735application_1589299642358_2231pysparkidleLinkLink
7737application_1589299642358_2233pysparkidleLinkLink
7739application_1589299642358_2235pysparkdeadLinkLink
7743application_1589299642358_2239pysparkidleLinkLink
7745application_1589299642358_2241pysparkidleLinkLink
7750application_1589299642358_2246pysparkbusyLinkLink
7753application_1589299642358_2249pysparkidleLinkLink
7756application_1589299642358_2252pysparkidleLinkLink
7759application_1589299642358_2255pysparkbusyLinkLink
7760application_1589299642358_2256pysparkidleLinkLink
7761application_1589299642358_2257pysparkidleLinkLink
7762application_1589299642358_2258pysparkidleLinkLink
7764application_1589299642358_2260pysparkidleLinkLink
7767application_1589299642358_2263pysparkidleLinkLink
7768application_1589299642358_2264pysparkidleLinkLink
7770application_1589299642358_2266pysparkidleLinkLink
7772application_1589299642358_2268pysparkidleLinkLink
7773application_1589299642358_2269pysparkidleLinkLink
7774application_1589299642358_2270pysparkidleLinkLink
7775application_1589299642358_2272pysparkidleLinkLink
7776application_1589299642358_2273pysparkidleLinkLink
7777application_1589299642358_2274pysparkidleLinkLink
7778application_1589299642358_2275pysparkidleLinkLink
7779application_1589299642358_2276pysparkbusyLinkLink
7780application_1589299642358_2277pysparkidleLinkLink
7781application_1589299642358_2278pysparkbusyLinkLink
" + "IDYARN Application IDKindStateSpark UIDriver logCurrent session?7932application_1589299642358_2450pysparkidleLinkLink7933application_1589299642358_2451pysparkidleLinkLink7935application_1589299642358_2453pysparkidleLinkLink7939application_1589299642358_2457pysparkidleLinkLink7940application_1589299642358_2458pysparkidleLinkLink7941application_1589299642358_2459pysparkidleLinkLink7942application_1589299642358_2460pysparkidleLinkLink7944application_1589299642358_2462pysparkidleLinkLink7945application_1589299642358_2463pysparkdeadLinkLink7946application_1589299642358_2464pysparkidleLinkLink7947application_1589299642358_2465pysparkidleLinkLink7948application_1589299642358_2466pysparkidleLinkLink7949application_1589299642358_2467pysparkidleLinkLink" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%%configure\n", "{\"conf\": {\n", " \"spark.app.name\": \"lgptguys_final\"\n", "}}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Start Spark" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Starting Spark application\n" ] }, { "data": { "text/html": [ "\n", - "
IDYARN Application IDKindStateSpark UIDriver logCurrent session?
7782application_1589299642358_2279pysparkidleLinkLink
" + "IDYARN Application IDKindStateSpark UIDriver logCurrent session?7950application_1589299642358_2468pysparkidleLinkLink✔" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "SparkSession available as 'spark'.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "An error was encountered:\n", "unknown magic command '%spark'\n", "UnknownMagic: unknown magic command '%spark'\n", "\n" ] } ], "source": [ "# Initialization\n", "%%spark" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Transfert and save tables from hdfs to local \n", "\n", "Here we describe the process of loading a table on hdfs, saving it to a proper place so that we can load it in local and then save it.\n", "\n", "First, we load the data that is in an otherwise not accessible place in hdfs :" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "username = 'tturner'\n", "\n", "my_files = ['stop_times_curated.csv',\n", " 'stops_15km.csv', 'transfers.csv', 'stop_times_final_cyril.csv']\n", "\n", "for file in my_files:\n", " this_file = spark.read.csv('data/lgpt_guys/{}'.format(file), \\\n", " header = True) \n", " this_file.write.csv(\"/user/{0}/{1}\".format(username, file.replace('.csv','')), \\\n", " header = True, mode = 'overwrite')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/work/final_project/notebooks\n", "stop_times_curated\n", "stops_15km\n", "transfers\n", "stop_times_final_cyril\n" ] } ], "source": [ "%local\n", "\n", "from hdfs3 import HDFileSystem\n", "import pandas as pd\n", "import numpy as np \n", "import os\n", "print(os.getcwd())\n", "\n", "hdfs = HDFileSystem(host='hdfs://iccluster044.iccluster.epfl.ch', port=8020, user='ebouille')\n", "\n", "username = 'tturner'\n", "\n", "my_folders = ['stop_times_curated', \n", " 'stops_15km', 'transfers', 'stop_times_final_cyril']\n", "\n", "for folder in my_folders:\n", " print(folder)\n", " array_files = hdfs.glob('/user/{0}/{1}/*.csv'.format(username, folder))\n", " array = pd.DataFrame()\n", " for file in array_files:\n", " with hdfs.open(file) as f:\n", " array = array.append(pd.read_csv(f))\n", "\n", " array.to_csv('../data/{}.csv'.format(folder), header=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "PySpark", "language": "", "name": "pysparkkernel" }, "language_info": { "codemirror_mode": { "name": "python", "version": 3 }, "mimetype": "text/x-python", "name": "pyspark", "pygments_lexer": "python3" } }, "nbformat": 4, "nbformat_minor": 4 }