diff --git a/.gitattributes b/.gitattributes
index 101dcef..699bf1a 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,37 +1,38 @@
data/distributions.pickle filter=lfs diff=lfs merge=lfs -text
data/stop_times_array_version2.csv filter=lfs diff=lfs merge=lfs -text
data/transfer_array_version2.csv filter=lfs diff=lfs merge=lfs -text
data/routes_array_version2.csv filter=lfs diff=lfs merge=lfs -text
data/route_stops_array_version2.csv filter=lfs diff=lfs merge=lfs -text
data/stop_routes_array_version3.csv filter=lfs diff=lfs merge=lfs -text
data/stops_array_version2.csv filter=lfs diff=lfs merge=lfs -text
object.data filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
data/*.pkl filter=lfs diff=lfs merge=lfs -text
data/dere.pkl filter=lfs diff=lfs merge=lfs -text
data/transfer_array.pkl filter=lfs diff=lfs merge=lfs -text
data/stops_array.pkl filter=lfs diff=lfs merge=lfs -text
data/stop_times_array.pkl filter=lfs diff=lfs merge=lfs -text
data/stop_routes_array.pkl filter=lfs diff=lfs merge=lfs -text
data/routes_array.pkl filter=lfs diff=lfs merge=lfs -text
data/route_stops_array.pkl filter=lfs diff=lfs merge=lfs -text
data/route_stops_df.pkl filter=lfs diff=lfs merge=lfs -text
data/routes_array_df.pkl filter=lfs diff=lfs merge=lfs -text
data/stop_routes_df.pkl filter=lfs diff=lfs merge=lfs -text
data/stop_times_df.pkl filter=lfs diff=lfs merge=lfs -text
data/stops_df.pkl filter=lfs diff=lfs merge=lfs -text
data/transfer_df.pkl filter=lfs diff=lfs merge=lfs -text
data/distrib_recov_tab_stopID_hour.pkl.gz filter=lfs diff=lfs merge=lfs -text
data/join_distribution_all.pkl.gz filter=lfs diff=lfs merge=lfs -text
data/join_distribution_cumulative_p.pkl.gz filter=lfs diff=lfs merge=lfs -text
data/join_distribution_cumulative_p_2.pkl.gz filter=lfs diff=lfs merge=lfs -text
data/route_stops_array_cyril.pkl filter=lfs diff=lfs merge=lfs -text
data/route_stops_df_cyril.pkl filter=lfs diff=lfs merge=lfs -text
data/routes_array_cyril.pkl filter=lfs diff=lfs merge=lfs -text
data/routes_array_df_cyril.pkl filter=lfs diff=lfs merge=lfs -text
data/stop_routes_array_cyril.pkl filter=lfs diff=lfs merge=lfs -text
data/stop_routes_df_cyril.pkl filter=lfs diff=lfs merge=lfs -text
data/stop_times_array_cyril.pkl filter=lfs diff=lfs merge=lfs -text
data/transfer_array_cyril.pkl filter=lfs diff=lfs merge=lfs -text
data/transfer_df_cyril.pkl filter=lfs diff=lfs merge=lfs -text
data/stops_array_cyril.pkl filter=lfs diff=lfs merge=lfs -text
+data/stop_times_df_cyril.pkl filter=lfs diff=lfs merge=lfs -text
diff --git a/data/route_stops_df_cyril.pkl b/data/route_stops_df_cyril.pkl
index d22734d..70461ea 100644
--- a/data/route_stops_df_cyril.pkl
+++ b/data/route_stops_df_cyril.pkl
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:c62d0046e831eee39e5ee0605aa0eac60d6bad85c0c5d386d91ee81380216265
+oid sha256:3f86289015b10f212f8e2415b29cf5bd584a575fdd860d701cef87a46edb0bb8
size 369113
diff --git a/data/stop_routes_df_cyril.pkl b/data/stop_routes_df_cyril.pkl
index 960efe5..484e287 100644
--- a/data/stop_routes_df_cyril.pkl
+++ b/data/stop_routes_df_cyril.pkl
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:b563d73903bf423b3c49fa4987691f28ee05261baad5595f19c46b56cf2a382f
+oid sha256:53fe1e7c5b985f8d14528085832799ded6cf202c28872d6ff1e63953c2c20716
size 538099
diff --git a/data/stop_times_array_cyril.pkl b/data/stop_times_array_cyril.pkl
index 8e5c056..524a974 100644
--- a/data/stop_times_array_cyril.pkl
+++ b/data/stop_times_array_cyril.pkl
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:3c38ce272796cd8dcc9f560ba57840e43ce167893d43f4840087be1652bd25a4
+oid sha256:279ef5c4d19dc2f4d4194553ac95b57364efa7fdf19636831fcdc39d2b2a127d
size 4167529
diff --git a/data/stop_times_df_cyril.pkl b/data/stop_times_df_cyril.pkl
new file mode 100644
index 0000000..daea263
--- /dev/null
+++ b/data/stop_times_df_cyril.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b44372accdcd34651a05e4ce97ff137a448fc5f8d87fb5ea97ffe1fd13a1ad0e
+size 36173268
diff --git a/notebooks/Arrays_Cyrill_data.ipynb b/notebooks/Arrays_Cyrill_data.ipynb
index c7cfa64..9cef39b 100644
--- a/notebooks/Arrays_Cyrill_data.ipynb
+++ b/notebooks/Arrays_Cyrill_data.ipynb
@@ -1,5236 +1,5236 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Preprocessing part 2: preparing the arrays\n",
"In this notebook we take two datasets prepared in Spark (stop_times and transfers) and convert them into the array format needed to run RAPTOR."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Outline\n",
"In this notebook the following actions are performed:\n",
"- create array"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Import packages"
]
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import pickle\n",
"import itertools"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read files\n",
"Before running, make sure the .csv files are in /data. If not, run the notebook \"transfer_to_local\" first."
]
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Unnamed: 0 | \n",
" route_id | \n",
" stop_id_general | \n",
" trip_id | \n",
" stop_id | \n",
" arrival_time | \n",
" departure_time | \n",
" stop_sequence | \n",
" stop_name | \n",
" stop_lat | \n",
" stop_lon | \n",
" trip_headsign | \n",
" trip_short_name | \n",
" direction_id | \n",
" departure_first_stop | \n",
" route_int | \n",
" stop_count | \n",
" stop_int | \n",
" route_desc | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 26-66-j19-1 | \n",
" 8591205 | \n",
" 17.TA.26-66-j19-1.1.H | \n",
" 8591205 | \n",
" 17:00:00 | \n",
" 17:00:00 | \n",
" 3 | \n",
" Zürich, Hürlimannplatz | \n",
" 47.365066 | \n",
" 8.526539 | \n",
" Zürich, Neubühl | \n",
" 3870 | \n",
" 0 | \n",
" 16:55:00 | \n",
" 1225 | \n",
" 12 | \n",
" 1317 | \n",
" Bus | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 26-66-j19-1 | \n",
" 8591415 | \n",
" 17.TA.26-66-j19-1.1.H | \n",
" 8591415 | \n",
" 17:02:00 | \n",
" 17:02:00 | \n",
" 4 | \n",
" Zürich, Waffenplatzstrasse | \n",
" 47.361482 | \n",
" 8.525749 | \n",
" Zürich, Neubühl | \n",
" 3870 | \n",
" 0 | \n",
" 16:55:00 | \n",
" 1225 | \n",
" 12 | \n",
" 1267 | \n",
" Bus | \n",
"
\n",
" \n",
" 2 | \n",
" 2 | \n",
" 26-66-j19-1 | \n",
" 8591204 | \n",
" 17.TA.26-66-j19-1.1.H | \n",
" 8591204 | \n",
" 17:03:00 | \n",
" 17:03:00 | \n",
" 5 | \n",
" Zürich, Hügelstrasse | \n",
" 47.358543 | \n",
" 8.526997 | \n",
" Zürich, Neubühl | \n",
" 3870 | \n",
" 0 | \n",
" 16:55:00 | \n",
" 1225 | \n",
" 12 | \n",
" 67 | \n",
" Bus | \n",
"
\n",
" \n",
" 3 | \n",
" 3 | \n",
" 26-66-j19-1 | \n",
" 8591098 | \n",
" 17.TA.26-66-j19-1.1.H | \n",
" 8591098 | \n",
" 17:04:00 | \n",
" 17:04:00 | \n",
" 6 | \n",
" Zürich, Brunau/Mutschellenstr. | \n",
" 47.355147 | \n",
" 8.527141 | \n",
" Zürich, Neubühl | \n",
" 3870 | \n",
" 0 | \n",
" 16:55:00 | \n",
" 1225 | \n",
" 12 | \n",
" 512 | \n",
" Bus | \n",
"
\n",
" \n",
" 4 | \n",
" 4 | \n",
" 26-66-j19-1 | \n",
" 8591392 | \n",
" 17.TA.26-66-j19-1.1.H | \n",
" 8591392 | \n",
" 17:05:00 | \n",
" 17:05:00 | \n",
" 7 | \n",
" Zürich, Thujastrasse | \n",
" 47.350187 | \n",
" 8.527806 | \n",
" Zürich, Neubühl | \n",
" 3870 | \n",
" 0 | \n",
" 16:55:00 | \n",
" 1225 | \n",
" 12 | \n",
" 403 | \n",
" Bus | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Unnamed: 0 route_id stop_id_general trip_id stop_id \\\n",
"0 0 26-66-j19-1 8591205 17.TA.26-66-j19-1.1.H 8591205 \n",
"1 1 26-66-j19-1 8591415 17.TA.26-66-j19-1.1.H 8591415 \n",
"2 2 26-66-j19-1 8591204 17.TA.26-66-j19-1.1.H 8591204 \n",
"3 3 26-66-j19-1 8591098 17.TA.26-66-j19-1.1.H 8591098 \n",
"4 4 26-66-j19-1 8591392 17.TA.26-66-j19-1.1.H 8591392 \n",
"\n",
" arrival_time departure_time stop_sequence stop_name \\\n",
"0 17:00:00 17:00:00 3 Zürich, Hürlimannplatz \n",
"1 17:02:00 17:02:00 4 Zürich, Waffenplatzstrasse \n",
"2 17:03:00 17:03:00 5 Zürich, Hügelstrasse \n",
"3 17:04:00 17:04:00 6 Zürich, Brunau/Mutschellenstr. \n",
"4 17:05:00 17:05:00 7 Zürich, Thujastrasse \n",
"\n",
" stop_lat stop_lon trip_headsign trip_short_name direction_id \\\n",
"0 47.365066 8.526539 Zürich, Neubühl 3870 0 \n",
"1 47.361482 8.525749 Zürich, Neubühl 3870 0 \n",
"2 47.358543 8.526997 Zürich, Neubühl 3870 0 \n",
"3 47.355147 8.527141 Zürich, Neubühl 3870 0 \n",
"4 47.350187 8.527806 Zürich, Neubühl 3870 0 \n",
"\n",
" departure_first_stop route_int stop_count stop_int route_desc \n",
"0 16:55:00 1225 12 1317 Bus \n",
"1 16:55:00 1225 12 1267 Bus \n",
"2 16:55:00 1225 12 67 Bus \n",
"3 16:55:00 1225 12 512 Bus \n",
"4 16:55:00 1225 12 403 Bus "
]
},
- "execution_count": 2,
+ "execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#stop_times\n",
"stop_times_curated = pd.read_csv(\"../data/stop_times_final_cyril.csv\")\n",
"stop_times_curated.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We drop columns not useful to us"
]
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"stop_times_curated = stop_times_curated.drop(columns=[\"Unnamed: 0\"])"
]
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Unnamed: 0 | \n",
" stop_id | \n",
" stop_id2 | \n",
" distance | \n",
" Transfer_time_sec | \n",
" stop_name | \n",
" stop_name2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 8500926 | \n",
" 8590616 | \n",
" 0.122430 | \n",
" 146 | \n",
" Oetwil a.d.L., Schweizäcker | \n",
" Geroldswil, Schweizäcker | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 8500926 | \n",
" 8590737 | \n",
" 0.300175 | \n",
" 360 | \n",
" Oetwil a.d.L., Schweizäcker | \n",
" Oetwil an der Limmat, Halde | \n",
"
\n",
" \n",
" 2 | \n",
" 2 | \n",
" 8502186 | \n",
" 8502186:0:1 | \n",
" 0.006762 | \n",
" 8 | \n",
" Dietikon Stoffelbach | \n",
" Dietikon Stoffelbach | \n",
"
\n",
" \n",
" 3 | \n",
" 3 | \n",
" 8502186 | \n",
" 8502186:0:2 | \n",
" 0.013524 | \n",
" 16 | \n",
" Dietikon Stoffelbach | \n",
" Dietikon Stoffelbach | \n",
"
\n",
" \n",
" 4 | \n",
" 4 | \n",
" 8502186 | \n",
" 8502186P | \n",
" 0.000000 | \n",
" 0 | \n",
" Dietikon Stoffelbach | \n",
" Dietikon Stoffelbach | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Unnamed: 0 stop_id stop_id2 distance Transfer_time_sec \\\n",
"0 0 8500926 8590616 0.122430 146 \n",
"1 1 8500926 8590737 0.300175 360 \n",
"2 2 8502186 8502186:0:1 0.006762 8 \n",
"3 3 8502186 8502186:0:2 0.013524 16 \n",
"4 4 8502186 8502186P 0.000000 0 \n",
"\n",
" stop_name stop_name2 \n",
"0 Oetwil a.d.L., Schweizäcker Geroldswil, Schweizäcker \n",
"1 Oetwil a.d.L., Schweizäcker Oetwil an der Limmat, Halde \n",
"2 Dietikon Stoffelbach Dietikon Stoffelbach \n",
"3 Dietikon Stoffelbach Dietikon Stoffelbach \n",
"4 Dietikon Stoffelbach Dietikon Stoffelbach "
]
},
- "execution_count": 4,
+ "execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#transfers\n",
"transfers = pd.read_csv(\"../data/transfers.csv\")\n",
"transfers.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Transfers: delete transfers to the same stop & get stop_int & stop_int_2\n"
]
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"12564"
]
},
- "execution_count": 5,
+ "execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#check number stops transfers\n",
"transfers.stop_id.count()"
]
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Unnamed: 0 | \n",
" stop_id | \n",
" stop_id2 | \n",
" distance | \n",
" Transfer_time_sec | \n",
" stop_name | \n",
" stop_name2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 8500926 | \n",
" 8590616 | \n",
" 0.122430 | \n",
" 146 | \n",
" Oetwil a.d.L., Schweizäcker | \n",
" Geroldswil, Schweizäcker | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 8500926 | \n",
" 8590737 | \n",
" 0.300175 | \n",
" 360 | \n",
" Oetwil a.d.L., Schweizäcker | \n",
" Oetwil an der Limmat, Halde | \n",
"
\n",
" \n",
" 2 | \n",
" 2 | \n",
" 8502186 | \n",
" 8502186:0:1 | \n",
" 0.006762 | \n",
" 8 | \n",
" Dietikon Stoffelbach | \n",
" Dietikon Stoffelbach | \n",
"
\n",
" \n",
" 3 | \n",
" 3 | \n",
" 8502186 | \n",
" 8502186:0:2 | \n",
" 0.013524 | \n",
" 16 | \n",
" Dietikon Stoffelbach | \n",
" Dietikon Stoffelbach | \n",
"
\n",
" \n",
" 4 | \n",
" 4 | \n",
" 8502186 | \n",
" 8502186P | \n",
" 0.000000 | \n",
" 0 | \n",
" Dietikon Stoffelbach | \n",
" Dietikon Stoffelbach | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Unnamed: 0 stop_id stop_id2 distance Transfer_time_sec \\\n",
"0 0 8500926 8590616 0.122430 146 \n",
"1 1 8500926 8590737 0.300175 360 \n",
"2 2 8502186 8502186:0:1 0.006762 8 \n",
"3 3 8502186 8502186:0:2 0.013524 16 \n",
"4 4 8502186 8502186P 0.000000 0 \n",
"\n",
" stop_name stop_name2 \n",
"0 Oetwil a.d.L., Schweizäcker Geroldswil, Schweizäcker \n",
"1 Oetwil a.d.L., Schweizäcker Oetwil an der Limmat, Halde \n",
"2 Dietikon Stoffelbach Dietikon Stoffelbach \n",
"3 Dietikon Stoffelbach Dietikon Stoffelbach \n",
"4 Dietikon Stoffelbach Dietikon Stoffelbach "
]
},
- "execution_count": 6,
+ "execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"transfers.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We delete transfers to the same stop"
]
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"transfers_df = transfers[transfers['stop_id'] != transfers['stop_id2']]"
]
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"12564"
]
},
- "execution_count": 8,
+ "execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"transfers_df.stop_id.count()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We create the stop_int column in transfers. Because this is an inner merge on stop_id, transfers whose departure stop does not appear in stop_times are dropped."
]
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" route_id | \n",
" stop_id_general | \n",
" trip_id | \n",
" stop_id | \n",
" arrival_time | \n",
" departure_time | \n",
" stop_sequence | \n",
" stop_name | \n",
" stop_lat | \n",
" stop_lon | \n",
" trip_headsign | \n",
" trip_short_name | \n",
" direction_id | \n",
" departure_first_stop | \n",
" route_int | \n",
" stop_count | \n",
" stop_int | \n",
" route_desc | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 26-66-j19-1 | \n",
" 8591205 | \n",
" 17.TA.26-66-j19-1.1.H | \n",
" 8591205 | \n",
" 17:00:00 | \n",
" 17:00:00 | \n",
" 3 | \n",
" Zürich, Hürlimannplatz | \n",
" 47.365066 | \n",
" 8.526539 | \n",
" Zürich, Neubühl | \n",
" 3870 | \n",
" 0 | \n",
" 16:55:00 | \n",
" 1225 | \n",
" 12 | \n",
" 1317 | \n",
" Bus | \n",
"
\n",
" \n",
" 1 | \n",
" 26-66-j19-1 | \n",
" 8591415 | \n",
" 17.TA.26-66-j19-1.1.H | \n",
" 8591415 | \n",
" 17:02:00 | \n",
" 17:02:00 | \n",
" 4 | \n",
" Zürich, Waffenplatzstrasse | \n",
" 47.361482 | \n",
" 8.525749 | \n",
" Zürich, Neubühl | \n",
" 3870 | \n",
" 0 | \n",
" 16:55:00 | \n",
" 1225 | \n",
" 12 | \n",
" 1267 | \n",
" Bus | \n",
"
\n",
" \n",
" 2 | \n",
" 26-66-j19-1 | \n",
" 8591204 | \n",
" 17.TA.26-66-j19-1.1.H | \n",
" 8591204 | \n",
" 17:03:00 | \n",
" 17:03:00 | \n",
" 5 | \n",
" Zürich, Hügelstrasse | \n",
" 47.358543 | \n",
" 8.526997 | \n",
" Zürich, Neubühl | \n",
" 3870 | \n",
" 0 | \n",
" 16:55:00 | \n",
" 1225 | \n",
" 12 | \n",
" 67 | \n",
" Bus | \n",
"
\n",
" \n",
" 3 | \n",
" 26-66-j19-1 | \n",
" 8591098 | \n",
" 17.TA.26-66-j19-1.1.H | \n",
" 8591098 | \n",
" 17:04:00 | \n",
" 17:04:00 | \n",
" 6 | \n",
" Zürich, Brunau/Mutschellenstr. | \n",
" 47.355147 | \n",
" 8.527141 | \n",
" Zürich, Neubühl | \n",
" 3870 | \n",
" 0 | \n",
" 16:55:00 | \n",
" 1225 | \n",
" 12 | \n",
" 512 | \n",
" Bus | \n",
"
\n",
" \n",
" 4 | \n",
" 26-66-j19-1 | \n",
" 8591392 | \n",
" 17.TA.26-66-j19-1.1.H | \n",
" 8591392 | \n",
" 17:05:00 | \n",
" 17:05:00 | \n",
" 7 | \n",
" Zürich, Thujastrasse | \n",
" 47.350187 | \n",
" 8.527806 | \n",
" Zürich, Neubühl | \n",
" 3870 | \n",
" 0 | \n",
" 16:55:00 | \n",
" 1225 | \n",
" 12 | \n",
" 403 | \n",
" Bus | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" route_id stop_id_general trip_id stop_id arrival_time \\\n",
"0 26-66-j19-1 8591205 17.TA.26-66-j19-1.1.H 8591205 17:00:00 \n",
"1 26-66-j19-1 8591415 17.TA.26-66-j19-1.1.H 8591415 17:02:00 \n",
"2 26-66-j19-1 8591204 17.TA.26-66-j19-1.1.H 8591204 17:03:00 \n",
"3 26-66-j19-1 8591098 17.TA.26-66-j19-1.1.H 8591098 17:04:00 \n",
"4 26-66-j19-1 8591392 17.TA.26-66-j19-1.1.H 8591392 17:05:00 \n",
"\n",
" departure_time stop_sequence stop_name stop_lat \\\n",
"0 17:00:00 3 Zürich, Hürlimannplatz 47.365066 \n",
"1 17:02:00 4 Zürich, Waffenplatzstrasse 47.361482 \n",
"2 17:03:00 5 Zürich, Hügelstrasse 47.358543 \n",
"3 17:04:00 6 Zürich, Brunau/Mutschellenstr. 47.355147 \n",
"4 17:05:00 7 Zürich, Thujastrasse 47.350187 \n",
"\n",
" stop_lon trip_headsign trip_short_name direction_id \\\n",
"0 8.526539 Zürich, Neubühl 3870 0 \n",
"1 8.525749 Zürich, Neubühl 3870 0 \n",
"2 8.526997 Zürich, Neubühl 3870 0 \n",
"3 8.527141 Zürich, Neubühl 3870 0 \n",
"4 8.527806 Zürich, Neubühl 3870 0 \n",
"\n",
" departure_first_stop route_int stop_count stop_int route_desc \n",
"0 16:55:00 1225 12 1317 Bus \n",
"1 16:55:00 1225 12 1267 Bus \n",
"2 16:55:00 1225 12 67 Bus \n",
"3 16:55:00 1225 12 512 Bus \n",
"4 16:55:00 1225 12 403 Bus "
]
},
- "execution_count": 9,
+ "execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stop_times_int = stop_times_curated\n",
"stop_times_int.head(5)"
]
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"transfers_df = transfers_df.merge(stop_times_int[[\"stop_id\", \"stop_int\"]].set_index(\"stop_id\"), how=\"inner\", on = \"stop_id\").drop_duplicates()"
]
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"10707"
]
},
- "execution_count": 11,
+ "execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"transfers_df.stop_id.count()"
]
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Unnamed: 0 | \n",
" stop_id | \n",
" stop_id2 | \n",
" distance | \n",
" Transfer_time_sec | \n",
" stop_name | \n",
" stop_name2 | \n",
" stop_int | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 8500926 | \n",
" 8590616 | \n",
" 0.122430 | \n",
" 146 | \n",
" Oetwil a.d.L., Schweizäcker | \n",
" Geroldswil, Schweizäcker | \n",
" 1392 | \n",
"
\n",
" \n",
" 38 | \n",
" 1 | \n",
" 8500926 | \n",
" 8590737 | \n",
" 0.300175 | \n",
" 360 | \n",
" Oetwil a.d.L., Schweizäcker | \n",
" Oetwil an der Limmat, Halde | \n",
" 1392 | \n",
"
\n",
" \n",
" 76 | \n",
" 9 | \n",
" 8502186:0:1 | \n",
" 8502186 | \n",
" 0.006762 | \n",
" 8 | \n",
" Dietikon Stoffelbach | \n",
" Dietikon Stoffelbach | \n",
" 1394 | \n",
"
\n",
" \n",
" 128 | \n",
" 10 | \n",
" 8502186:0:1 | \n",
" 8502186:0:2 | \n",
" 0.006762 | \n",
" 8 | \n",
" Dietikon Stoffelbach | \n",
" Dietikon Stoffelbach | \n",
" 1394 | \n",
"
\n",
" \n",
" 180 | \n",
" 11 | \n",
" 8502186:0:1 | \n",
" 8502186P | \n",
" 0.006762 | \n",
" 8 | \n",
" Dietikon Stoffelbach | \n",
" Dietikon Stoffelbach | \n",
" 1394 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Unnamed: 0 stop_id stop_id2 distance Transfer_time_sec \\\n",
"0 0 8500926 8590616 0.122430 146 \n",
"38 1 8500926 8590737 0.300175 360 \n",
"76 9 8502186:0:1 8502186 0.006762 8 \n",
"128 10 8502186:0:1 8502186:0:2 0.006762 8 \n",
"180 11 8502186:0:1 8502186P 0.006762 8 \n",
"\n",
" stop_name stop_name2 stop_int \n",
"0 Oetwil a.d.L., Schweizäcker Geroldswil, Schweizäcker 1392 \n",
"38 Oetwil a.d.L., Schweizäcker Oetwil an der Limmat, Halde 1392 \n",
"76 Dietikon Stoffelbach Dietikon Stoffelbach 1394 \n",
"128 Dietikon Stoffelbach Dietikon Stoffelbach 1394 \n",
"180 Dietikon Stoffelbach Dietikon Stoffelbach 1394 "
]
},
- "execution_count": 12,
+ "execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"transfers_df.head(5)"
]
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" stop_id2 | \n",
" stop_int_2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 8591205 | \n",
" 1317 | \n",
"
\n",
" \n",
" 1 | \n",
" 8591415 | \n",
" 1267 | \n",
"
\n",
" \n",
" 2 | \n",
" 8591204 | \n",
" 67 | \n",
"
\n",
" \n",
" 3 | \n",
" 8591098 | \n",
" 512 | \n",
"
\n",
" \n",
" 4 | \n",
" 8591392 | \n",
" 403 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" stop_id2 stop_int_2\n",
"0 8591205 1317\n",
"1 8591415 1267\n",
"2 8591204 67\n",
"3 8591098 512\n",
"4 8591392 403"
]
},
- "execution_count": 13,
+ "execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#create dataframe with stops\n",
"df_stop_int2 = stop_times_int[[\"stop_id\", \"stop_int\"]].rename(columns={\"stop_id\": \"stop_id2\", \"stop_int\" : \"stop_int_2\"})\n",
"df_stop_int2.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We add the integer stop id of the arrival stop, stop_int_2."
]
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"transfers_df_int = transfers_df.merge(df_stop_int2.set_index(\"stop_id2\"), how=\"inner\", on = \"stop_id2\").drop_duplicates()"
]
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 48,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Unnamed: 0 | \n",
" stop_id | \n",
" stop_id2 | \n",
" distance | \n",
" Transfer_time_sec | \n",
" stop_name | \n",
" stop_name2 | \n",
" stop_int | \n",
" stop_int_2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 8500926 | \n",
" 8590616 | \n",
" 0.122430 | \n",
" 146 | \n",
" Oetwil a.d.L., Schweizäcker | \n",
" Geroldswil, Schweizäcker | \n",
" 1392 | \n",
" 1310 | \n",
"
\n",
" \n",
" 37 | \n",
" 8193 | \n",
" 8590618 | \n",
" 8590616 | \n",
" 0.412676 | \n",
" 495 | \n",
" Geroldswil, Zentrum | \n",
" Geroldswil, Schweizäcker | \n",
" 590 | \n",
" 1310 | \n",
"
\n",
" \n",
" 74 | \n",
" 8821 | \n",
" 8590737 | \n",
" 8590616 | \n",
" 0.422521 | \n",
" 507 | \n",
" Oetwil an der Limmat, Halde | \n",
" Geroldswil, Schweizäcker | \n",
" 901 | \n",
" 1310 | \n",
"
\n",
" \n",
" 111 | \n",
" 1 | \n",
" 8500926 | \n",
" 8590737 | \n",
" 0.300175 | \n",
" 360 | \n",
" Oetwil a.d.L., Schweizäcker | \n",
" Oetwil an der Limmat, Halde | \n",
" 1392 | \n",
" 901 | \n",
"
\n",
" \n",
" 186 | \n",
" 8189 | \n",
" 8590616 | \n",
" 8590737 | \n",
" 0.422521 | \n",
" 507 | \n",
" Geroldswil, Schweizäcker | \n",
" Oetwil an der Limmat, Halde | \n",
" 1310 | \n",
" 901 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Unnamed: 0 stop_id stop_id2 distance Transfer_time_sec \\\n",
"0 0 8500926 8590616 0.122430 146 \n",
"37 8193 8590618 8590616 0.412676 495 \n",
"74 8821 8590737 8590616 0.422521 507 \n",
"111 1 8500926 8590737 0.300175 360 \n",
"186 8189 8590616 8590737 0.422521 507 \n",
"\n",
" stop_name stop_name2 stop_int \\\n",
"0 Oetwil a.d.L., Schweizäcker Geroldswil, Schweizäcker 1392 \n",
"37 Geroldswil, Zentrum Geroldswil, Schweizäcker 590 \n",
"74 Oetwil an der Limmat, Halde Geroldswil, Schweizäcker 901 \n",
"111 Oetwil a.d.L., Schweizäcker Oetwil an der Limmat, Halde 1392 \n",
"186 Geroldswil, Schweizäcker Oetwil an der Limmat, Halde 1310 \n",
"\n",
" stop_int_2 \n",
"0 1310 \n",
"37 1310 \n",
"74 1310 \n",
"111 901 \n",
"186 901 "
]
},
- "execution_count": 15,
+ "execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"transfers_df_int.head(5)"
]
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"9434"
]
},
- "execution_count": 16,
+ "execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"transfers_df_int.stop_id.count()"
]
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"transfers = transfers_df_int"
]
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 51,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1503"
]
},
- "execution_count": 18,
+ "execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#check number unique stops2 in transfers\n",
"transfers.stop_id2.nunique()"
]
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 52,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1503"
]
},
- "execution_count": 19,
+ "execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"transfers.stop_id.nunique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 53,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" route_id | \n",
" stop_id_general | \n",
" trip_id | \n",
" stop_id | \n",
" arrival_time | \n",
" departure_time | \n",
" stop_sequence | \n",
" stop_name | \n",
" stop_lat | \n",
" stop_lon | \n",
" trip_headsign | \n",
" trip_short_name | \n",
" direction_id | \n",
" departure_first_stop | \n",
" route_int | \n",
" stop_count | \n",
" stop_int | \n",
" route_desc | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 26-66-j19-1 | \n",
" 8591205 | \n",
" 17.TA.26-66-j19-1.1.H | \n",
" 8591205 | \n",
" 17:00:00 | \n",
" 17:00:00 | \n",
" 3 | \n",
" Zürich, Hürlimannplatz | \n",
" 47.365066 | \n",
" 8.526539 | \n",
" Zürich, Neubühl | \n",
" 3870 | \n",
" 0 | \n",
" 16:55:00 | \n",
" 1225 | \n",
" 12 | \n",
" 1317 | \n",
" Bus | \n",
"
\n",
" \n",
" 1 | \n",
" 26-66-j19-1 | \n",
" 8591415 | \n",
" 17.TA.26-66-j19-1.1.H | \n",
" 8591415 | \n",
" 17:02:00 | \n",
" 17:02:00 | \n",
" 4 | \n",
" Zürich, Waffenplatzstrasse | \n",
" 47.361482 | \n",
" 8.525749 | \n",
" Zürich, Neubühl | \n",
" 3870 | \n",
" 0 | \n",
" 16:55:00 | \n",
" 1225 | \n",
" 12 | \n",
" 1267 | \n",
" Bus | \n",
"
\n",
" \n",
" 2 | \n",
" 26-66-j19-1 | \n",
" 8591204 | \n",
" 17.TA.26-66-j19-1.1.H | \n",
" 8591204 | \n",
" 17:03:00 | \n",
" 17:03:00 | \n",
" 5 | \n",
" Zürich, Hügelstrasse | \n",
" 47.358543 | \n",
" 8.526997 | \n",
" Zürich, Neubühl | \n",
" 3870 | \n",
" 0 | \n",
" 16:55:00 | \n",
" 1225 | \n",
" 12 | \n",
" 67 | \n",
" Bus | \n",
"
\n",
" \n",
" 3 | \n",
" 26-66-j19-1 | \n",
" 8591098 | \n",
" 17.TA.26-66-j19-1.1.H | \n",
" 8591098 | \n",
" 17:04:00 | \n",
" 17:04:00 | \n",
" 6 | \n",
" Zürich, Brunau/Mutschellenstr. | \n",
" 47.355147 | \n",
" 8.527141 | \n",
" Zürich, Neubühl | \n",
" 3870 | \n",
" 0 | \n",
" 16:55:00 | \n",
" 1225 | \n",
" 12 | \n",
" 512 | \n",
" Bus | \n",
"
\n",
" \n",
" 4 | \n",
" 26-66-j19-1 | \n",
" 8591392 | \n",
" 17.TA.26-66-j19-1.1.H | \n",
" 8591392 | \n",
" 17:05:00 | \n",
" 17:05:00 | \n",
" 7 | \n",
" Zürich, Thujastrasse | \n",
" 47.350187 | \n",
" 8.527806 | \n",
" Zürich, Neubühl | \n",
" 3870 | \n",
" 0 | \n",
" 16:55:00 | \n",
" 1225 | \n",
" 12 | \n",
" 403 | \n",
" Bus | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" route_id stop_id_general trip_id stop_id arrival_time \\\n",
"0 26-66-j19-1 8591205 17.TA.26-66-j19-1.1.H 8591205 17:00:00 \n",
"1 26-66-j19-1 8591415 17.TA.26-66-j19-1.1.H 8591415 17:02:00 \n",
"2 26-66-j19-1 8591204 17.TA.26-66-j19-1.1.H 8591204 17:03:00 \n",
"3 26-66-j19-1 8591098 17.TA.26-66-j19-1.1.H 8591098 17:04:00 \n",
"4 26-66-j19-1 8591392 17.TA.26-66-j19-1.1.H 8591392 17:05:00 \n",
"\n",
" departure_time stop_sequence stop_name stop_lat \\\n",
"0 17:00:00 3 Zürich, Hürlimannplatz 47.365066 \n",
"1 17:02:00 4 Zürich, Waffenplatzstrasse 47.361482 \n",
"2 17:03:00 5 Zürich, Hügelstrasse 47.358543 \n",
"3 17:04:00 6 Zürich, Brunau/Mutschellenstr. 47.355147 \n",
"4 17:05:00 7 Zürich, Thujastrasse 47.350187 \n",
"\n",
" stop_lon trip_headsign trip_short_name direction_id \\\n",
"0 8.526539 Zürich, Neubühl 3870 0 \n",
"1 8.525749 Zürich, Neubühl 3870 0 \n",
"2 8.526997 Zürich, Neubühl 3870 0 \n",
"3 8.527141 Zürich, Neubühl 3870 0 \n",
"4 8.527806 Zürich, Neubühl 3870 0 \n",
"\n",
" departure_first_stop route_int stop_count stop_int route_desc \n",
"0 16:55:00 1225 12 1317 Bus \n",
"1 16:55:00 1225 12 1267 Bus \n",
"2 16:55:00 1225 12 67 Bus \n",
"3 16:55:00 1225 12 512 Bus \n",
"4 16:55:00 1225 12 403 Bus "
]
},
- "execution_count": 20,
+ "execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stop_times_ordered = stop_times_curated\n",
"stop_times_ordered.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We start by making sure the order is correct"
]
},
{
"cell_type": "code",
- "execution_count": 23,
+ "execution_count": 54,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" route_id | \n",
" stop_id_general | \n",
" trip_id | \n",
" stop_id | \n",
" arrival_time | \n",
" departure_time | \n",
" stop_sequence | \n",
" stop_name | \n",
" stop_lat | \n",
" stop_lon | \n",
" trip_headsign | \n",
" trip_short_name | \n",
" direction_id | \n",
" departure_first_stop | \n",
" route_int | \n",
" stop_count | \n",
" stop_int | \n",
" route_desc | \n",
"
\n",
" \n",
" \n",
" \n",
" 93111 | \n",
" 26-10-j19-1 | \n",
" 8573205 | \n",
" 1672.TA.26-10-j19-1.11.R | \n",
" 8573205 | \n",
" 07:00:00 | \n",
" 07:01:00 | \n",
" 27 | \n",
" Zürich Flughafen, Bahnhof | \n",
" 47.450441 | \n",
" 8.563729 | \n",
" Zürich Flughafen, Fracht | \n",
" 4096 | \n",
" 1 | \n",
" 07:01:00 | \n",
" 0 | \n",
" 2 | \n",
" 298 | \n",
" Tram | \n",
"
\n",
" \n",
" 93112 | \n",
" 26-10-j19-1 | \n",
" 8588553 | \n",
" 1672.TA.26-10-j19-1.11.R | \n",
" 8588553 | \n",
" 07:02:00 | \n",
" 07:02:00 | \n",
" 28 | \n",
" Zürich Flughafen, Fracht | \n",
" 47.452494 | \n",
" 8.572057 | \n",
" Zürich Flughafen, Fracht | \n",
" 4096 | \n",
" 1 | \n",
" 07:01:00 | \n",
" 0 | \n",
" 2 | \n",
" 1295 | \n",
" Tram | \n",
"
\n",
" \n",
" 93113 | \n",
" 26-13-j19-1 | \n",
" 8576240 | \n",
" 2064.TA.26-13-j19-1.24.H | \n",
" 8576240 | \n",
" 07:00:00 | \n",
" 07:00:00 | \n",
" 5 | \n",
" Zürich, Meierhofplatz | \n",
" 47.402010 | \n",
" 8.499374 | \n",
" Zürich, Albisgütli | \n",
" 1831 | \n",
" 0 | \n",
" 07:00:00 | \n",
" 1 | \n",
" 26 | \n",
" 1222 | \n",
" Tram | \n",
"
\n",
" \n",
" 93114 | \n",
" 26-13-j19-1 | \n",
" 8591353 | \n",
" 2064.TA.26-13-j19-1.24.H | \n",
" 8591353 | \n",
" 07:01:00 | \n",
" 07:01:00 | \n",
" 6 | \n",
" Zürich, Schwert | \n",
" 47.399730 | \n",
" 8.504611 | \n",
" Zürich, Albisgütli | \n",
" 1831 | \n",
" 0 | \n",
" 07:00:00 | \n",
" 1 | \n",
" 26 | \n",
" 816 | \n",
" Tram | \n",
"
\n",
" \n",
" 93115 | \n",
" 26-13-j19-1 | \n",
" 8591039 | \n",
" 2064.TA.26-13-j19-1.24.H | \n",
" 8591039 | \n",
" 07:02:00 | \n",
" 07:02:00 | \n",
" 7 | \n",
" Zürich, Alte Trotte | \n",
" 47.397766 | \n",
" 8.507252 | \n",
" Zürich, Albisgütli | \n",
" 1831 | \n",
" 0 | \n",
" 07:00:00 | \n",
" 1 | \n",
" 26 | \n",
" 778 | \n",
" Tram | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" route_id stop_id_general trip_id stop_id \\\n",
"93111 26-10-j19-1 8573205 1672.TA.26-10-j19-1.11.R 8573205 \n",
"93112 26-10-j19-1 8588553 1672.TA.26-10-j19-1.11.R 8588553 \n",
"93113 26-13-j19-1 8576240 2064.TA.26-13-j19-1.24.H 8576240 \n",
"93114 26-13-j19-1 8591353 2064.TA.26-13-j19-1.24.H 8591353 \n",
"93115 26-13-j19-1 8591039 2064.TA.26-13-j19-1.24.H 8591039 \n",
"\n",
" arrival_time departure_time stop_sequence stop_name \\\n",
"93111 07:00:00 07:01:00 27 Zürich Flughafen, Bahnhof \n",
"93112 07:02:00 07:02:00 28 Zürich Flughafen, Fracht \n",
"93113 07:00:00 07:00:00 5 Zürich, Meierhofplatz \n",
"93114 07:01:00 07:01:00 6 Zürich, Schwert \n",
"93115 07:02:00 07:02:00 7 Zürich, Alte Trotte \n",
"\n",
" stop_lat stop_lon trip_headsign trip_short_name \\\n",
"93111 47.450441 8.563729 Zürich Flughafen, Fracht 4096 \n",
"93112 47.452494 8.572057 Zürich Flughafen, Fracht 4096 \n",
"93113 47.402010 8.499374 Zürich, Albisgütli 1831 \n",
"93114 47.399730 8.504611 Zürich, Albisgütli 1831 \n",
"93115 47.397766 8.507252 Zürich, Albisgütli 1831 \n",
"\n",
" direction_id departure_first_stop route_int stop_count stop_int \\\n",
"93111 1 07:01:00 0 2 298 \n",
"93112 1 07:01:00 0 2 1295 \n",
"93113 0 07:00:00 1 26 1222 \n",
"93114 0 07:00:00 1 26 816 \n",
"93115 0 07:00:00 1 26 778 \n",
"\n",
" route_desc \n",
"93111 Tram \n",
"93112 Tram \n",
"93113 Tram \n",
"93114 Tram \n",
"93115 Tram "
]
},
- "execution_count": 23,
+ "execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "stop_times_ordered = stop_times_int.sort_values(by=[\"route_int\", \"departure_first_stop\", \"stop_sequence\"])\n",
+ "stop_times_ordered = stop_times_int.sort_values(by=[\"route_int\", \"departure_first_stop\", \"departure_time\"])\n",
"stop_times_ordered.head(5)"
]
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 55,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" arrival_time | \n",
" departure_time | \n",
"
\n",
" \n",
" \n",
" \n",
" 93111 | \n",
" 07:00:00 | \n",
" 07:01:00 | \n",
"
\n",
" \n",
" 93112 | \n",
" 07:02:00 | \n",
" 07:02:00 | \n",
"
\n",
" \n",
" 93113 | \n",
" 07:00:00 | \n",
" 07:00:00 | \n",
"
\n",
" \n",
" 93114 | \n",
" 07:01:00 | \n",
" 07:01:00 | \n",
"
\n",
" \n",
" 93115 | \n",
" 07:02:00 | \n",
" 07:02:00 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" arrival_time departure_time\n",
"93111 07:00:00 07:01:00\n",
"93112 07:02:00 07:02:00\n",
"93113 07:00:00 07:00:00\n",
"93114 07:01:00 07:01:00\n",
"93115 07:02:00 07:02:00"
]
},
- "execution_count": 24,
+ "execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stop_times_ordered[[\"arrival_time\", \"departure_time\"]].head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We set the arrival time of the first stop and the departure time of the last stop of each trip to None."
]
},
{
"cell_type": "code",
- "execution_count": 25,
+ "execution_count": 56,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" route_id | \n",
" stop_id_general | \n",
" trip_id | \n",
" stop_id | \n",
" arrival_time | \n",
" departure_time | \n",
" stop_sequence | \n",
" stop_name | \n",
" stop_lat | \n",
" stop_lon | \n",
" trip_headsign | \n",
" trip_short_name | \n",
" direction_id | \n",
" departure_first_stop | \n",
" route_int | \n",
" stop_count | \n",
" stop_int | \n",
" route_desc | \n",
- " sequence_shift_1 | \n",
+ " departure_first_shift_1 | \n",
"
\n",
" \n",
" \n",
" \n",
" 93111 | \n",
" 26-10-j19-1 | \n",
" 8573205 | \n",
" 1672.TA.26-10-j19-1.11.R | \n",
" 8573205 | \n",
" 07:00:00 | \n",
" 07:01:00 | \n",
" 27 | \n",
" Zürich Flughafen, Bahnhof | \n",
" 47.450441 | \n",
" 8.563729 | \n",
" Zürich Flughafen, Fracht | \n",
" 4096 | \n",
" 1 | \n",
" 07:01:00 | \n",
" 0 | \n",
" 2 | \n",
" 298 | \n",
" Tram | \n",
- " 28 | \n",
+ " 07:01:00 | \n",
"
\n",
" \n",
" 93112 | \n",
" 26-10-j19-1 | \n",
" 8588553 | \n",
" 1672.TA.26-10-j19-1.11.R | \n",
" 8588553 | \n",
" 07:02:00 | \n",
" 07:02:00 | \n",
" 28 | \n",
" Zürich Flughafen, Fracht | \n",
" 47.452494 | \n",
" 8.572057 | \n",
" Zürich Flughafen, Fracht | \n",
" 4096 | \n",
" 1 | \n",
" 07:01:00 | \n",
" 0 | \n",
" 2 | \n",
" 1295 | \n",
" Tram | \n",
- " 5 | \n",
+ " 07:00:00 | \n",
"
\n",
" \n",
" 93113 | \n",
" 26-13-j19-1 | \n",
" 8576240 | \n",
" 2064.TA.26-13-j19-1.24.H | \n",
" 8576240 | \n",
" 07:00:00 | \n",
" 07:00:00 | \n",
" 5 | \n",
" Zürich, Meierhofplatz | \n",
" 47.402010 | \n",
" 8.499374 | \n",
" Zürich, Albisgütli | \n",
" 1831 | \n",
" 0 | \n",
" 07:00:00 | \n",
" 1 | \n",
" 26 | \n",
" 1222 | \n",
" Tram | \n",
- " 6 | \n",
+ " 07:00:00 | \n",
"
\n",
" \n",
" 93114 | \n",
" 26-13-j19-1 | \n",
" 8591353 | \n",
" 2064.TA.26-13-j19-1.24.H | \n",
" 8591353 | \n",
" 07:01:00 | \n",
" 07:01:00 | \n",
" 6 | \n",
" Zürich, Schwert | \n",
" 47.399730 | \n",
" 8.504611 | \n",
" Zürich, Albisgütli | \n",
" 1831 | \n",
" 0 | \n",
" 07:00:00 | \n",
" 1 | \n",
" 26 | \n",
" 816 | \n",
" Tram | \n",
- " 7 | \n",
+ " 07:00:00 | \n",
"
\n",
" \n",
" 93115 | \n",
" 26-13-j19-1 | \n",
" 8591039 | \n",
" 2064.TA.26-13-j19-1.24.H | \n",
" 8591039 | \n",
" 07:02:00 | \n",
" 07:02:00 | \n",
" 7 | \n",
" Zürich, Alte Trotte | \n",
" 47.397766 | \n",
" 8.507252 | \n",
" Zürich, Albisgütli | \n",
" 1831 | \n",
" 0 | \n",
" 07:00:00 | \n",
" 1 | \n",
" 26 | \n",
" 778 | \n",
" Tram | \n",
- " 8 | \n",
+ " 07:00:00 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" route_id stop_id_general trip_id stop_id \\\n",
"93111 26-10-j19-1 8573205 1672.TA.26-10-j19-1.11.R 8573205 \n",
"93112 26-10-j19-1 8588553 1672.TA.26-10-j19-1.11.R 8588553 \n",
"93113 26-13-j19-1 8576240 2064.TA.26-13-j19-1.24.H 8576240 \n",
"93114 26-13-j19-1 8591353 2064.TA.26-13-j19-1.24.H 8591353 \n",
"93115 26-13-j19-1 8591039 2064.TA.26-13-j19-1.24.H 8591039 \n",
"\n",
" arrival_time departure_time stop_sequence stop_name \\\n",
"93111 07:00:00 07:01:00 27 Zürich Flughafen, Bahnhof \n",
"93112 07:02:00 07:02:00 28 Zürich Flughafen, Fracht \n",
"93113 07:00:00 07:00:00 5 Zürich, Meierhofplatz \n",
"93114 07:01:00 07:01:00 6 Zürich, Schwert \n",
"93115 07:02:00 07:02:00 7 Zürich, Alte Trotte \n",
"\n",
" stop_lat stop_lon trip_headsign trip_short_name \\\n",
"93111 47.450441 8.563729 Zürich Flughafen, Fracht 4096 \n",
"93112 47.452494 8.572057 Zürich Flughafen, Fracht 4096 \n",
"93113 47.402010 8.499374 Zürich, Albisgütli 1831 \n",
"93114 47.399730 8.504611 Zürich, Albisgütli 1831 \n",
"93115 47.397766 8.507252 Zürich, Albisgütli 1831 \n",
"\n",
" direction_id departure_first_stop route_int stop_count stop_int \\\n",
"93111 1 07:01:00 0 2 298 \n",
"93112 1 07:01:00 0 2 1295 \n",
"93113 0 07:00:00 1 26 1222 \n",
"93114 0 07:00:00 1 26 816 \n",
"93115 0 07:00:00 1 26 778 \n",
"\n",
- " route_desc sequence_shift_1 \n",
- "93111 Tram 28 \n",
- "93112 Tram 5 \n",
- "93113 Tram 6 \n",
- "93114 Tram 7 \n",
- "93115 Tram 8 "
+ " route_desc departure_first_shift_1 \n",
+ "93111 Tram 07:01:00 \n",
+ "93112 Tram 07:00:00 \n",
+ "93113 Tram 07:00:00 \n",
+ "93114 Tram 07:00:00 \n",
+ "93115 Tram 07:00:00 "
]
},
- "execution_count": 25,
+ "execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#adding a shift\n",
- "stop_times_ordered[\"sequence_shift_1\"] = stop_times_ordered[\"stop_sequence\"].shift(-1, fill_value=0)\n",
+ "stop_times_ordered[\"departure_first_shift_1\"] = stop_times_ordered[\"departure_first_stop\"].shift(-1, fill_value=0)\n",
"stop_times_ordered.head(5)"
]
},
{
"cell_type": "code",
- "execution_count": 26,
+ "execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
- "stop_times_ordered['departure_time'] = np.where((stop_times_ordered[\"stop_sequence\"] > stop_times_ordered[\"sequence_shift_1\"]), None, stop_times_ordered['departure_time'])"
+ "stop_times_ordered['departure_time'] = np.where((stop_times_ordered[\"departure_first_stop\"] != stop_times_ordered[\"departure_first_shift_1\"]), None, stop_times_ordered['departure_time'])"
]
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
- "stop_times_ordered[\"arrival_time\"] = np.where((stop_times_ordered[\"stop_sequence\"] == 1), None, stop_times_ordered['arrival_time'])"
+ "stop_times_ordered[\"arrival_time\"] = np.where((stop_times_ordered[\"departure_first_stop\"] == stop_times_ordered[\"departure_time\"]), None, stop_times_ordered['arrival_time'])"
]
},
{
"cell_type": "code",
- "execution_count": 28,
+ "execution_count": 61,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" arrival_time | \n",
" departure_time | \n",
- " stop_sequence | \n",
- " sequence_shift_1 | \n",
"
\n",
" \n",
" \n",
" \n",
" 93111 | \n",
- " 07:00:00 | \n",
+ " None | \n",
" 07:01:00 | \n",
- " 27 | \n",
- " 28 | \n",
"
\n",
" \n",
" 93112 | \n",
" 07:02:00 | \n",
" None | \n",
- " 28 | \n",
- " 5 | \n",
"
\n",
" \n",
" 93113 | \n",
+ " None | \n",
" 07:00:00 | \n",
- " 07:00:00 | \n",
- " 5 | \n",
- " 6 | \n",
"
\n",
" \n",
" 93114 | \n",
" 07:01:00 | \n",
" 07:01:00 | \n",
- " 6 | \n",
- " 7 | \n",
"
\n",
" \n",
" 93115 | \n",
" 07:02:00 | \n",
" 07:02:00 | \n",
- " 7 | \n",
- " 8 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
- " arrival_time departure_time stop_sequence sequence_shift_1\n",
- "93111 07:00:00 07:01:00 27 28\n",
- "93112 07:02:00 None 28 5\n",
- "93113 07:00:00 07:00:00 5 6\n",
- "93114 07:01:00 07:01:00 6 7\n",
- "93115 07:02:00 07:02:00 7 8"
+ " arrival_time departure_time\n",
+ "93111 None 07:01:00\n",
+ "93112 07:02:00 None\n",
+ "93113 None 07:00:00\n",
+ "93114 07:01:00 07:01:00\n",
+ "93115 07:02:00 07:02:00"
]
},
- "execution_count": 28,
+ "execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "stop_times_ordered[[\"arrival_time\",\"departure_time\", \"stop_sequence\", \"sequence_shift_1\"]].head(5)"
+ "stop_times_ordered[[\"arrival_time\",\"departure_time\"]].head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Array structure preparation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### StopTimes: \n",
"[[[[arrival_route0_trip0_stop0, departure_route0_trip0_stop0], [arrival_route0_trip0_stop1, departure_route0_trip0_stop1], …], [[arrival_route0_trip1_stop0, departure_route0_trip1_stop0], …], …], [[[arrival_route1_trip0_stop0, departure_route1_trip0_stop0], …], [[arrival_route1_trip1_stop0, departure_route1_trip1_stop0], …], …], …]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We convert the times to datetime, as required by the RAPTOR algorithm."
]
},
{
"cell_type": "code",
- "execution_count": 29,
+ "execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"stop_times_ordered['arrival_time'] = pd.to_datetime(stop_times_ordered['arrival_time'])\n",
"stop_times_ordered['departure_time'] = pd.to_datetime(stop_times_ordered['departure_time'])"
]
},
{
"cell_type": "code",
- "execution_count": 30,
+ "execution_count": 63,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" arrival_time | \n",
" departure_time | \n",
"
\n",
" \n",
" \n",
" \n",
" 93111 | \n",
- " 2020-05-22 07:00:00 | \n",
- " 2020-05-22 07:01:00 | \n",
+ " NaT | \n",
+ " 2020-05-23 07:01:00 | \n",
"
\n",
" \n",
" 93112 | \n",
- " 2020-05-22 07:02:00 | \n",
+ " 2020-05-23 07:02:00 | \n",
" NaT | \n",
"
\n",
" \n",
" 93113 | \n",
- " 2020-05-22 07:00:00 | \n",
- " 2020-05-22 07:00:00 | \n",
+ " NaT | \n",
+ " 2020-05-23 07:00:00 | \n",
"
\n",
" \n",
" 93114 | \n",
- " 2020-05-22 07:01:00 | \n",
- " 2020-05-22 07:01:00 | \n",
+ " 2020-05-23 07:01:00 | \n",
+ " 2020-05-23 07:01:00 | \n",
"
\n",
" \n",
" 93115 | \n",
- " 2020-05-22 07:02:00 | \n",
- " 2020-05-22 07:02:00 | \n",
+ " 2020-05-23 07:02:00 | \n",
+ " 2020-05-23 07:02:00 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" arrival_time departure_time\n",
- "93111 2020-05-22 07:00:00 2020-05-22 07:01:00\n",
- "93112 2020-05-22 07:02:00 NaT\n",
- "93113 2020-05-22 07:00:00 2020-05-22 07:00:00\n",
- "93114 2020-05-22 07:01:00 2020-05-22 07:01:00\n",
- "93115 2020-05-22 07:02:00 2020-05-22 07:02:00"
+ "93111 NaT 2020-05-23 07:01:00\n",
+ "93112 2020-05-23 07:02:00 NaT\n",
+ "93113 NaT 2020-05-23 07:00:00\n",
+ "93114 2020-05-23 07:01:00 2020-05-23 07:01:00\n",
+ "93115 2020-05-23 07:02:00 2020-05-23 07:02:00"
]
},
- "execution_count": 30,
+ "execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stop_times_ordered[[\"arrival_time\", \"departure_time\"]].head(5)"
]
},
{
"cell_type": "code",
- "execution_count": 31,
+ "execution_count": 129,
"metadata": {},
"outputs": [],
"source": [
- "with open('../data/stop_times_df.pkl','wb') as f: pickle.dump(stop_times_ordered, f)"
+ "with open('../data/stop_times_df_cyril.pkl','wb') as f: pickle.dump(stop_times_ordered, f)"
]
},
{
"cell_type": "code",
- "execution_count": 33,
+ "execution_count": 66,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" route_id | \n",
" stop_id_general | \n",
" trip_id | \n",
" stop_id | \n",
" arrival_time | \n",
" departure_time | \n",
" stop_sequence | \n",
" stop_name | \n",
" stop_lat | \n",
" stop_lon | \n",
" trip_headsign | \n",
" trip_short_name | \n",
" direction_id | \n",
" departure_first_stop | \n",
" route_int | \n",
" stop_count | \n",
" stop_int | \n",
" route_desc | \n",
- " sequence_shift_1 | \n",
+ " departure_first_shift_1 | \n",
"
\n",
" \n",
" \n",
" \n",
" 93111 | \n",
" 26-10-j19-1 | \n",
" 8573205 | \n",
" 1672.TA.26-10-j19-1.11.R | \n",
" 8573205 | \n",
- " 2020-05-22 07:00:00 | \n",
- " 2020-05-22 07:01:00 | \n",
+ " NaT | \n",
+ " 2020-05-23 07:01:00 | \n",
" 27 | \n",
" Zürich Flughafen, Bahnhof | \n",
" 47.450441 | \n",
" 8.563729 | \n",
" Zürich Flughafen, Fracht | \n",
" 4096 | \n",
" 1 | \n",
" 07:01:00 | \n",
" 0 | \n",
" 2 | \n",
" 298 | \n",
" Tram | \n",
- " 28 | \n",
+ " 07:01:00 | \n",
"
\n",
" \n",
" 93112 | \n",
" 26-10-j19-1 | \n",
" 8588553 | \n",
" 1672.TA.26-10-j19-1.11.R | \n",
" 8588553 | \n",
- " 2020-05-22 07:02:00 | \n",
+ " 2020-05-23 07:02:00 | \n",
" NaT | \n",
" 28 | \n",
" Zürich Flughafen, Fracht | \n",
" 47.452494 | \n",
" 8.572057 | \n",
" Zürich Flughafen, Fracht | \n",
" 4096 | \n",
" 1 | \n",
" 07:01:00 | \n",
" 0 | \n",
" 2 | \n",
" 1295 | \n",
" Tram | \n",
- " 5 | \n",
+ " 07:00:00 | \n",
"
\n",
" \n",
" 93113 | \n",
" 26-13-j19-1 | \n",
" 8576240 | \n",
" 2064.TA.26-13-j19-1.24.H | \n",
" 8576240 | \n",
- " 2020-05-22 07:00:00 | \n",
- " 2020-05-22 07:00:00 | \n",
+ " NaT | \n",
+ " 2020-05-23 07:00:00 | \n",
" 5 | \n",
" Zürich, Meierhofplatz | \n",
" 47.402010 | \n",
" 8.499374 | \n",
" Zürich, Albisgütli | \n",
" 1831 | \n",
" 0 | \n",
" 07:00:00 | \n",
" 1 | \n",
" 26 | \n",
" 1222 | \n",
" Tram | \n",
- " 6 | \n",
+ " 07:00:00 | \n",
"
\n",
" \n",
" 93114 | \n",
" 26-13-j19-1 | \n",
" 8591353 | \n",
" 2064.TA.26-13-j19-1.24.H | \n",
" 8591353 | \n",
- " 2020-05-22 07:01:00 | \n",
- " 2020-05-22 07:01:00 | \n",
+ " 2020-05-23 07:01:00 | \n",
+ " 2020-05-23 07:01:00 | \n",
" 6 | \n",
" Zürich, Schwert | \n",
" 47.399730 | \n",
" 8.504611 | \n",
" Zürich, Albisgütli | \n",
" 1831 | \n",
" 0 | \n",
" 07:00:00 | \n",
" 1 | \n",
" 26 | \n",
" 816 | \n",
" Tram | \n",
- " 7 | \n",
+ " 07:00:00 | \n",
"
\n",
" \n",
" 93115 | \n",
" 26-13-j19-1 | \n",
" 8591039 | \n",
" 2064.TA.26-13-j19-1.24.H | \n",
" 8591039 | \n",
- " 2020-05-22 07:02:00 | \n",
- " 2020-05-22 07:02:00 | \n",
+ " 2020-05-23 07:02:00 | \n",
+ " 2020-05-23 07:02:00 | \n",
" 7 | \n",
" Zürich, Alte Trotte | \n",
" 47.397766 | \n",
" 8.507252 | \n",
" Zürich, Albisgütli | \n",
" 1831 | \n",
" 0 | \n",
" 07:00:00 | \n",
" 1 | \n",
" 26 | \n",
" 778 | \n",
" Tram | \n",
- " 8 | \n",
+ " 07:00:00 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" route_id stop_id_general trip_id stop_id \\\n",
"93111 26-10-j19-1 8573205 1672.TA.26-10-j19-1.11.R 8573205 \n",
"93112 26-10-j19-1 8588553 1672.TA.26-10-j19-1.11.R 8588553 \n",
"93113 26-13-j19-1 8576240 2064.TA.26-13-j19-1.24.H 8576240 \n",
"93114 26-13-j19-1 8591353 2064.TA.26-13-j19-1.24.H 8591353 \n",
"93115 26-13-j19-1 8591039 2064.TA.26-13-j19-1.24.H 8591039 \n",
"\n",
" arrival_time departure_time stop_sequence \\\n",
- "93111 2020-05-22 07:00:00 2020-05-22 07:01:00 27 \n",
- "93112 2020-05-22 07:02:00 NaT 28 \n",
- "93113 2020-05-22 07:00:00 2020-05-22 07:00:00 5 \n",
- "93114 2020-05-22 07:01:00 2020-05-22 07:01:00 6 \n",
- "93115 2020-05-22 07:02:00 2020-05-22 07:02:00 7 \n",
+ "93111 NaT 2020-05-23 07:01:00 27 \n",
+ "93112 2020-05-23 07:02:00 NaT 28 \n",
+ "93113 NaT 2020-05-23 07:00:00 5 \n",
+ "93114 2020-05-23 07:01:00 2020-05-23 07:01:00 6 \n",
+ "93115 2020-05-23 07:02:00 2020-05-23 07:02:00 7 \n",
"\n",
" stop_name stop_lat stop_lon \\\n",
"93111 Zürich Flughafen, Bahnhof 47.450441 8.563729 \n",
"93112 Zürich Flughafen, Fracht 47.452494 8.572057 \n",
"93113 Zürich, Meierhofplatz 47.402010 8.499374 \n",
"93114 Zürich, Schwert 47.399730 8.504611 \n",
"93115 Zürich, Alte Trotte 47.397766 8.507252 \n",
"\n",
" trip_headsign trip_short_name direction_id \\\n",
"93111 Zürich Flughafen, Fracht 4096 1 \n",
"93112 Zürich Flughafen, Fracht 4096 1 \n",
"93113 Zürich, Albisgütli 1831 0 \n",
"93114 Zürich, Albisgütli 1831 0 \n",
"93115 Zürich, Albisgütli 1831 0 \n",
"\n",
" departure_first_stop route_int stop_count stop_int route_desc \\\n",
"93111 07:01:00 0 2 298 Tram \n",
"93112 07:01:00 0 2 1295 Tram \n",
"93113 07:00:00 1 26 1222 Tram \n",
"93114 07:00:00 1 26 816 Tram \n",
"93115 07:00:00 1 26 778 Tram \n",
"\n",
- " sequence_shift_1 \n",
- "93111 28 \n",
- "93112 5 \n",
- "93113 6 \n",
- "93114 7 \n",
- "93115 8 "
+ " departure_first_shift_1 \n",
+ "93111 07:01:00 \n",
+ "93112 07:00:00 \n",
+ "93113 07:00:00 \n",
+ "93114 07:00:00 \n",
+ "93115 07:00:00 "
]
},
- "execution_count": 33,
+ "execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stop_times_ordered = stop_times_ordered.sort_values(by=[\"route_int\", \"departure_first_stop\", \"stop_sequence\"])\n",
"stop_times_ordered.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"And we transform it to an array, ready to be used by RAPTOR."
]
},
{
"cell_type": "code",
- "execution_count": 34,
+ "execution_count": 67,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "array([['2020-05-22T07:00:00.000000000', '2020-05-22T07:01:00.000000000'],\n",
- " ['2020-05-22T07:02:00.000000000', 'NaT'],\n",
- " ['2020-05-22T07:00:00.000000000', '2020-05-22T07:00:00.000000000'],\n",
+ "array([[ 'NaT', '2020-05-23T07:01:00.000000000'],\n",
+ " ['2020-05-23T07:02:00.000000000', 'NaT'],\n",
+ " [ 'NaT', '2020-05-23T07:00:00.000000000'],\n",
" ...,\n",
- " ['2020-05-22T07:35:00.000000000', '2020-05-22T07:35:00.000000000'],\n",
- " ['2020-05-22T07:36:00.000000000', '2020-05-22T07:36:00.000000000'],\n",
- " ['2020-05-22T07:37:00.000000000', 'NaT']],\n",
+ " ['2020-05-23T07:35:00.000000000', '2020-05-23T07:35:00.000000000'],\n",
+ " ['2020-05-23T07:36:00.000000000', '2020-05-23T07:36:00.000000000'],\n",
+ " ['2020-05-23T07:37:00.000000000', 'NaT']],\n",
" dtype='datetime64[ns]')"
]
},
- "execution_count": 34,
+ "execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stop_times_array = stop_times_ordered[[\"arrival_time\", \"departure_time\"]].to_numpy()\n",
"stop_times_array"
]
},
{
"cell_type": "code",
- "execution_count": 35,
+ "execution_count": 68,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"260459"
]
},
- "execution_count": 35,
+ "execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.size(stop_times_array,0)"
]
},
{
"cell_type": "code",
- "execution_count": 38,
+ "execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"with open('../data/stop_times_array_cyril.pkl','wb') as f: pickle.dump(stop_times_array, f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Routes: \n",
"[[route0_nr.Trips, route0_nr.Stops, route0_pointerRouteStops, route0_pointerStopTimes], [route1_nr.Trips, route1_nr.Stops, route1_pointerRouteStops, route1_pointerStopTimes], …]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We start by getting the number of distinct trips and stops there are for each route."
]
},
{
"cell_type": "code",
- "execution_count": 40,
+ "execution_count": 70,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" n_Trips | \n",
" n_stops | \n",
"
\n",
" \n",
" route_int | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 26 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" 8 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 17 | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" 5 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" n_Trips n_stops\n",
"route_int \n",
"0 1 2\n",
"1 1 26\n",
"2 1 8\n",
"3 1 17\n",
"4 1 5"
]
},
- "execution_count": 40,
+ "execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"distinct_trips_stops = stop_times_ordered.groupby([\"route_int\"]).nunique()[[\"trip_id\",\"stop_int\"]].sort_index().rename(columns={\"trip_id\": \"n_Trips\", \"stop_int\": \"n_stops\"})\n",
"distinct_trips_stops.head(5)"
]
},
{
"cell_type": "code",
- "execution_count": 41,
+ "execution_count": 71,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1461, 2)"
]
},
- "execution_count": 41,
+ "execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"distinct_trips_stops.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We create the pointer into route stops as the cumulative sum of the number of unique stops of all preceding routes."
]
},
{
"cell_type": "code",
- "execution_count": 42,
+ "execution_count": 72,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" n_Trips | \n",
" n_stops | \n",
" pointer_routes_stops | \n",
"
\n",
" \n",
" route_int | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 26 | \n",
" 2 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" 8 | \n",
" 28 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 17 | \n",
" 36 | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" 5 | \n",
" 53 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" n_Trips n_stops pointer_routes_stops\n",
"route_int \n",
"0 1 2 0\n",
"1 1 26 2\n",
"2 1 8 28\n",
"3 1 17 36\n",
"4 1 5 53"
]
},
- "execution_count": 42,
+ "execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"distinct_trips_stops['pointer_routes_stops'] = distinct_trips_stops.n_stops.cumsum().shift(1, fill_value=0)\n",
"distinct_trips_stops.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We create the pointer into stop_times as the cumulative sum of the number of stop_times rows of all preceding routes, counting duplicate stops (due to several trips)."
]
},
{
"cell_type": "code",
- "execution_count": 43,
+ "execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
"distinct_trips_stops[\"pointer_stop_times\"] = (stop_times_ordered.groupby([\"route_int\"]).count().stop_id).cumsum().shift(1, fill_value=0)"
]
},
{
"cell_type": "code",
- "execution_count": 44,
+ "execution_count": 74,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" n_Trips | \n",
" n_stops | \n",
" pointer_routes_stops | \n",
" pointer_stop_times | \n",
" pointer_routes_stops_shift | \n",
" pointer_stop_times_shift | \n",
"
\n",
" \n",
" route_int | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 2 | \n",
" 2 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 26 | \n",
" 2 | \n",
" 2 | \n",
" 28 | \n",
" 28 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" 8 | \n",
" 28 | \n",
" 28 | \n",
" 36 | \n",
" 36 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 17 | \n",
" 36 | \n",
" 36 | \n",
" 53 | \n",
" 53 | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" 5 | \n",
" 53 | \n",
" 53 | \n",
" 58 | \n",
" 58 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" n_Trips n_stops pointer_routes_stops pointer_stop_times \\\n",
"route_int \n",
"0 1 2 0 0 \n",
"1 1 26 2 2 \n",
"2 1 8 28 28 \n",
"3 1 17 36 36 \n",
"4 1 5 53 53 \n",
"\n",
" pointer_routes_stops_shift pointer_stop_times_shift \n",
"route_int \n",
"0 2 2 \n",
"1 28 28 \n",
"2 36 36 \n",
"3 53 53 \n",
"4 58 58 "
]
},
- "execution_count": 44,
+ "execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"distinct_trips_stops[\"pointer_routes_stops_shift\"] = distinct_trips_stops['pointer_routes_stops'].shift(-1, fill_value=0)\n",
"distinct_trips_stops[\"pointer_stop_times_shift\"] = distinct_trips_stops['pointer_stop_times'].shift(-1, fill_value=0)\n",
"distinct_trips_stops.head(5)"
]
},
{
"cell_type": "code",
- "execution_count": 45,
+ "execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
"distinct_trips_stops['pointer_routes_stops'] = np.where((distinct_trips_stops[\"pointer_routes_stops\"] == distinct_trips_stops[\"pointer_routes_stops_shift\"]), None, distinct_trips_stops['pointer_routes_stops'])\n",
"distinct_trips_stops['pointer_stop_times'] = np.where((distinct_trips_stops[\"pointer_stop_times\"] == distinct_trips_stops[\"pointer_stop_times_shift\"]), None, distinct_trips_stops['pointer_stop_times'])\n"
]
},
{
"cell_type": "code",
- "execution_count": 46,
+ "execution_count": 76,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"n_Trips False\n",
"n_stops False\n",
"pointer_routes_stops False\n",
"pointer_stop_times False\n",
"pointer_routes_stops_shift False\n",
"pointer_stop_times_shift False\n",
"dtype: bool"
]
},
- "execution_count": 46,
+ "execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"distinct_trips_stops.isna().any()"
]
},
{
"cell_type": "code",
- "execution_count": 47,
+ "execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
"with open('../data/routes_array_df_cyril.pkl','wb') as f: pickle.dump(distinct_trips_stops[['n_Trips', 'n_stops', 'pointer_routes_stops', 'pointer_stop_times']], f)"
]
},
{
"cell_type": "code",
- "execution_count": 48,
+ "execution_count": 78,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Int64Index: 1461 entries, 0 to 1460\n",
"Data columns (total 6 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 n_Trips 1461 non-null int64 \n",
" 1 n_stops 1461 non-null int64 \n",
" 2 pointer_routes_stops 1461 non-null object\n",
" 3 pointer_stop_times 1461 non-null object\n",
" 4 pointer_routes_stops_shift 1461 non-null int64 \n",
" 5 pointer_stop_times_shift 1461 non-null int64 \n",
"dtypes: int64(4), object(2)\n",
"memory usage: 79.9+ KB\n"
]
}
],
"source": [
"distinct_trips_stops.info()"
]
},
{
"cell_type": "code",
- "execution_count": 49,
+ "execution_count": 79,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[1, 2, 0, 0],\n",
" [1, 26, 2, 2],\n",
" [1, 8, 28, 28],\n",
" ...,\n",
" [1, 3, 15297, 260396],\n",
" [2, 16, 15300, 260399],\n",
" [1, 28, 15316, 260431]], dtype=object)"
]
},
- "execution_count": 49,
+ "execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"routes_array = distinct_trips_stops[['n_Trips', 'n_stops', 'pointer_routes_stops', 'pointer_stop_times']].to_numpy()\n",
"routes_array"
]
},
{
"cell_type": "code",
- "execution_count": 50,
+ "execution_count": 80,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1461"
]
},
- "execution_count": 50,
+ "execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.size(routes_array, 0)"
]
},
{
"cell_type": "code",
- "execution_count": 51,
+ "execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
"with open('../data/routes_array_cyril.pkl','wb') as f: pickle.dump(routes_array, f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"RouteStops: [route0_stop0, route0_stop1,…, route1_stop0, route1_stop1,…, …]\n"
]
},
{
"cell_type": "code",
- "execution_count": 52,
+ "execution_count": 82,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" index | \n",
" route_int | \n",
" stop_int | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 93111 | \n",
" 0 | \n",
" 298 | \n",
"
\n",
" \n",
" 1 | \n",
" 93112 | \n",
" 0 | \n",
" 1295 | \n",
"
\n",
" \n",
" 2 | \n",
" 93113 | \n",
" 1 | \n",
" 1222 | \n",
"
\n",
" \n",
" 3 | \n",
" 93114 | \n",
" 1 | \n",
" 816 | \n",
"
\n",
" \n",
" 4 | \n",
" 93115 | \n",
" 1 | \n",
" 778 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" index route_int stop_int\n",
"0 93111 0 298\n",
"1 93112 0 1295\n",
"2 93113 1 1222\n",
"3 93114 1 816\n",
"4 93115 1 778"
]
},
- "execution_count": 52,
+ "execution_count": 82,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"route_stops = stop_times_ordered.sort_values([\"route_int\", \"stop_sequence\"])\n",
"route_stops = route_stops[['route_int', 'stop_int']].drop_duplicates().reset_index()\n",
"route_stops.head(5)"
]
},
{
"cell_type": "code",
- "execution_count": 53,
+ "execution_count": 83,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 15344 entries, 0 to 15343\n",
"Data columns (total 3 columns):\n",
" # Column Non-Null Count Dtype\n",
"--- ------ -------------- -----\n",
" 0 index 15344 non-null int64\n",
" 1 route_int 15344 non-null int64\n",
" 2 stop_int 15344 non-null int64\n",
"dtypes: int64(3)\n",
"memory usage: 359.8 KB\n"
]
}
],
"source": [
"route_stops.info()"
]
},
{
"cell_type": "code",
- "execution_count": 54,
+ "execution_count": 84,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1461"
]
},
- "execution_count": 54,
+ "execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"route_stops.route_int.nunique()"
]
},
{
"cell_type": "code",
- "execution_count": 55,
+ "execution_count": 85,
"metadata": {},
"outputs": [],
"source": [
"with open('../data/route_stops_df_cyril.pkl','wb') as f: pickle.dump(route_stops, f)"
]
},
{
"cell_type": "code",
- "execution_count": 56,
+ "execution_count": 86,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 298, 1295, 1222, ..., 1349, 1042, 549])"
]
},
- "execution_count": 56,
+ "execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"route_stops_array = route_stops.stop_int.to_numpy()\n",
"route_stops_array"
]
},
{
"cell_type": "code",
- "execution_count": 57,
+ "execution_count": 87,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1407"
]
},
- "execution_count": 57,
+ "execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.size(np.unique(route_stops_array))"
]
},
{
"cell_type": "code",
- "execution_count": 58,
+ "execution_count": 88,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"15344"
]
},
- "execution_count": 58,
+ "execution_count": 88,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.size(route_stops_array, 0)"
]
},
{
"cell_type": "code",
- "execution_count": 59,
+ "execution_count": 89,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(15344,)"
]
},
- "execution_count": 59,
+ "execution_count": 89,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"route_stops_array.shape"
]
},
{
"cell_type": "code",
- "execution_count": 60,
+ "execution_count": 90,
"metadata": {},
"outputs": [],
"source": [
"with open('../data/route_stops_array_cyril.pkl','wb') as f: pickle.dump(route_stops_array, f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Check that the pointers are correct.\n",
"It is essential that the indexes in Routes, which serve as pointers, are correct."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We start by looking at where the indexes for stop_times and route_stops diverge. This will allow us to change. We can see that Route stops should have a new route at 3 while stop_times should have it at 78, so we try with that"
]
},
{
"cell_type": "code",
- "execution_count": 61,
+ "execution_count": 91,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" n_Trips | \n",
" n_stops | \n",
" pointer_routes_stops | \n",
" pointer_stop_times | \n",
" pointer_routes_stops_shift | \n",
" pointer_stop_times_shift | \n",
"
\n",
" \n",
" route_int | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 2 | \n",
" 2 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 26 | \n",
" 2 | \n",
" 2 | \n",
" 28 | \n",
" 28 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" 8 | \n",
" 28 | \n",
" 28 | \n",
" 36 | \n",
" 36 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 17 | \n",
" 36 | \n",
" 36 | \n",
" 53 | \n",
" 53 | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" 5 | \n",
" 53 | \n",
" 53 | \n",
" 58 | \n",
" 58 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" n_Trips n_stops pointer_routes_stops pointer_stop_times \\\n",
"route_int \n",
"0 1 2 0 0 \n",
"1 1 26 2 2 \n",
"2 1 8 28 28 \n",
"3 1 17 36 36 \n",
"4 1 5 53 53 \n",
"\n",
" pointer_routes_stops_shift pointer_stop_times_shift \n",
"route_int \n",
"0 2 2 \n",
"1 28 28 \n",
"2 36 36 \n",
"3 53 53 \n",
"4 58 58 "
]
},
- "execution_count": 61,
+ "execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"distinct_trips_stops.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can check if the pointer indicates the routes index number. At the pointer_routes should indicate the first stop of a new route. We try with 3 to see if route_stops has a new route at this index. It does so it works"
]
},
{
"cell_type": "code",
- "execution_count": 62,
+ "execution_count": 92,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" index | \n",
" route_int | \n",
" stop_int | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 93111 | \n",
" 0 | \n",
" 298 | \n",
"
\n",
" \n",
" 1 | \n",
" 93112 | \n",
" 0 | \n",
" 1295 | \n",
"
\n",
" \n",
" 2 | \n",
" 93113 | \n",
" 1 | \n",
" 1222 | \n",
"
\n",
" \n",
" 3 | \n",
" 93114 | \n",
" 1 | \n",
" 816 | \n",
"
\n",
" \n",
" 4 | \n",
" 93115 | \n",
" 1 | \n",
" 778 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" index route_int stop_int\n",
"0 93111 0 298\n",
"1 93112 0 1295\n",
"2 93113 1 1222\n",
"3 93114 1 816\n",
"4 93115 1 778"
]
},
- "execution_count": 62,
+ "execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"route_stops.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We go and see if stop_times has a new route at 78. It does, so it works"
]
},
{
"cell_type": "code",
- "execution_count": 63,
+ "execution_count": 93,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" route_id | \n",
" stop_id_general | \n",
" trip_id | \n",
" stop_id | \n",
" arrival_time | \n",
" departure_time | \n",
" stop_sequence | \n",
" stop_name | \n",
" stop_lat | \n",
" stop_lon | \n",
" trip_headsign | \n",
" trip_short_name | \n",
" direction_id | \n",
" departure_first_stop | \n",
" route_int | \n",
" stop_count | \n",
" stop_int | \n",
" route_desc | \n",
- " sequence_shift_1 | \n",
+ " departure_first_shift_1 | \n",
"
\n",
" \n",
" \n",
" \n",
" 75 | \n",
" 26-66-j19-1 | \n",
" 8591098 | \n",
" 8.TA.26-66-j19-1.1.H | \n",
" 8591098 | \n",
- " 2020-05-22 18:04:00 | \n",
- " 2020-05-22 18:04:00 | \n",
+ " 2020-05-23 18:04:00 | \n",
+ " 2020-05-23 18:04:00 | \n",
" 6 | \n",
" Zürich, Brunau/Mutschellenstr. | \n",
" 47.355147 | \n",
" 8.527141 | \n",
" Zürich, Neubühl | \n",
" 3762 | \n",
" 0 | \n",
" 17:55:00 | \n",
" 1225 | \n",
" 12 | \n",
" 512 | \n",
" Bus | \n",
- " 7 | \n",
+ " 17:55:00 | \n",
"
\n",
" \n",
" 76 | \n",
" 26-66-j19-1 | \n",
" 8591392 | \n",
" 8.TA.26-66-j19-1.1.H | \n",
" 8591392 | \n",
- " 2020-05-22 18:05:00 | \n",
- " 2020-05-22 18:05:00 | \n",
+ " 2020-05-23 18:05:00 | \n",
+ " 2020-05-23 18:05:00 | \n",
" 7 | \n",
" Zürich, Thujastrasse | \n",
" 47.350187 | \n",
" 8.527806 | \n",
" Zürich, Neubühl | \n",
" 3762 | \n",
" 0 | \n",
" 17:55:00 | \n",
" 1225 | \n",
" 12 | \n",
" 403 | \n",
" Bus | \n",
- " 8 | \n",
+ " 17:55:00 | \n",
"
\n",
" \n",
" 77 | \n",
" 26-66-j19-1 | \n",
" 8591216 | \n",
" 8.TA.26-66-j19-1.1.H | \n",
" 8591216 | \n",
- " 2020-05-22 18:06:00 | \n",
- " 2020-05-22 18:06:00 | \n",
+ " 2020-05-23 18:06:00 | \n",
+ " 2020-05-23 18:06:00 | \n",
" 8 | \n",
" Zürich, Jugendherberge | \n",
" 47.348002 | \n",
" 8.528210 | \n",
" Zürich, Neubühl | \n",
" 3762 | \n",
" 0 | \n",
" 17:55:00 | \n",
" 1225 | \n",
" 12 | \n",
" 1375 | \n",
" Bus | \n",
- " 9 | \n",
+ " 17:55:00 | \n",
"
\n",
" \n",
" 78 | \n",
" 26-66-j19-1 | \n",
" 8591279 | \n",
" 8.TA.26-66-j19-1.1.H | \n",
" 8591279 | \n",
- " 2020-05-22 18:08:00 | \n",
- " 2020-05-22 18:08:00 | \n",
+ " 2020-05-23 18:08:00 | \n",
+ " 2020-05-23 18:08:00 | \n",
" 9 | \n",
" Zürich, Morgental | \n",
" 47.343948 | \n",
" 8.530141 | \n",
" Zürich, Neubühl | \n",
" 3762 | \n",
" 0 | \n",
" 17:55:00 | \n",
" 1225 | \n",
" 12 | \n",
" 1349 | \n",
" Bus | \n",
- " 10 | \n",
+ " 17:55:00 | \n",
"
\n",
" \n",
" 79 | \n",
" 26-66-j19-1 | \n",
" 8591217 | \n",
" 8.TA.26-66-j19-1.1.H | \n",
" 8591217 | \n",
- " 2020-05-22 18:09:00 | \n",
- " 2020-05-22 18:09:00 | \n",
+ " 2020-05-23 18:09:00 | \n",
+ " 2020-05-23 18:09:00 | \n",
" 10 | \n",
" Zürich, Kalchbühlweg | \n",
" 47.341818 | \n",
" 8.531049 | \n",
" Zürich, Neubühl | \n",
" 3762 | \n",
" 0 | \n",
" 17:55:00 | \n",
" 1225 | \n",
" 12 | \n",
" 1303 | \n",
" Bus | \n",
- " 11 | \n",
+ " 17:55:00 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" route_id stop_id_general trip_id stop_id \\\n",
"75 26-66-j19-1 8591098 8.TA.26-66-j19-1.1.H 8591098 \n",
"76 26-66-j19-1 8591392 8.TA.26-66-j19-1.1.H 8591392 \n",
"77 26-66-j19-1 8591216 8.TA.26-66-j19-1.1.H 8591216 \n",
"78 26-66-j19-1 8591279 8.TA.26-66-j19-1.1.H 8591279 \n",
"79 26-66-j19-1 8591217 8.TA.26-66-j19-1.1.H 8591217 \n",
"\n",
" arrival_time departure_time stop_sequence \\\n",
- "75 2020-05-22 18:04:00 2020-05-22 18:04:00 6 \n",
- "76 2020-05-22 18:05:00 2020-05-22 18:05:00 7 \n",
- "77 2020-05-22 18:06:00 2020-05-22 18:06:00 8 \n",
- "78 2020-05-22 18:08:00 2020-05-22 18:08:00 9 \n",
- "79 2020-05-22 18:09:00 2020-05-22 18:09:00 10 \n",
+ "75 2020-05-23 18:04:00 2020-05-23 18:04:00 6 \n",
+ "76 2020-05-23 18:05:00 2020-05-23 18:05:00 7 \n",
+ "77 2020-05-23 18:06:00 2020-05-23 18:06:00 8 \n",
+ "78 2020-05-23 18:08:00 2020-05-23 18:08:00 9 \n",
+ "79 2020-05-23 18:09:00 2020-05-23 18:09:00 10 \n",
"\n",
" stop_name stop_lat stop_lon trip_headsign \\\n",
"75 Zürich, Brunau/Mutschellenstr. 47.355147 8.527141 Zürich, Neubühl \n",
"76 Zürich, Thujastrasse 47.350187 8.527806 Zürich, Neubühl \n",
"77 Zürich, Jugendherberge 47.348002 8.528210 Zürich, Neubühl \n",
"78 Zürich, Morgental 47.343948 8.530141 Zürich, Neubühl \n",
"79 Zürich, Kalchbühlweg 47.341818 8.531049 Zürich, Neubühl \n",
"\n",
" trip_short_name direction_id departure_first_stop route_int stop_count \\\n",
"75 3762 0 17:55:00 1225 12 \n",
"76 3762 0 17:55:00 1225 12 \n",
"77 3762 0 17:55:00 1225 12 \n",
"78 3762 0 17:55:00 1225 12 \n",
"79 3762 0 17:55:00 1225 12 \n",
"\n",
- " stop_int route_desc sequence_shift_1 \n",
- "75 512 Bus 7 \n",
- "76 403 Bus 8 \n",
- "77 1375 Bus 9 \n",
- "78 1349 Bus 10 \n",
- "79 1303 Bus 11 "
+ " stop_int route_desc departure_first_shift_1 \n",
+ "75 512 Bus 17:55:00 \n",
+ "76 403 Bus 17:55:00 \n",
+ "77 1375 Bus 17:55:00 \n",
+ "78 1349 Bus 17:55:00 \n",
+ "79 1303 Bus 17:55:00 "
]
},
- "execution_count": 63,
+ "execution_count": 93,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stop_times_ordered.loc[75:80].head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Stops: [[stop0_pointerRoutes, stop0_pointerTransfer], [stop1_pointerRoutes, stop1_pointerTransfer], …]"
]
},
{
"cell_type": "code",
- "execution_count": 64,
+ "execution_count": 94,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" index | \n",
" route_int | \n",
" stop_int | \n",
" Unnamed: 0 | \n",
" stop_id | \n",
" stop_id2 | \n",
" distance | \n",
" Transfer_time_sec | \n",
" stop_name | \n",
" stop_name2 | \n",
" stop_int_2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 93111 | \n",
" 0 | \n",
" 298 | \n",
" 4536.0 | \n",
" 8573205 | \n",
" 8503016:0:2 | \n",
" 0.101546 | \n",
" 121.0 | \n",
" Zürich Flughafen, Bahnhof | \n",
" Zürich Flughafen | \n",
" 1218.0 | \n",
"
\n",
" \n",
" 0 | \n",
" 93111 | \n",
" 0 | \n",
" 298 | \n",
" 4558.0 | \n",
" 8573205:0:A | \n",
" 8503016:0:2 | \n",
" 0.118159 | \n",
" 141.0 | \n",
" Zürich Flughafen, Bahnhof | \n",
" Zürich Flughafen | \n",
" 1218.0 | \n",
"
\n",
" \n",
" 0 | \n",
" 93111 | \n",
" 0 | \n",
" 298 | \n",
" 4580.0 | \n",
" 8573205:0:B | \n",
" 8503016:0:2 | \n",
" 0.104861 | \n",
" 125.0 | \n",
" Zürich Flughafen, Bahnhof | \n",
" Zürich Flughafen | \n",
" 1218.0 | \n",
"
\n",
" \n",
" 0 | \n",
" 93111 | \n",
" 0 | \n",
" 298 | \n",
" 4624.0 | \n",
" 8573205:0:D | \n",
" 8503016:0:2 | \n",
" 0.103327 | \n",
" 123.0 | \n",
" Zürich Flughafen, Bahnhof | \n",
" Zürich Flughafen | \n",
" 1218.0 | \n",
"
\n",
" \n",
" 0 | \n",
" 93111 | \n",
" 0 | \n",
" 298 | \n",
" 4646.0 | \n",
" 8573205:0:E | \n",
" 8503016:0:2 | \n",
" 0.101546 | \n",
" 121.0 | \n",
" Zürich Flughafen, Bahnhof | \n",
" Zürich Flughafen | \n",
" 1218.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" index route_int stop_int Unnamed: 0 stop_id stop_id2 distance \\\n",
"0 93111 0 298 4536.0 8573205 8503016:0:2 0.101546 \n",
"0 93111 0 298 4558.0 8573205:0:A 8503016:0:2 0.118159 \n",
"0 93111 0 298 4580.0 8573205:0:B 8503016:0:2 0.104861 \n",
"0 93111 0 298 4624.0 8573205:0:D 8503016:0:2 0.103327 \n",
"0 93111 0 298 4646.0 8573205:0:E 8503016:0:2 0.101546 \n",
"\n",
" Transfer_time_sec stop_name stop_name2 stop_int_2 \n",
"0 121.0 Zürich Flughafen, Bahnhof Zürich Flughafen 1218.0 \n",
"0 141.0 Zürich Flughafen, Bahnhof Zürich Flughafen 1218.0 \n",
"0 125.0 Zürich Flughafen, Bahnhof Zürich Flughafen 1218.0 \n",
"0 123.0 Zürich Flughafen, Bahnhof Zürich Flughafen 1218.0 \n",
"0 121.0 Zürich Flughafen, Bahnhof Zürich Flughafen 1218.0 "
]
},
- "execution_count": 64,
+ "execution_count": 94,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stops_join = route_stops.join(transfers.set_index(\"stop_int\"), how=\"left\", on=\"stop_int\").drop_duplicates()\n",
"stops_join.head(5)"
]
},
{
"cell_type": "code",
- "execution_count": 65,
+ "execution_count": 95,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1407"
]
},
- "execution_count": 65,
+ "execution_count": 95,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stops_join.stop_int.nunique()"
]
},
{
"cell_type": "code",
- "execution_count": 66,
+ "execution_count": 96,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" n_Routes | \n",
" n_Transfers | \n",
"
\n",
" \n",
" stop_int | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 18 | \n",
" 16 | \n",
"
\n",
" \n",
" 1 | \n",
" 11 | \n",
" 2 | \n",
"
\n",
" \n",
" 2 | \n",
" 23 | \n",
" 4 | \n",
"
\n",
" \n",
" 3 | \n",
" 6 | \n",
" 6 | \n",
"
\n",
" \n",
" 4 | \n",
" 6 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" n_Routes n_Transfers\n",
"stop_int \n",
"0 18 16\n",
"1 11 2\n",
"2 23 4\n",
"3 6 6\n",
"4 6 0"
]
},
- "execution_count": 66,
+ "execution_count": 96,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"distinct_route_transfers = stops_join.sort_values(\"stop_int\").groupby([\"stop_int\"]).nunique().rename(columns={\"route_int\": \"n_Routes\", \"stop_int_2\": \"n_Transfers\"})\n",
"distinct_route_transfers = distinct_route_transfers[[\"n_Routes\", \"n_Transfers\"]].sort_index()\n",
"distinct_route_transfers.head(5)"
]
},
{
"cell_type": "code",
- "execution_count": 67,
+ "execution_count": 97,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" n_Routes | \n",
" n_Transfers | \n",
" pointer_stop_routes | \n",
" pointer_transfers | \n",
"
\n",
" \n",
" stop_int | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 18 | \n",
" 16 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 11 | \n",
" 2 | \n",
" 18 | \n",
" 16 | \n",
"
\n",
" \n",
" 2 | \n",
" 23 | \n",
" 4 | \n",
" 29 | \n",
" 18 | \n",
"
\n",
" \n",
" 3 | \n",
" 6 | \n",
" 6 | \n",
" 52 | \n",
" 22 | \n",
"
\n",
" \n",
" 4 | \n",
" 6 | \n",
" 0 | \n",
" 58 | \n",
" 28 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" n_Routes n_Transfers pointer_stop_routes pointer_transfers\n",
"stop_int \n",
"0 18 16 0 0\n",
"1 11 2 18 16\n",
"2 23 4 29 18\n",
"3 6 6 52 22\n",
"4 6 0 58 28"
]
},
- "execution_count": 67,
+ "execution_count": 97,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"distinct_route_transfers['pointer_stop_routes'] = distinct_route_transfers.n_Routes.cumsum().shift(1, fill_value=0)\n",
"distinct_route_transfers['pointer_transfers'] = distinct_route_transfers.n_Transfers.cumsum().shift(1, fill_value=0)\n",
"distinct_route_transfers.head(5)"
]
},
{
"cell_type": "code",
- "execution_count": 68,
+ "execution_count": 98,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" n_Routes | \n",
" n_Transfers | \n",
" pointer_stop_routes | \n",
" pointer_transfers | \n",
" pointer_stop_routes_shift | \n",
" pointer_transfers_shift | \n",
"
\n",
" \n",
" stop_int | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 18 | \n",
" 16 | \n",
" 0 | \n",
" 0 | \n",
" 18 | \n",
" 16 | \n",
"
\n",
" \n",
" 1 | \n",
" 11 | \n",
" 2 | \n",
" 18 | \n",
" 16 | \n",
" 29 | \n",
" 18 | \n",
"
\n",
" \n",
" 2 | \n",
" 23 | \n",
" 4 | \n",
" 29 | \n",
" 18 | \n",
" 52 | \n",
" 22 | \n",
"
\n",
" \n",
" 3 | \n",
" 6 | \n",
" 6 | \n",
" 52 | \n",
" 22 | \n",
" 58 | \n",
" 28 | \n",
"
\n",
" \n",
" 4 | \n",
" 6 | \n",
" 0 | \n",
" 58 | \n",
" 28 | \n",
" 64 | \n",
" 28 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" n_Routes n_Transfers pointer_stop_routes pointer_transfers \\\n",
"stop_int \n",
"0 18 16 0 0 \n",
"1 11 2 18 16 \n",
"2 23 4 29 18 \n",
"3 6 6 52 22 \n",
"4 6 0 58 28 \n",
"\n",
" pointer_stop_routes_shift pointer_transfers_shift \n",
"stop_int \n",
"0 18 16 \n",
"1 29 18 \n",
"2 52 22 \n",
"3 58 28 \n",
"4 64 28 "
]
},
- "execution_count": 68,
+ "execution_count": 98,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"distinct_route_transfers[\"pointer_stop_routes_shift\"] = distinct_route_transfers['pointer_stop_routes'].shift(-1, fill_value=0)\n",
"distinct_route_transfers[\"pointer_transfers_shift\"] = distinct_route_transfers['pointer_transfers'].shift(-1, fill_value=0)\n",
"distinct_route_transfers.head(5)"
]
},
{
"cell_type": "code",
- "execution_count": 69,
+ "execution_count": 99,
"metadata": {},
"outputs": [],
"source": [
"distinct_route_transfers['pointer_stop_routes'] = np.where((distinct_route_transfers[\"pointer_stop_routes\"] == distinct_route_transfers[\"pointer_stop_routes_shift\"]), None, distinct_route_transfers['pointer_stop_routes'])\n",
"distinct_route_transfers['pointer_transfers'] = np.where((distinct_route_transfers[\"pointer_transfers\"] == distinct_route_transfers[\"pointer_transfers_shift\"]), None, distinct_route_transfers['pointer_transfers'])\n"
]
},
{
"cell_type": "code",
- "execution_count": 70,
+ "execution_count": 100,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"n_Routes False\n",
"n_Transfers False\n",
"pointer_stop_routes False\n",
"pointer_transfers True\n",
"pointer_stop_routes_shift False\n",
"pointer_transfers_shift False\n",
"dtype: bool"
]
},
- "execution_count": 70,
+ "execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"distinct_route_transfers.isna().any()"
]
},
{
"cell_type": "code",
- "execution_count": 71,
+ "execution_count": 101,
"metadata": {},
"outputs": [],
"source": [
"stops_df = distinct_route_transfers[['pointer_stop_routes', 'pointer_transfers']]"
]
},
{
"cell_type": "code",
- "execution_count": 72,
+ "execution_count": 102,
"metadata": {},
"outputs": [],
"source": [
"with open('../data/stops_df.pkl','wb') as f: pickle.dump(stops_df, f)"
]
},
{
"cell_type": "code",
- "execution_count": 73,
+ "execution_count": 103,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0, 0],\n",
" [18, 16],\n",
" [29, 18],\n",
" ...,\n",
" [15329, 6322],\n",
" [15334, 6329],\n",
" [15339, 6334]], dtype=object)"
]
},
- "execution_count": 73,
+ "execution_count": 103,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stops_array = stops_df.to_numpy()\n",
"stops_array"
]
},
{
"cell_type": "code",
- "execution_count": 74,
+ "execution_count": 104,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1407"
]
},
- "execution_count": 74,
+ "execution_count": 104,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.size(stops_array, 0)"
]
},
{
"cell_type": "code",
- "execution_count": 75,
+ "execution_count": 105,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1407, 2)"
]
},
- "execution_count": 75,
+ "execution_count": 105,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stops_array.shape"
]
},
{
"cell_type": "code",
- "execution_count": 76,
+ "execution_count": 106,
"metadata": {},
"outputs": [],
"source": [
"with open('../data/stops_array_cyril.pkl','wb') as f: pickle.dump(stops_array, f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"StopRoutes: [stop0_route1, stop0_route3, stop1_route1, stop2_route1, stop1_route4, …]"
]
},
{
"cell_type": "code",
- "execution_count": 77,
+ "execution_count": 107,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" index | \n",
" route_int | \n",
" stop_int | \n",
" stop_id | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 87163 | \n",
" 82 | \n",
" 0 | \n",
" 8503088:0:22 | \n",
"
\n",
" \n",
" 1 | \n",
" 95543 | \n",
" 129 | \n",
" 0 | \n",
" 8503088:0:21 | \n",
"
\n",
" \n",
" 2 | \n",
" 129332 | \n",
" 187 | \n",
" 0 | \n",
" 8503088:0:21 | \n",
"
\n",
" \n",
" 3 | \n",
" 73848 | \n",
" 211 | \n",
" 0 | \n",
" 8503088:0:22 | \n",
"
\n",
" \n",
" 4 | \n",
" 147285 | \n",
" 251 | \n",
" 0 | \n",
" 8503088:0:21 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" index route_int stop_int stop_id\n",
"0 87163 82 0 8503088:0:22\n",
"1 95543 129 0 8503088:0:21\n",
"2 129332 187 0 8503088:0:21\n",
"3 73848 211 0 8503088:0:22\n",
"4 147285 251 0 8503088:0:21"
]
},
- "execution_count": 77,
+ "execution_count": 107,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stop_routes = stop_times_ordered[[\"route_int\", \"stop_int\", \"stop_id\"]].drop_duplicates().sort_values([\"stop_int\", \"route_int\"])\n",
"stop_routes = stop_routes.reset_index()\n",
"stop_routes.head(5)"
]
},
{
"cell_type": "code",
- "execution_count": 78,
+ "execution_count": 108,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(15486, 4)"
]
},
- "execution_count": 78,
+ "execution_count": 108,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stop_routes.shape"
]
},
{
"cell_type": "code",
- "execution_count": 79,
+ "execution_count": 109,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"249"
]
},
- "execution_count": 79,
+ "execution_count": 109,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stop_times_curated.route_id.nunique()"
]
},
{
"cell_type": "code",
- "execution_count": 80,
+ "execution_count": 110,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1461"
]
},
- "execution_count": 80,
+ "execution_count": 110,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stop_routes.route_int.nunique()"
]
},
{
"cell_type": "code",
- "execution_count": 81,
+ "execution_count": 111,
"metadata": {},
"outputs": [],
"source": [
"with open('../data/stop_routes_df_cyril.pkl','wb') as f: pickle.dump(stop_routes, f)"
]
},
{
"cell_type": "code",
- "execution_count": 82,
+ "execution_count": 112,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 82, 129, 187, ..., 855, 977, 1087])"
]
},
- "execution_count": 82,
+ "execution_count": 112,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stop_routes_array = stop_routes[\"route_int\"].to_numpy()\n",
"stop_routes_array"
]
},
{
"cell_type": "code",
- "execution_count": 83,
+ "execution_count": 113,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"15486"
]
},
- "execution_count": 83,
+ "execution_count": 113,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.size(stop_routes_array, 0)"
]
},
{
"cell_type": "code",
- "execution_count": 84,
+ "execution_count": 114,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(15486,)"
]
},
- "execution_count": 84,
+ "execution_count": 114,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stop_routes_array.shape"
]
},
{
"cell_type": "code",
- "execution_count": 85,
+ "execution_count": 115,
"metadata": {},
"outputs": [],
"source": [
"with open('../data/stop_routes_array_cyril.pkl','wb') as f: pickle.dump(stop_routes_array, f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Transfer: [[[stop0_nameTargetStop1, transferTime1], [stop0_nameTargetStop2, transferTime2],….], [stop1_nameTargetStop1, transferTime1], [stop1_nameTargetStop2, transferTime2],….],…]"
]
},
{
"cell_type": "code",
- "execution_count": 86,
+ "execution_count": 116,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"9434"
]
},
- "execution_count": 86,
+ "execution_count": 116,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"transfers.stop_id.count()"
]
},
{
"cell_type": "code",
- "execution_count": 89,
+ "execution_count": 117,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" stop_int | \n",
" stop_int_2 | \n",
" Transfer_time_sec | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 8 | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
" 51 | \n",
" 564 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 274 | \n",
" 441 | \n",
"
\n",
" \n",
" 3 | \n",
" 0 | \n",
" 375 | \n",
" 594 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 462 | \n",
" 489 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" stop_int stop_int_2 Transfer_time_sec\n",
"0 0 0 8\n",
"1 0 51 564\n",
"2 0 274 441\n",
"3 0 375 594\n",
"4 0 462 489"
]
},
- "execution_count": 89,
+ "execution_count": 117,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"transfer_pandas = transfers[[\"stop_int\",\"stop_int_2\", \"Transfer_time_sec\"]].sort_values([\"stop_int\", \"stop_int_2\"]).drop_duplicates([\"stop_int\", \"stop_int_2\"])\n",
"transfer_pandas = transfer_pandas.reset_index(drop=True)\n",
"transfer_pandas.head()"
]
},
{
"cell_type": "code",
- "execution_count": 90,
+ "execution_count": 118,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1342"
]
},
- "execution_count": 90,
+ "execution_count": 118,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"transfer_pandas.stop_int_2.nunique()"
]
},
{
"cell_type": "code",
- "execution_count": 91,
+ "execution_count": 119,
"metadata": {},
"outputs": [],
"source": [
"with open('../data/transfer_df_cyril.pkl','wb') as f: pickle.dump(transfers.sort_values(\"stop_id\"), f)"
]
},
{
"cell_type": "code",
- "execution_count": 92,
+ "execution_count": 120,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0, 8],\n",
" [ 51, 564],\n",
" [ 274, 441],\n",
" ...,\n",
" [1120, 345],\n",
" [1266, 561],\n",
" [1406, 8]])"
]
},
- "execution_count": 92,
+ "execution_count": 120,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"transfer_array = transfer_pandas[[\"stop_int_2\", \"Transfer_time_sec\"]].to_numpy()\n",
"transfer_array"
]
},
{
"cell_type": "code",
- "execution_count": 93,
+ "execution_count": 121,
"metadata": {},
"outputs": [],
"source": [
"with open('../data/transfer_array_cyril.pkl','wb') as f: pickle.dump(transfer_array, f)"
]
},
{
"cell_type": "code",
- "execution_count": 94,
+ "execution_count": 122,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"6342"
]
},
- "execution_count": 94,
+ "execution_count": 122,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.size(transfer_array, 0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Check if indexes in stops is correct"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We see first the pointers"
]
},
{
"cell_type": "code",
- "execution_count": 95,
+ "execution_count": 123,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pointer_stop_routes | \n",
" pointer_transfers | \n",
"
\n",
" \n",
" stop_int | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 18 | \n",
" 16 | \n",
"
\n",
" \n",
" 2 | \n",
" 29 | \n",
" 18 | \n",
"
\n",
" \n",
" 3 | \n",
" 52 | \n",
" 22 | \n",
"
\n",
" \n",
" 4 | \n",
" 58 | \n",
" None | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pointer_stop_routes pointer_transfers\n",
"stop_int \n",
"0 0 0\n",
"1 18 16\n",
"2 29 18\n",
"3 52 22\n",
"4 58 None"
]
},
- "execution_count": 95,
+ "execution_count": 123,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stops_df.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We see that at the index 16 there should be a new stop. we check and it is false"
]
},
{
"cell_type": "code",
- "execution_count": 97,
+ "execution_count": 124,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" stop_int | \n",
" stop_int_2 | \n",
" Transfer_time_sec | \n",
"
\n",
" \n",
" \n",
" \n",
" 15 | \n",
" 0 | \n",
" 1289 | \n",
" 460 | \n",
"
\n",
" \n",
" 16 | \n",
" 1 | \n",
" 814 | \n",
" 267 | \n",
"
\n",
" \n",
" 17 | \n",
" 1 | \n",
" 1350 | \n",
" 569 | \n",
"
\n",
" \n",
" 18 | \n",
" 2 | \n",
" 38 | \n",
" 346 | \n",
"
\n",
" \n",
" 19 | \n",
" 2 | \n",
" 1062 | \n",
" 413 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" stop_int stop_int_2 Transfer_time_sec\n",
"15 0 1289 460\n",
"16 1 814 267\n",
"17 1 1350 569\n",
"18 2 38 346\n",
"19 2 1062 413"
]
},
- "execution_count": 97,
+ "execution_count": 124,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"transfer_pandas.loc[15:20].head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We see that at index 18 we should have a new stop. we check and it true"
]
},
{
"cell_type": "code",
- "execution_count": 99,
+ "execution_count": 125,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" index | \n",
" route_int | \n",
" stop_int | \n",
" stop_id | \n",
"
\n",
" \n",
" \n",
" \n",
" 15 | \n",
" 240179 | \n",
" 1039 | \n",
" 0 | \n",
" 8503088:0:21 | \n",
"
\n",
" \n",
" 16 | \n",
" 150919 | \n",
" 1078 | \n",
" 0 | \n",
" 8503088:0:21 | \n",
"
\n",
" \n",
" 17 | \n",
" 26670 | \n",
" 1316 | \n",
" 0 | \n",
" 8503088:0:21 | \n",
"
\n",
" \n",
" 18 | \n",
" 93857 | \n",
" 18 | \n",
" 1 | \n",
" 8502508 | \n",
"
\n",
" \n",
" 19 | \n",
" 236508 | \n",
" 114 | \n",
" 1 | \n",
" 8502508 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" index route_int stop_int stop_id\n",
"15 240179 1039 0 8503088:0:21\n",
"16 150919 1078 0 8503088:0:21\n",
"17 26670 1316 0 8503088:0:21\n",
"18 93857 18 1 8502508\n",
"19 236508 114 1 8502508"
]
},
- "execution_count": 99,
+ "execution_count": 125,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stop_routes.loc[15:20].head(5)"
]
},
{
"cell_type": "code",
- "execution_count": 106,
+ "execution_count": 126,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" index | \n",
" route_int | \n",
" stop_int | \n",
" stop_id | \n",
"
\n",
" \n",
" \n",
" \n",
" 7024 | \n",
" 108903 | \n",
" 382 | \n",
" 617 | \n",
" 8503006:0:5 | \n",
"
\n",
" \n",
" 8204 | \n",
" 108901 | \n",
" 382 | \n",
" 724 | \n",
" 8503011:0:2 | \n",
"
\n",
" \n",
" 12599 | \n",
" 108900 | \n",
" 382 | \n",
" 1138 | \n",
" 8503010:0:2 | \n",
"
\n",
" \n",
" 12940 | \n",
" 108902 | \n",
" 382 | \n",
" 1176 | \n",
" 8503000:0:33 | \n",
"
\n",
" \n",
" 13590 | \n",
" 108904 | \n",
" 382 | \n",
" 1218 | \n",
" 8503016:0:3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" index route_int stop_int stop_id\n",
"7024 108903 382 617 8503006:0:5\n",
"8204 108901 382 724 8503011:0:2\n",
"12599 108900 382 1138 8503010:0:2\n",
"12940 108902 382 1176 8503000:0:33\n",
"13590 108904 382 1218 8503016:0:3"
]
},
- "execution_count": 106,
+ "execution_count": 126,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stop_routes.loc[stop_routes['route_int'] == 382]"
]
},
{
"cell_type": "code",
- "execution_count": 108,
+ "execution_count": 127,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" index | \n",
" route_int | \n",
" stop_int | \n",
"
\n",
" \n",
" \n",
" \n",
" 4024 | \n",
" 108900 | \n",
" 382 | \n",
" 1138 | \n",
"
\n",
" \n",
" 4025 | \n",
" 108901 | \n",
" 382 | \n",
" 724 | \n",
"
\n",
" \n",
" 4026 | \n",
" 108902 | \n",
" 382 | \n",
" 1176 | \n",
"
\n",
" \n",
" 4027 | \n",
" 108903 | \n",
" 382 | \n",
" 617 | \n",
"
\n",
" \n",
" 4028 | \n",
" 108904 | \n",
" 382 | \n",
" 1218 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" index route_int stop_int\n",
"4024 108900 382 1138\n",
"4025 108901 382 724\n",
"4026 108902 382 1176\n",
"4027 108903 382 617\n",
"4028 108904 382 1218"
]
},
- "execution_count": 108,
+ "execution_count": 127,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"route_stops.loc[route_stops['route_int'] == 382]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"read files as pickles"
]
},
{
"cell_type": "code",
- "execution_count": 102,
+ "execution_count": 128,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "ename": "FileNotFoundError",
+ "evalue": "[Errno 2] No such file or directory: '../data/stop_times_array.pkl'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'../data/stop_times_array.pkl'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'rb'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0marrayname1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../data/stop_times_array.pkl'"
+ ]
+ }
+ ],
"source": [
- "with open('../data/stop_times_array.pkl','rb') as f: arrayname1 = pickle.load(f)"
+ "with open('../data/stop_times_array_cyril.pkl','rb') as f: arrayname1 = pickle.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {},
"outputs": [
{
"ename": "UnpicklingError",
"evalue": "invalid load key, 'v'.",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mUnpicklingError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'../data/routes_array.pkl'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'rb'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0marrayname2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mUnpicklingError\u001b[0m: invalid load key, 'v'."
]
}
],
"source": [
- "with open('../data/routes_array.pkl','rb') as f: arrayname2 = pickle.load(f)"
+ "with open('../data/routes_array_cyril.pkl','rb') as f: arrayname2 = pickle.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [
{
"ename": "UnpicklingError",
"evalue": "invalid load key, 'v'.",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mUnpicklingError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'../data/route_stops_array.pkl'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'rb'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0marrayname3\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mUnpicklingError\u001b[0m: invalid load key, 'v'."
]
}
],
"source": [
- "with open('../data/route_stops_array.pkl','rb') as f: arrayname3 = pickle.load(f)"
+ "with open('../data/route_stops_array_cyril.pkl','rb') as f: arrayname3 = pickle.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([['2020-05-22T07:00:00.000000000', '2020-05-22T07:01:00.000000000'],\n",
" ['2020-05-22T07:02:00.000000000', 'NaT'],\n",
" ['2020-05-22T07:00:00.000000000', '2020-05-22T07:00:00.000000000'],\n",
" ...,\n",
" ['2020-05-22T07:35:00.000000000', '2020-05-22T07:35:00.000000000'],\n",
" ['2020-05-22T07:36:00.000000000', '2020-05-22T07:36:00.000000000'],\n",
" ['2020-05-22T07:37:00.000000000', 'NaT']],\n",
" dtype='datetime64[ns]')"
]
},
"execution_count": 105,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"arrayname1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"arrayname2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"arrayname3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
diff --git a/notebooks/transfer_to_local.ipynb b/notebooks/transfer_to_local.ipynb
index c9649d5..8196c42 100644
--- a/notebooks/transfer_to_local.ipynb
+++ b/notebooks/transfer_to_local.ipynb
@@ -1,261 +1,261 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Transfer files from HDFS to local\n",
"\n",
"The `%%configure` cell below sets `spark.app.name`: any application without a proper name will be promptly killed."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"Current session configs: {'conf': {'spark.app.name': 'lgptguys_final'}, 'kind': 'pyspark'}
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
- "ID | YARN Application ID | Kind | State | Spark UI | Driver log | Current session? |
---|
7704 | application_1589299642358_2200 | pyspark | idle | Link | Link | |
7735 | application_1589299642358_2231 | pyspark | idle | Link | Link | |
7737 | application_1589299642358_2233 | pyspark | idle | Link | Link | |
7739 | application_1589299642358_2235 | pyspark | dead | Link | Link | |
7743 | application_1589299642358_2239 | pyspark | idle | Link | Link | |
7745 | application_1589299642358_2241 | pyspark | idle | Link | Link | |
7750 | application_1589299642358_2246 | pyspark | busy | Link | Link | |
7753 | application_1589299642358_2249 | pyspark | idle | Link | Link | |
7756 | application_1589299642358_2252 | pyspark | idle | Link | Link | |
7759 | application_1589299642358_2255 | pyspark | busy | Link | Link | |
7760 | application_1589299642358_2256 | pyspark | idle | Link | Link | |
7761 | application_1589299642358_2257 | pyspark | idle | Link | Link | |
7762 | application_1589299642358_2258 | pyspark | idle | Link | Link | |
7764 | application_1589299642358_2260 | pyspark | idle | Link | Link | |
7767 | application_1589299642358_2263 | pyspark | idle | Link | Link | |
7768 | application_1589299642358_2264 | pyspark | idle | Link | Link | |
7770 | application_1589299642358_2266 | pyspark | idle | Link | Link | |
7772 | application_1589299642358_2268 | pyspark | idle | Link | Link | |
7773 | application_1589299642358_2269 | pyspark | idle | Link | Link | |
7774 | application_1589299642358_2270 | pyspark | idle | Link | Link | |
7775 | application_1589299642358_2272 | pyspark | idle | Link | Link | |
7776 | application_1589299642358_2273 | pyspark | idle | Link | Link | |
7777 | application_1589299642358_2274 | pyspark | idle | Link | Link | |
7778 | application_1589299642358_2275 | pyspark | idle | Link | Link | |
7779 | application_1589299642358_2276 | pyspark | busy | Link | Link | |
7780 | application_1589299642358_2277 | pyspark | idle | Link | Link | |
7781 | application_1589299642358_2278 | pyspark | busy | Link | Link | |
"
+ "ID | YARN Application ID | Kind | State | Spark UI | Driver log | Current session? |
---|
7932 | application_1589299642358_2450 | pyspark | idle | Link | Link | |
7933 | application_1589299642358_2451 | pyspark | idle | Link | Link | |
7935 | application_1589299642358_2453 | pyspark | idle | Link | Link | |
7939 | application_1589299642358_2457 | pyspark | idle | Link | Link | |
7940 | application_1589299642358_2458 | pyspark | idle | Link | Link | |
7941 | application_1589299642358_2459 | pyspark | idle | Link | Link | |
7942 | application_1589299642358_2460 | pyspark | idle | Link | Link | |
7944 | application_1589299642358_2462 | pyspark | idle | Link | Link | |
7945 | application_1589299642358_2463 | pyspark | dead | Link | Link | |
7946 | application_1589299642358_2464 | pyspark | idle | Link | Link | |
7947 | application_1589299642358_2465 | pyspark | idle | Link | Link | |
7948 | application_1589299642358_2466 | pyspark | idle | Link | Link | |
7949 | application_1589299642358_2467 | pyspark | idle | Link | Link | |
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%%configure\n",
"{\"conf\": {\n",
" \"spark.app.name\": \"lgptguys_final\"\n",
"}}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Start Spark"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Starting Spark application\n"
]
},
{
"data": {
"text/html": [
"\n",
- "ID | YARN Application ID | Kind | State | Spark UI | Driver log | Current session? |
---|
7782 | application_1589299642358_2279 | pyspark | idle | Link | Link | ✔ |
"
+ "ID | YARN Application ID | Kind | State | Spark UI | Driver log | Current session? |
---|
7950 | application_1589299642358_2468 | pyspark | idle | Link | Link | ✔ |
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"SparkSession available as 'spark'.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"An error was encountered:\n",
"unknown magic command '%spark'\n",
"UnknownMagic: unknown magic command '%spark'\n",
"\n"
]
}
],
"source": [
"# Initialization\n",
"%%spark"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Transfer and save tables from HDFS to local\n",
"\n",
"Here we describe the process of loading a table from HDFS, re-saving it under a user directory so that it can be read from the local kernel, and then downloading it and saving it locally.\n",
"\n",
"First, we read the data from an HDFS location that is otherwise not accessible locally and write it to the user's HDFS directory:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"username = 'tturner'\n",
"\n",
"my_files = ['stop_times_curated.csv',\n",
" 'stops_15km.csv', 'transfers.csv', 'stop_times_final_cyril.csv']\n",
"\n",
"for file in my_files:\n",
" this_file = spark.read.csv('data/lgpt_guys/{}'.format(file), \\\n",
" header = True) \n",
" this_file.write.csv(\"/user/{0}/{1}\".format(username, file.replace('.csv','')), \\\n",
" header = True, mode = 'overwrite')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/work/final_project/notebooks\n",
"stop_times_curated\n",
"stops_15km\n",
"transfers\n",
"stop_times_final_cyril\n"
]
}
],
"source": [
"%local\n",
"\n",
"from hdfs3 import HDFileSystem\n",
"import pandas as pd\n",
"import numpy as np \n",
"import os\n",
"print(os.getcwd())\n",
"\n",
"hdfs = HDFileSystem(host='hdfs://iccluster044.iccluster.epfl.ch', port=8020, user='ebouille')\n",
"\n",
"username = 'tturner'\n",
"\n",
"my_folders = ['stop_times_curated', \n",
" 'stops_15km', 'transfers', 'stop_times_final_cyril']\n",
"\n",
"for folder in my_folders:\n",
" print(folder)\n",
" array_files = hdfs.glob('/user/{0}/{1}/*.csv'.format(username, folder))\n",
" array = pd.DataFrame()\n",
" for file in array_files:\n",
" with hdfs.open(file) as f:\n",
" array = array.append(pd.read_csv(f))\n",
"\n",
" array.to_csv('../data/{}.csv'.format(folder), header=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "PySpark",
"language": "",
"name": "pysparkkernel"
},
"language_info": {
"codemirror_mode": {
"name": "python",
"version": 3
},
"mimetype": "text/x-python",
"name": "pyspark",
"pygments_lexer": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}