diff --git a/data/stop_times_array.pkl b/data/stop_times_array.pkl index b54a600..f7b3d65 100644 --- a/data/stop_times_array.pkl +++ b/data/stop_times_array.pkl @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9a5b61142e94d00f3e247423d80655e35601baffb7d63810446145e37bcdf775 +oid sha256:96ec84b146cfb13e91b74cabe6522ba18bfc984f045eeea0804fb231b8e507ac size 3931993 diff --git a/data/stop_times_df.pkl b/data/stop_times_df.pkl index a230267..53b366a 100644 --- a/data/stop_times_df.pkl +++ b/data/stop_times_df.pkl @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:83ef53159dcc6ed2394199f68d68424290b9ee10981ada263dee4c6b0282e1ab -size 21899011 +oid sha256:0ef6f71d9ceb1cdf805a3ff80b2f80b80b73a40398832a926361c0d8face79cd +size 24436035 diff --git a/notebooks/Arrays_to_pickle.ipynb b/notebooks/Arrays_to_pickle.ipynb index 4eb5ec2..bc22c1f 100644 --- a/notebooks/Arrays_to_pickle.ipynb +++ b/notebooks/Arrays_to_pickle.ipynb @@ -1,6661 +1,6917 @@ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Transform csv preproccesed files to pickle" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Import packages" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import pickle\n", "import itertools" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read files\n", "Before running make sure the .csv files are in /data . If not run notebook \"transfer_to_local\"" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0trip_idstop_idarrival_timedeparture_timestop_sequencepickup_typedrop_off_typehour_departureroute_iddirection_id
00666.TA.26-4-j19-1.20.R857618207:02:0007:02:001007.026-4-j19-11
11243.TA.26-311-j19-1.3.R859083407:16:0007:16:001007.026-311-j19-11
22406.TA.26-62-j19-1.3.R859134907:24:0007:24:001007.026-62-j19-11
3362.TA.57-2-Y-j19-1.33.H8503000:0:1307:34:0007:34:001007.057-2-Y-j19-10
441179.TA.26-5-B-j19-1.23.R859124507:36:0007:36:001007.026-5-B-j19-11
\n", "
" ], "text/plain": [ " Unnamed: 0 trip_id stop_id arrival_time \\\n", "0 0 666.TA.26-4-j19-1.20.R 8576182 07:02:00 \n", "1 1 243.TA.26-311-j19-1.3.R 8590834 07:16:00 \n", "2 2 406.TA.26-62-j19-1.3.R 8591349 07:24:00 \n", "3 3 62.TA.57-2-Y-j19-1.33.H 8503000:0:13 07:34:00 \n", "4 4 1179.TA.26-5-B-j19-1.23.R 8591245 07:36:00 \n", "\n", " departure_time stop_sequence pickup_type drop_off_type hour_departure \\\n", "0 07:02:00 1 0 0 7.0 \n", "1 07:16:00 1 0 0 7.0 \n", "2 07:24:00 1 0 0 7.0 \n", "3 07:34:00 1 0 0 7.0 \n", "4 07:36:00 1 0 0 7.0 \n", "\n", " route_id direction_id \n", "0 26-4-j19-1 1 \n", "1 26-311-j19-1 1 \n", "2 26-62-j19-1 1 \n", "3 57-2-Y-j19-1 0 \n", "4 26-5-B-j19-1 1 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#stop_times\n", "stop_times_curated = pd.read_csv(\"../data/stop_times_curated.csv\")\n", "stop_times_curated.head(5)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0stop_idstop_id2distanceTransfer_time_sec
00850092685906160.122430146
11850092685907370.300175360
2285021868502186:0:10.0067628
3385021868502186:0:20.01352416
4485021868502186P0.0000000
\n", "
" ], "text/plain": [ " Unnamed: 0 stop_id stop_id2 distance Transfer_time_sec\n", "0 0 8500926 8590616 0.122430 146\n", "1 1 8500926 8590737 0.300175 360\n", "2 2 8502186 8502186:0:1 0.006762 8\n", "3 3 8502186 8502186:0:2 0.013524 16\n", "4 4 8502186 8502186P 0.000000 0" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#transfers\n", "transfers = pd.read_csv(\"../data/transfers.csv\")\n", "transfers.head(5)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0route_iddirection_idroute_id_directionalagency_idroute_typeroute_descroute_short_name_directionalroute_long_name_directional
001-1-E-j19-101-1-E-j19-1-0886700Bus1-0NaN
111-44-j19-111-44-j19-1-1886700Bus44-1NaN
2226-453-j19-1026-453-j19-1-0773700Bus453-0NaN
3379-373-2-j19-1179-373-2-j19-1-11941000Schiff3732-1NaN
4426-744-j19-1026-744-j19-1-0849700Bus744-0NaN
\n", "
" ], "text/plain": [ " Unnamed: 0 route_id direction_id route_id_directional agency_id \\\n", "0 0 1-1-E-j19-1 0 1-1-E-j19-1-0 886 \n", "1 1 1-44-j19-1 1 1-44-j19-1-1 886 \n", "2 2 26-453-j19-1 0 26-453-j19-1-0 773 \n", "3 3 79-373-2-j19-1 1 79-373-2-j19-1-1 194 \n", "4 4 26-744-j19-1 0 26-744-j19-1-0 849 \n", "\n", " route_type route_desc route_short_name_directional \\\n", "0 700 Bus 1-0 \n", "1 700 Bus 44-1 \n", "2 700 Bus 453-0 \n", "3 1000 Schiff 3732-1 \n", "4 700 Bus 744-0 \n", "\n", " route_long_name_directional \n", "0 NaN \n", "1 NaN \n", "2 NaN \n", "3 NaN \n", "4 NaN " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#routes\n", "routes_curated = pd.read_csv(\"../data/routes_curated.csv\")\n", "routes_curated.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create stop_id same for all platforms" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "#copy information stop_id with platform in stop_id_raw\n", "stop_times_curated[\"stop_id_raw\"] = stop_times_curated[\"stop_id\"]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0trip_idstop_idarrival_timedeparture_timestop_sequencepickup_typedrop_off_typehour_departureroute_iddirection_idstop_id_raw
00666.TA.26-4-j19-1.20.R857618207:02:0007:02:001007.026-4-j19-118576182
11243.TA.26-311-j19-1.3.R859083407:16:0007:16:001007.026-311-j19-118590834
22406.TA.26-62-j19-1.3.R859134907:24:0007:24:001007.026-62-j19-118591349
3362.TA.57-2-Y-j19-1.33.H850300007:34:0007:34:001007.057-2-Y-j19-108503000:0:13
441179.TA.26-5-B-j19-1.23.R859124507:36:0007:36:001007.026-5-B-j19-118591245
\n", "
" ], "text/plain": [ " Unnamed: 0 trip_id stop_id arrival_time departure_time \\\n", "0 0 666.TA.26-4-j19-1.20.R 8576182 07:02:00 07:02:00 \n", "1 1 243.TA.26-311-j19-1.3.R 8590834 07:16:00 07:16:00 \n", "2 2 406.TA.26-62-j19-1.3.R 8591349 07:24:00 07:24:00 \n", "3 3 62.TA.57-2-Y-j19-1.33.H 8503000 07:34:00 07:34:00 \n", "4 4 1179.TA.26-5-B-j19-1.23.R 8591245 07:36:00 07:36:00 \n", "\n", " stop_sequence pickup_type drop_off_type hour_departure route_id \\\n", "0 1 0 0 7.0 26-4-j19-1 \n", "1 1 0 0 7.0 26-311-j19-1 \n", "2 1 0 0 7.0 26-62-j19-1 \n", "3 1 0 0 7.0 57-2-Y-j19-1 \n", "4 1 0 0 7.0 26-5-B-j19-1 \n", "\n", " direction_id stop_id_raw \n", "0 1 8576182 \n", "1 1 8590834 \n", "2 1 8591349 \n", "3 0 8503000:0:13 \n", "4 1 8591245 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Use only first 7 characters for stop_id\n", "stop_times_curated[\"stop_id\"] = stop_times_curated[\"stop_id_raw\"].str.slice(0, 7)\n", "stop_times_curated[\"stop_id\"] = pd.to_numeric(stop_times_curated[\"stop_id\"])\n", "stop_times_curated.head(5)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "#copy information stop_id with platform in stop_id_raw\n", "transfers[\"stop_id_raw\"] = transfers[\"stop_id\"]\n", "transfers[\"stop_id2_raw\"] = transfers[\"stop_id2\"]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0stop_idstop_id2distanceTransfer_time_secstop_id_rawstop_id2_raw
00850092685906160.12243014685009268590616
11850092685907370.30017536085009268590737
22850218685021860.006762885021868502186:0:1
33850218685021860.0135241685021868502186:0:2
44850218685021860.000000085021868502186P
\n", "
" ], "text/plain": [ " Unnamed: 0 stop_id stop_id2 distance Transfer_time_sec stop_id_raw \\\n", "0 0 8500926 8590616 0.122430 146 8500926 \n", "1 1 8500926 8590737 0.300175 360 8500926 \n", "2 2 8502186 8502186 0.006762 8 8502186 \n", "3 3 8502186 8502186 0.013524 16 8502186 \n", "4 4 8502186 8502186 0.000000 0 8502186 \n", "\n", " stop_id2_raw \n", "0 8590616 \n", "1 8590737 \n", "2 8502186:0:1 \n", "3 8502186:0:2 \n", "4 8502186P " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Use only first 7 characters for stop_id\n", "transfers[\"stop_id\"] = transfers[\"stop_id_raw\"].str.slice(0, 7)\n", "transfers[\"stop_id2\"] = transfers[\"stop_id2_raw\"].str.slice(0, 7)\n", "transfers[\"stop_id\"] = pd.to_numeric(transfers[\"stop_id\"])\n", "transfers[\"stop_id2\"] = pd.to_numeric(transfers[\"stop_id2\"])\n", "transfers.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Solve problem in hour_departure" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can see that hour departure does not contain times above 23, so we need to correct this" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,\n", " nan])" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_curated.hour_departure.unique()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17',\n", " '18', '19', '24', '25'], dtype=object)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_curated.departure_time.str.slice(0,2).unique()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "stop_times_curated[\"hour_departure\"] = pd.to_numeric(stop_times_curated.departure_time.str.slice(0,2))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check if well converted to int" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 24, 25])" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_curated[\"hour_departure\"].unique()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We drop the trips before 7 am and after 7 pm" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "250777" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_curated.trip_id.count()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "trip_id_drop = np.where(((stop_times_curated.hour_departure > 19) |\\\n", " (stop_times_curated.hour_departure < 7)),\\\n", " stop_times_curated[\"trip_id\"] , None)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "stop_times_curated = stop_times_curated[~stop_times_curated[\"trip_id\"].isin(trip_id_drop)]" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "246514" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_curated.trip_id.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Delete trips with 1 stop\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We drop columns not useful to us" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "stop_times_curated = stop_times_curated.drop(columns=[\"Unnamed: 0\", \"hour_departure\", \"drop_off_type\", \"pickup_type\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "And we delete trips with only 1 stop. We start by counting the stops of each trip" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
trip_idstop_idarrival_timedeparture_timestop_sequenceroute_iddirection_idstop_id_raw
trip_id
1.TA.1-231-j19-1.1.H1151717181115
1.TA.1-44-j19-1.1.R13333113
1.TA.1-444-j19-1.1.H19999119
1.TA.12-E03-j19-1.1.H12222112
1.TA.18-46-j19-1.1.H11111111
\n", "
" ], "text/plain": [ " trip_id stop_id arrival_time departure_time \\\n", "trip_id \n", "1.TA.1-231-j19-1.1.H 1 15 17 17 \n", "1.TA.1-44-j19-1.1.R 1 3 3 3 \n", "1.TA.1-444-j19-1.1.H 1 9 9 9 \n", "1.TA.12-E03-j19-1.1.H 1 2 2 2 \n", "1.TA.18-46-j19-1.1.H 1 1 1 1 \n", "\n", " stop_sequence route_id direction_id stop_id_raw \n", "trip_id \n", "1.TA.1-231-j19-1.1.H 18 1 1 15 \n", "1.TA.1-44-j19-1.1.R 3 1 1 3 \n", "1.TA.1-444-j19-1.1.H 9 1 1 9 \n", "1.TA.12-E03-j19-1.1.H 2 1 1 2 \n", "1.TA.18-46-j19-1.1.H 1 1 1 1 " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "number_stop = stop_times_curated.groupby('trip_id').nunique()\n", "number_stop.head(5)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "#get trips with 1 stop\n", "trip_with_1_stop = np.where((number_stop.stop_id == 1), number_stop.index, None)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check number of trips before cleaning" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "19444" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_curated.trip_id.nunique()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We drop the rows with a unique stop per trip" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "#drop trips with only 1 stop\n", "stop_times_curated = stop_times_curated[~stop_times_curated[\"trip_id\"].isin(trip_with_1_stop)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "And we check how many trips there still. Wee that an important number of trips has been deleted" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "18668" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_curated.trip_id.nunique()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Add information on transport type\n", - "To do" + "We add the information of type of transport to the StopTimes dataframe" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(245738, 8)" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stop_times_curated.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(521, 2)" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "routes_curated[[\"route_id\", \"route_desc\"]].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", " \n", " \n", "
Unnamed: 0route_iddirection_idroute_id_directionalagency_idroute_typeroute_descroute_short_name_directionalroute_long_name_directional
001-1-E-j19-101-1-E-j19-1-0886700Bus1-0NaN
111-44-j19-111-44-j19-1-1886700Bus44-1NaN
2226-453-j19-1026-453-j19-1-0773700Bus453-0NaN
3379-373-2-j19-1179-373-2-j19-1-11941000Schiff3732-1NaN
4426-744-j19-1026-744-j19-1-0849700Bus744-0NaN
\n", "
" ], "text/plain": [ - " Unnamed: 0 route_id direction_id route_id_directional agency_id \\\n", - "0 0 1-1-E-j19-1 0 1-1-E-j19-1-0 886 \n", - "1 1 1-44-j19-1 1 1-44-j19-1-1 886 \n", - "2 2 26-453-j19-1 0 26-453-j19-1-0 773 \n", - "3 3 79-373-2-j19-1 1 79-373-2-j19-1-1 194 \n", - "4 4 26-744-j19-1 0 26-744-j19-1-0 849 \n", - "\n", - " route_type route_desc route_short_name_directional \\\n", - "0 700 Bus 1-0 \n", - "1 700 Bus 44-1 \n", - "2 700 Bus 453-0 \n", - "3 1000 Schiff 3732-1 \n", - "4 700 Bus 744-0 \n", - "\n", - " route_long_name_directional \n", - "0 NaN \n", - "1 NaN \n", - "2 NaN \n", - "3 NaN \n", - "4 NaN " + " route_id route_desc\n", + "0 1-1-E-j19-1 Bus\n", + "1 1-44-j19-1 Bus\n", + "2 26-453-j19-1 Bus\n", + "3 79-373-2-j19-1 Schiff\n", + "4 26-744-j19-1 Bus" ] }, - "execution_count": 23, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "routes_curated.head(5)" + "routes_curated[[\"route_id\", \"route_desc\"]].head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "stop_times_route_info = stop_times_curated.join(routes_curated[[\"route_id\", \"route_desc\"]].set_index(\"route_id\"), on=\"route_id\", how=\"inner\").drop_duplicates()" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(245738, 9)" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stop_times_route_info.shape" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ - "stop_times_curated = stop_times_curated.join(routes_curated[[\"route_id\", \"route_desc\"]].set_index(\"route_id\"), on=\"route_id\", how=\"left\")" + "stop_times_curated = stop_times_route_info" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", "
trip_idstop_idarrival_timedeparture_timestop_sequenceroute_iddirection_idstop_id_rawroute_desc
0666.TA.26-4-j19-1.20.R857618207:02:0007:02:00126-4-j19-118576182Tram
0666.TA.26-4-j19-1.20.R857618207:02:0007:02:00531175.TA.26-4-j19-1.25.H859105715:31:0015:31:00126-4-j19-11857618208591057Tram
1243.TA.26-311-j19-1.3.R859083407:16:0007:16:00126-311-j19-1671163.TA.26-4-j19-1.25.H859105716:44:0016:44:0018590834Bus26-4-j19-108591057Tram
1243.TA.26-311-j19-1.3.R859083407:16:0007:16:00126-311-j19-118590834Bus2501014.TA.26-4-j19-1.25.H859128210:37:0010:37:001226-4-j19-108591282Tram
2406.TA.26-62-j19-1.3.R859134907:24:0007:24:00126-62-j19-1288586.TA.26-4-j19-1.20.R858734911:56:0011:56:001326-4-j19-118591349Bus8587349Tram
\n", "
" ], "text/plain": [ - " trip_id stop_id arrival_time departure_time \\\n", - "0 666.TA.26-4-j19-1.20.R 8576182 07:02:00 07:02:00 \n", - "0 666.TA.26-4-j19-1.20.R 8576182 07:02:00 07:02:00 \n", - "1 243.TA.26-311-j19-1.3.R 8590834 07:16:00 07:16:00 \n", - "1 243.TA.26-311-j19-1.3.R 8590834 07:16:00 07:16:00 \n", - "2 406.TA.26-62-j19-1.3.R 8591349 07:24:00 07:24:00 \n", + " trip_id stop_id arrival_time departure_time \\\n", + "0 666.TA.26-4-j19-1.20.R 8576182 07:02:00 07:02:00 \n", + "53 1175.TA.26-4-j19-1.25.H 8591057 15:31:00 15:31:00 \n", + "67 1163.TA.26-4-j19-1.25.H 8591057 16:44:00 16:44:00 \n", + "250 1014.TA.26-4-j19-1.25.H 8591282 10:37:00 10:37:00 \n", + "288 586.TA.26-4-j19-1.20.R 8587349 11:56:00 11:56:00 \n", "\n", - " stop_sequence route_id direction_id stop_id_raw route_desc \n", - "0 1 26-4-j19-1 1 8576182 Tram \n", - "0 1 26-4-j19-1 1 8576182 Tram \n", - "1 1 26-311-j19-1 1 8590834 Bus \n", - "1 1 26-311-j19-1 1 8590834 Bus \n", - "2 1 26-62-j19-1 1 8591349 Bus " + " stop_sequence route_id direction_id stop_id_raw route_desc \n", + "0 1 26-4-j19-1 1 8576182 Tram \n", + "53 1 26-4-j19-1 0 8591057 Tram \n", + "67 1 26-4-j19-1 0 8591057 Tram \n", + "250 12 26-4-j19-1 0 8591282 Tram \n", + "288 13 26-4-j19-1 1 8587349 Tram " ] }, - "execution_count": 25, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_curated.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create route_int, trip_int and stop_int as consecutive integer IDs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Route_int\n", "The route_int Id is given in an abitrary order" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We start creating a tuple with all the stops in a trip" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
stop_id
trip_id
1.TA.1-231-j19-1.1.H(8583071, 8583071, 8502553, 8502553, 8572600, ...(8583071, 8502553, 8572600, 8573722, 8572747, ...
1.TA.1-44-j19-1.1.R(8590279, 8590279, 8590275, 8590275, 8591891, ...(8590279, 8590275, 8591891)
1.TA.1-444-j19-1.1.H(8591365, 8591365, 8502979, 8502979, 8591059, ...(8591365, 8502979, 8591059, 8581346, 8572596, ...
1.TA.12-E03-j19-1.1.H(8573205, 8573205, 8596126, 8596126)(8573205, 8596126)
1.TA.21-23-j19-1.1.R(8503003, 8503003, 8503000, 8503000)(8503003, 8503000)
\n", "
" ], "text/plain": [ " stop_id\n", "trip_id \n", - "1.TA.1-231-j19-1.1.H (8583071, 8583071, 8502553, 8502553, 8572600, ...\n", - "1.TA.1-44-j19-1.1.R (8590279, 8590279, 8590275, 8590275, 8591891, ...\n", - "1.TA.1-444-j19-1.1.H (8591365, 8591365, 8502979, 8502979, 8591059, ...\n", - "1.TA.12-E03-j19-1.1.H (8573205, 8573205, 8596126, 8596126)\n", - "1.TA.21-23-j19-1.1.R (8503003, 8503003, 8503000, 8503000)" + "1.TA.1-231-j19-1.1.H (8583071, 8502553, 8572600, 8573722, 8572747, ...\n", + "1.TA.1-44-j19-1.1.R (8590279, 8590275, 8591891)\n", + "1.TA.1-444-j19-1.1.H (8591365, 8502979, 8591059, 8581346, 8572596, ...\n", + "1.TA.12-E03-j19-1.1.H (8573205, 8596126)\n", + "1.TA.21-23-j19-1.1.R (8503003, 8503000)" ] }, - "execution_count": 26, + "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#group stops into a sequence\n", "tuple_stops = stop_times_curated.groupby('trip_id')['stop_id'].apply(tuple).to_frame()\n", "tuple_stops.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "And we can group all these sequences in unique groups" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
stop_id
(8500926, 8500926, 8590602, 8590602, 8595511, 8595511, 8590737, 8590737, 8591829, 8591829, 8590618, 8590618, 8587020, 8587020, 8590739, 8590739, 8590614, 8590614, 8590603, 8590603, 8590615, 8590615)(8500926, 8590602, 8595511, 8590737, 8591829, 8590618, 8587020, 8590739, 8590614, 8590603, 8590615)
(8500926, 8500926, 8590603, 8590603, 8587020, 8587020, 8590615, 8590615, 8590739, 8590739, 8591829, 8591829, 8590602, 8590602, 8590618, 8590618, 8590737, 8590737, 8595511, 8595511, 8590614, 8590614)(8500926, 8590603, 8587020, 8590615, 8590739, 8591829, 8590602, 8590618, 8590737, 8595511, 8590614)
(8500926, 8500926, 8590603, 8590603, 8590739, 8590739, 8595511, 8595511, 8590614, 8590614, 8590737, 8590737, 8590618, 8590618, 8587020, 8587020, 8591829, 8591829, 8590615, 8590615, 8590602, 8590602)(8500926, 8590603, 8590739, 8595511, 8590614, 8590737, 8590618, 8587020, 8591829, 8590615, 8590602)
(8500926, 8500926, 8590618, 8590618, 8590737, 8590737, 8590739, 8590739, 8587020, 8587020, 8590602, 8590602, 8590615, 8590615, 8595511, 8595511, 8591829, 8591829, 8590614, 8590614, 8590603, 8590603)(8500926, 8590618, 8590737, 8590739, 8587020, 8590602, 8590615, 8595511, 8591829, 8590614, 8590603)
(8500926, 8500926, 8590737, 8590737, 8590603, 8590603, 8590614, 8590614, 8590615, 8590615, 8591829, 8591829, 8595511, 8595511, 8590739, 8590739, 8587020, 8587020, 8590618, 8590618, 8590602, 8590602)(8500926, 8590737, 8590603, 8590614, 8590615, 8591829, 8595511, 8590739, 8587020, 8590618, 8590602)
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: []\n", - "Index: [(8500926, 8500926, 8590602, 8590602, 8595511, 8595511, 8590737, 8590737, 8591829, 8591829, 8590618, 8590618, 8587020, 8587020, 8590739, 8590739, 8590614, 8590614, 8590603, 8590603, 8590615, 8590615), (8500926, 8500926, 8590603, 8590603, 8587020, 8587020, 8590615, 8590615, 8590739, 8590739, 8591829, 8591829, 8590602, 8590602, 8590618, 8590618, 8590737, 8590737, 8595511, 8595511, 8590614, 8590614), (8500926, 8500926, 8590603, 8590603, 8590739, 8590739, 8595511, 8595511, 8590614, 8590614, 8590737, 8590737, 8590618, 8590618, 8587020, 8587020, 8591829, 8591829, 8590615, 8590615, 8590602, 8590602), (8500926, 8500926, 8590618, 8590618, 8590737, 8590737, 8590739, 8590739, 8587020, 8587020, 8590602, 8590602, 8590615, 8590615, 8595511, 8595511, 8591829, 8591829, 8590614, 8590614, 8590603, 8590603), (8500926, 8500926, 8590737, 8590737, 8590603, 8590603, 8590614, 8590614, 8590615, 8590615, 8591829, 8591829, 8595511, 8595511, 8590739, 8590739, 8587020, 8587020, 8590618, 8590618, 8590602, 8590602)]" + "Index: [(8500926, 8590602, 8595511, 8590737, 8591829, 8590618, 8587020, 8590739, 8590614, 8590603, 8590615), (8500926, 8590603, 8587020, 8590615, 8590739, 8591829, 8590602, 8590618, 8590737, 8595511, 8590614), (8500926, 8590603, 8590739, 8595511, 8590614, 8590737, 8590618, 8587020, 8591829, 8590615, 8590602), (8500926, 8590618, 8590737, 8590739, 8587020, 8590602, 8590615, 8595511, 8591829, 8590614, 8590603), (8500926, 8590737, 8590603, 8590614, 8590615, 8591829, 8595511, 8590739, 8587020, 8590618, 8590602)]" ] }, - "execution_count": 27, + "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#group to get unique stop sequences\n", "unique_stop_sequence = tuple_stops.groupby(\"stop_id\").count()\n", "unique_stop_sequence.head(5)" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Series([], dtype: int64)" ] }, - "execution_count": 28, + "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "unique_stop_sequence.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "These unique sequences of stops are our routes. We can create a unique ID, an integer, for each route" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", "
stop_idroute_int
0(8500926, 8500926, 8590602, 8590602, 8595511, ...(8500926, 8590602, 8595511, 8590737, 8591829, ...0
1(8500926, 8500926, 8590603, 8590603, 8587020, ...(8500926, 8590603, 8587020, 8590615, 8590739, ...1
2(8500926, 8500926, 8590603, 8590603, 8590739, ...(8500926, 8590603, 8590739, 8595511, 8590614, ...2
3(8500926, 8500926, 8590618, 8590618, 8590737, ...(8500926, 8590618, 8590737, 8590739, 8587020, ...3
4(8500926, 8500926, 8590737, 8590737, 8590603, ...(8500926, 8590737, 8590603, 8590614, 8590615, ...4
\n", "
" ], "text/plain": [ " stop_id route_int\n", - "0 (8500926, 8500926, 8590602, 8590602, 8595511, ... 0\n", - "1 (8500926, 8500926, 8590603, 8590603, 8587020, ... 1\n", - "2 (8500926, 8500926, 8590603, 8590603, 8590739, ... 2\n", - "3 (8500926, 8500926, 8590618, 8590618, 8590737, ... 3\n", - "4 (8500926, 8500926, 8590737, 8590737, 8590603, ... 4" + "0 (8500926, 8590602, 8595511, 8590737, 8591829, ... 0\n", + "1 (8500926, 8590603, 8587020, 8590615, 8590739, ... 1\n", + "2 (8500926, 8590603, 8590739, 8595511, 8590614, ... 2\n", + "3 (8500926, 8590618, 8590737, 8590739, 8587020, ... 3\n", + "4 (8500926, 8590737, 8590603, 8590614, 8590615, ... 4" ] }, - "execution_count": 29, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#create dataframe and route_int\n", "df_unique_stop_sequence = unique_stop_sequence.reset_index()\n", "df_unique_stop_sequence[\"route_int\"] = df_unique_stop_sequence.index\n", "df_unique_stop_sequence.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We add the route information to the trip" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", "
stop_idroute_int
trip_id
131.TA.26-301-j19-1.1.R(8500926, 8500926, 8590602, 8590602, 8595511, ...(8500926, 8590602, 8595511, 8590737, 8591829, ...0
32.TA.26-301-j19-1.1.R(8500926, 8500926, 8590603, 8590603, 8587020, ...(8500926, 8590603, 8587020, 8590615, 8590739, ...1
133.TA.26-301-j19-1.1.R(8500926, 8500926, 8590603, 8590603, 8590739, ...(8500926, 8590603, 8590739, 8595511, 8590614, ...2
31.TA.26-301-j19-1.1.R(8500926, 8500926, 8590618, 8590618, 8590737, ...(8500926, 8590618, 8590737, 8590739, 8587020, ...3
140.TA.26-301-j19-1.1.R(8500926, 8500926, 8590737, 8590737, 8590603, ...(8500926, 8590737, 8590603, 8590614, 8590615, ...4
\n", "
" ], "text/plain": [ " stop_id \\\n", "trip_id \n", - "131.TA.26-301-j19-1.1.R (8500926, 8500926, 8590602, 8590602, 8595511, ... \n", - "32.TA.26-301-j19-1.1.R (8500926, 8500926, 8590603, 8590603, 8587020, ... \n", - "133.TA.26-301-j19-1.1.R (8500926, 8500926, 8590603, 8590603, 8590739, ... \n", - "31.TA.26-301-j19-1.1.R (8500926, 8500926, 8590618, 8590618, 8590737, ... \n", - "140.TA.26-301-j19-1.1.R (8500926, 8500926, 8590737, 8590737, 8590603, ... \n", + "131.TA.26-301-j19-1.1.R (8500926, 8590602, 8595511, 8590737, 8591829, ... \n", + "32.TA.26-301-j19-1.1.R (8500926, 8590603, 8587020, 8590615, 8590739, ... \n", + "133.TA.26-301-j19-1.1.R (8500926, 8590603, 8590739, 8595511, 8590614, ... \n", + "31.TA.26-301-j19-1.1.R (8500926, 8590618, 8590737, 8590739, 8587020, ... \n", + "140.TA.26-301-j19-1.1.R (8500926, 8590737, 8590603, 8590614, 8590615, ... \n", "\n", " route_int \n", "trip_id \n", "131.TA.26-301-j19-1.1.R 0 \n", "32.TA.26-301-j19-1.1.R 1 \n", "133.TA.26-301-j19-1.1.R 2 \n", "31.TA.26-301-j19-1.1.R 3 \n", "140.TA.26-301-j19-1.1.R 4 " ] }, - "execution_count": 30, + "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#join with trip information\n", "trip_with_routes = tuple_stops.join(df_unique_stop_sequence.set_index(\"stop_id\"), on=\"stop_id\", how=\"left\").sort_values(\"route_int\")\n", "trip_with_routes.head(5)" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "trip_with_routes = trip_with_routes.rename(columns={\"stop_id\" : \"all_stops\"})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check if wrong manipulations cause to have the same, or higher, number of routes than trips. It is not the case" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "18668" ] }, - "execution_count": 32, + "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#check if routes and trips do not have the same number\n", "trip_with_routes.index.nunique()" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "16224" + "16210" ] }, - "execution_count": 33, + "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trip_with_routes.route_int.nunique()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We add the rout_int column to stop_times dataframe" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "#join to get route_int in stop_times\n", "stop_times_routes = stop_times_curated.join(trip_with_routes, how=\"left\", on=\"trip_id\" , lsuffix='_left', rsuffix='_right')\n" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
trip_idstop_idarrival_timedeparture_timestop_sequenceroute_iddirection_idstop_id_rawroute_descall_stopsroute_int
0666.TA.26-4-j19-1.20.R857618207:02:0007:02:00126-4-j19-118576182Tram(8576182, 8576182, 8591391, 8591391, 8576197, ...3253(8576182, 8591391, 8576197, 8576195, 8576193, ...3239
0666.TA.26-4-j19-1.20.R857618207:02:0007:02:00531175.TA.26-4-j19-1.25.H859105715:31:0015:31:00126-4-j19-11857618208591057Tram(8576182, 8576182, 8591391, 8591391, 8576197, ...3253(8591057, 8591306, 8591327, 8594239, 8576199, ...9348
1243.TA.26-311-j19-1.3.R859083407:16:0007:16:00671163.TA.26-4-j19-1.25.H859105716:44:0016:44:00126-311-j19-118590834Bus(8590834, 8590834, 8590849, 8590849, 8590619, ...845726-4-j19-108591057Tram(8591057, 8591391, 8576195, 8576200, 8591306, ...9350
1243.TA.26-311-j19-1.3.R859083407:16:0007:16:00126-311-j19-118590834Bus(8590834, 8590834, 8590849, 8590849, 8590619, ...84572501014.TA.26-4-j19-1.25.H859128210:37:0010:37:001226-4-j19-108591282Tram(8591282, 8587349, 8576182, 8591306, 8576199, ...13033
2406.TA.26-62-j19-1.3.R859134907:24:0007:24:00126-62-j19-1288586.TA.26-4-j19-1.20.R858734911:56:0011:56:001326-4-j19-118591349Bus(8591349, 8591349, 8591403, 8591403, 8591293, ...142598587349Tram(8587349, 8588078, 8591391, 8591257, 8576199, ...4880
\n", "
" ], "text/plain": [ - " trip_id stop_id arrival_time departure_time \\\n", - "0 666.TA.26-4-j19-1.20.R 8576182 07:02:00 07:02:00 \n", - "0 666.TA.26-4-j19-1.20.R 8576182 07:02:00 07:02:00 \n", - "1 243.TA.26-311-j19-1.3.R 8590834 07:16:00 07:16:00 \n", - "1 243.TA.26-311-j19-1.3.R 8590834 07:16:00 07:16:00 \n", - "2 406.TA.26-62-j19-1.3.R 8591349 07:24:00 07:24:00 \n", + " trip_id stop_id arrival_time departure_time \\\n", + "0 666.TA.26-4-j19-1.20.R 8576182 07:02:00 07:02:00 \n", + "53 1175.TA.26-4-j19-1.25.H 8591057 15:31:00 15:31:00 \n", + "67 1163.TA.26-4-j19-1.25.H 8591057 16:44:00 16:44:00 \n", + "250 1014.TA.26-4-j19-1.25.H 8591282 10:37:00 10:37:00 \n", + "288 586.TA.26-4-j19-1.20.R 8587349 11:56:00 11:56:00 \n", "\n", - " stop_sequence route_id direction_id stop_id_raw route_desc \\\n", - "0 1 26-4-j19-1 1 8576182 Tram \n", - "0 1 26-4-j19-1 1 8576182 Tram \n", - "1 1 26-311-j19-1 1 8590834 Bus \n", - "1 1 26-311-j19-1 1 8590834 Bus \n", - "2 1 26-62-j19-1 1 8591349 Bus \n", + " stop_sequence route_id direction_id stop_id_raw route_desc \\\n", + "0 1 26-4-j19-1 1 8576182 Tram \n", + "53 1 26-4-j19-1 0 8591057 Tram \n", + "67 1 26-4-j19-1 0 8591057 Tram \n", + "250 12 26-4-j19-1 0 8591282 Tram \n", + "288 13 26-4-j19-1 1 8587349 Tram \n", "\n", - " all_stops route_int \n", - "0 (8576182, 8576182, 8591391, 8591391, 8576197, ... 3253 \n", - "0 (8576182, 8576182, 8591391, 8591391, 8576197, ... 3253 \n", - "1 (8590834, 8590834, 8590849, 8590849, 8590619, ... 8457 \n", - "1 (8590834, 8590834, 8590849, 8590849, 8590619, ... 8457 \n", - "2 (8591349, 8591349, 8591403, 8591403, 8591293, ... 14259 " + " all_stops route_int \n", + "0 (8576182, 8591391, 8576197, 8576195, 8576193, ... 3239 \n", + "53 (8591057, 8591306, 8591327, 8594239, 8576199, ... 9348 \n", + "67 (8591057, 8591391, 8576195, 8576200, 8591306, ... 9350 \n", + "250 (8591282, 8587349, 8576182, 8591306, 8576199, ... 13033 \n", + "288 (8587349, 8588078, 8591391, 8591257, 8576199, ... 4880 " ] }, - "execution_count": 35, + "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_routes.head(5)" ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "16223" + "16209" ] }, - "execution_count": 36, + "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#check if route_int is correct\n", "stop_times_routes.route_int.max()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Trip_int\n", "The trip_int number needs to be ordered by route_int and time" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "18668" ] }, - "execution_count": 37, + "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#check number trips in stop_times\n", "stop_times_routes.trip_id.nunique()" ] }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", "
trip_idstop_idarrival_timedeparture_timestop_sequenceroute_iddirection_idstop_id_rawroute_descall_stopsroute_int
77516131.TA.26-301-j19-1.1.R858702016:53:0016:53:00126-301-j19-118587020:0:GBus(8500926, 8500926, 8590602, 8590602, 8595511, ...(8500926, 8590602, 8595511, 8590737, 8591829, ...0
7751649961131.TA.26-301-j19-1.1.R858702016:53:0016:53:001859551116:55:0016:55:00226-301-j19-118587020:0:G8595511Bus(8500926, 8500926, 8590602, 8590602, 8595511, ...(8500926, 8590602, 8595511, 8590737, 8591829, ...0
49961163549131.TA.26-301-j19-1.1.R859551116:55:0016:55:002859060316:57:0016:57:00326-301-j19-1185955118590603Bus(8500926, 8500926, 8590602, 8590602, 8595511, ...(8500926, 8590602, 8595511, 8590737, 8591829, ...0
4996170704131.TA.26-301-j19-1.1.R859551116:55:0016:55:002859182916:58:0016:58:00426-301-j19-1185955118591829Bus(8500926, 8500926, 8590602, 8590602, 8595511, ...(8500926, 8590602, 8595511, 8590737, 8591829, ...0
16354945228131.TA.26-301-j19-1.1.R859060316:57:0016:57:003859060216:59:0016:59:00526-301-j19-1185906038590602Bus(8500926, 8500926, 8590602, 8590602, 8595511, ...(8500926, 8590602, 8595511, 8590737, 8591829, ...0
\n", "
" ], "text/plain": [ " trip_id stop_id arrival_time departure_time \\\n", "77516 131.TA.26-301-j19-1.1.R 8587020 16:53:00 16:53:00 \n", - "77516 131.TA.26-301-j19-1.1.R 8587020 16:53:00 16:53:00 \n", - "49961 131.TA.26-301-j19-1.1.R 8595511 16:55:00 16:55:00 \n", "49961 131.TA.26-301-j19-1.1.R 8595511 16:55:00 16:55:00 \n", "163549 131.TA.26-301-j19-1.1.R 8590603 16:57:00 16:57:00 \n", + "70704 131.TA.26-301-j19-1.1.R 8591829 16:58:00 16:58:00 \n", + "45228 131.TA.26-301-j19-1.1.R 8590602 16:59:00 16:59:00 \n", "\n", " stop_sequence route_id direction_id stop_id_raw route_desc \\\n", "77516 1 26-301-j19-1 1 8587020:0:G Bus \n", - "77516 1 26-301-j19-1 1 8587020:0:G Bus \n", - "49961 2 26-301-j19-1 1 8595511 Bus \n", "49961 2 26-301-j19-1 1 8595511 Bus \n", "163549 3 26-301-j19-1 1 8590603 Bus \n", + "70704 4 26-301-j19-1 1 8591829 Bus \n", + "45228 5 26-301-j19-1 1 8590602 Bus \n", "\n", " all_stops route_int \n", - "77516 (8500926, 8500926, 8590602, 8590602, 8595511, ... 0 \n", - "77516 (8500926, 8500926, 8590602, 8590602, 8595511, ... 0 \n", - "49961 (8500926, 8500926, 8590602, 8590602, 8595511, ... 0 \n", - "49961 (8500926, 8500926, 8590602, 8590602, 8595511, ... 0 \n", - "163549 (8500926, 8500926, 8590602, 8590602, 8595511, ... 0 " + "77516 (8500926, 8590602, 8595511, 8590737, 8591829, ... 0 \n", + "49961 (8500926, 8590602, 8595511, 8590737, 8591829, ... 0 \n", + "163549 (8500926, 8590602, 8595511, 8590737, 8591829, ... 0 \n", + "70704 (8500926, 8590602, 8595511, 8590737, 8591829, ... 0 \n", + "45228 (8500926, 8590602, 8595511, 8590737, 8591829, ... 0 " ] }, - "execution_count": 38, + "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_routes.sort_values([\"route_int\", \"arrival_time\"]).head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Generate sequential trip_int, ordered by route and by time" ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0trip_inttrip_id
0131.TA.26-301-j19-1.1.R0131.TA.26-301-j19-1.1.R
132.TA.26-301-j19-1.1.R132.TA.26-301-j19-1.1.R
2133.TA.26-301-j19-1.1.R2133.TA.26-301-j19-1.1.R
331.TA.26-301-j19-1.1.R331.TA.26-301-j19-1.1.R
4140.TA.26-301-j19-1.1.R4140.TA.26-301-j19-1.1.R
\n", "
" ], "text/plain": [ " 0 trip_int trip_id\n", "0 131.TA.26-301-j19-1.1.R 0 131.TA.26-301-j19-1.1.R\n", "1 32.TA.26-301-j19-1.1.R 1 32.TA.26-301-j19-1.1.R\n", "2 133.TA.26-301-j19-1.1.R 2 133.TA.26-301-j19-1.1.R\n", "3 31.TA.26-301-j19-1.1.R 3 31.TA.26-301-j19-1.1.R\n", "4 140.TA.26-301-j19-1.1.R 4 140.TA.26-301-j19-1.1.R" ] }, - "execution_count": 39, + "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trip_df = pd.DataFrame(stop_times_routes.sort_values([\"route_int\", \"arrival_time\"]).trip_id.unique())\n", "trip_df[\"trip_int\"] = trip_df.index\n", "trip_df[\"trip_id\"] = trip_df.iloc[:,0]\n", "trip_df.head(5)" ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "18668" ] }, - "execution_count": 40, + "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#check number trip_id\n", "trip_df.trip_id.nunique()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We join trip_id to stop_times dataframe" ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 61, "metadata": {}, "outputs": [], "source": [ "#join to get trip_int in stop_times\n", "stop_times_routes_trip = stop_times_routes.join(trip_df.set_index(\"trip_id\"), how=\"inner\", on=\"trip_id\" , lsuffix='_left', rsuffix='_right')\n" ] }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 62, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", "
trip_idstop_idarrival_timedeparture_timestop_sequenceroute_iddirection_idstop_id_rawroute_descall_stopsroute_int0trip_int
77516131.TA.26-301-j19-1.1.R858702016:53:0016:53:00126-301-j19-118587020:0:GBus(8500926, 8500926, 8590602, 8590602, 8595511, ...(8500926, 8590602, 8595511, 8590737, 8591829, ...0131.TA.26-301-j19-1.1.R0
7751649961131.TA.26-301-j19-1.1.R858702016:53:0016:53:001859551116:55:0016:55:00226-301-j19-118587020:0:G8595511Bus(8500926, 8500926, 8590602, 8590602, 8595511, ...(8500926, 8590602, 8595511, 8590737, 8591829, ...0131.TA.26-301-j19-1.1.R0
49961163549131.TA.26-301-j19-1.1.R859551116:55:0016:55:002859060316:57:0016:57:00326-301-j19-1185955118590603Bus(8500926, 8500926, 8590602, 8590602, 8595511, ...(8500926, 8590602, 8595511, 8590737, 8591829, ...0131.TA.26-301-j19-1.1.R0
4996170704131.TA.26-301-j19-1.1.R859551116:55:0016:55:002859182916:58:0016:58:00426-301-j19-1185955118591829Bus(8500926, 8500926, 8590602, 8590602, 8595511, ...(8500926, 8590602, 8595511, 8590737, 8591829, ...0131.TA.26-301-j19-1.1.R0
16354945228131.TA.26-301-j19-1.1.R859060316:57:0016:57:003859060216:59:0016:59:00526-301-j19-1185906038590602Bus(8500926, 8500926, 8590602, 8590602, 8595511, ...(8500926, 8590602, 8595511, 8590737, 8591829, ...0131.TA.26-301-j19-1.1.R0
\n", "
" ], "text/plain": [ " trip_id stop_id arrival_time departure_time \\\n", "77516 131.TA.26-301-j19-1.1.R 8587020 16:53:00 16:53:00 \n", - "77516 131.TA.26-301-j19-1.1.R 8587020 16:53:00 16:53:00 \n", - "49961 131.TA.26-301-j19-1.1.R 8595511 16:55:00 16:55:00 \n", "49961 131.TA.26-301-j19-1.1.R 8595511 16:55:00 16:55:00 \n", "163549 131.TA.26-301-j19-1.1.R 8590603 16:57:00 16:57:00 \n", + "70704 131.TA.26-301-j19-1.1.R 8591829 16:58:00 16:58:00 \n", + "45228 131.TA.26-301-j19-1.1.R 8590602 16:59:00 16:59:00 \n", "\n", " stop_sequence route_id direction_id stop_id_raw route_desc \\\n", "77516 1 26-301-j19-1 1 8587020:0:G Bus \n", - "77516 1 26-301-j19-1 1 8587020:0:G Bus \n", - "49961 2 26-301-j19-1 1 8595511 Bus \n", "49961 2 26-301-j19-1 1 8595511 Bus \n", "163549 3 26-301-j19-1 1 8590603 Bus \n", + "70704 4 26-301-j19-1 1 8591829 Bus \n", + "45228 5 26-301-j19-1 1 8590602 Bus \n", "\n", " all_stops route_int \\\n", - "77516 (8500926, 8500926, 8590602, 8590602, 8595511, ... 0 \n", - "77516 (8500926, 8500926, 8590602, 8590602, 8595511, ... 0 \n", - "49961 (8500926, 8500926, 8590602, 8590602, 8595511, ... 0 \n", - "49961 (8500926, 8500926, 8590602, 8590602, 8595511, ... 0 \n", - "163549 (8500926, 8500926, 8590602, 8590602, 8595511, ... 0 \n", + "77516 (8500926, 8590602, 8595511, 8590737, 8591829, ... 0 \n", + "49961 (8500926, 8590602, 8595511, 8590737, 8591829, ... 0 \n", + "163549 (8500926, 8590602, 8595511, 8590737, 8591829, ... 0 \n", + "70704 (8500926, 8590602, 8595511, 8590737, 8591829, ... 0 \n", + "45228 (8500926, 8590602, 8595511, 8590737, 8591829, ... 0 \n", "\n", " 0 trip_int \n", "77516 131.TA.26-301-j19-1.1.R 0 \n", - "77516 131.TA.26-301-j19-1.1.R 0 \n", "49961 131.TA.26-301-j19-1.1.R 0 \n", - "49961 131.TA.26-301-j19-1.1.R 0 \n", - "163549 131.TA.26-301-j19-1.1.R 0 " + "163549 131.TA.26-301-j19-1.1.R 0 \n", + "70704 131.TA.26-301-j19-1.1.R 0 \n", + "45228 131.TA.26-301-j19-1.1.R 0 " ] }, - "execution_count": 42, + "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#save ordered stop_times\n", "stop_times_routes_trip = stop_times_routes_trip.sort_values([\"route_int\", \"trip_int\", \"stop_sequence\"])\n", "stop_times_routes_trip.head(5)" ] }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "18668" ] }, - "execution_count": 43, + "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#check if manipulations did not destroy trips\n", "stop_times_routes_trip.trip_id.nunique()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Stop_int\n", "Stop_int id needs to ordered by route, trip and stop sequence" ] }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 64, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1407" ] }, - "execution_count": 44, + "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#check number stops at entry\n", "stop_times_routes_trip.stop_id.nunique()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "stop_times_routes_trip is already in the right order. We create dataframe to create stop_int" ] }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 65, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0stop_intstop_id
0858702008587020
1859551118595511
2859060328590603
3859182938591829
4859060248590602
\n", "
" ], "text/plain": [ " 0 stop_int stop_id\n", "0 8587020 0 8587020\n", "1 8595511 1 8595511\n", "2 8590603 2 8590603\n", "3 8591829 3 8591829\n", "4 8590602 4 8590602" ] }, - "execution_count": 45, + "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stops_df = pd.DataFrame(stop_times_routes_trip.stop_id.unique())\n", "stops_df[\"stop_int\"] = stops_df.index\n", "stops_df[\"stop_id\"] = stops_df.iloc[:,0]\n", "stops_df.head(5)" ] }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 66, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1407" ] }, - "execution_count": 46, + "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#check if number stop_int correct\n", "stops_df.stop_int.nunique()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We add stop_int information to stop_times" ] }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "#join to get stop_int\n", "stop_times_routes_trip_stop = stop_times_routes_trip.join(stops_df.set_index(\"stop_id\"), how=\"inner\", on=\"stop_id\", lsuffix='_left', rsuffix='_right')\n" ] }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 68, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", "
trip_idstop_idarrival_timedeparture_timestop_sequenceroute_iddirection_idstop_id_rawroute_descall_stopsroute_int0_lefttrip_int0_rightstop_int
77516131.TA.26-301-j19-1.1.R858702016:53:0016:53:00126-301-j19-118587020:0:GBus(8500926, 8500926, 8590602, 8590602, 8595511, ...(8500926, 8590602, 8595511, 8590737, 8591829, ...0131.TA.26-301-j19-1.1.R085870200
77516131.TA.26-301-j19-1.1.R4683732.TA.26-301-j19-1.1.R858702016:53:0016:53:0007:38:0007:38:00126-301-j19-118587020:0:GBus(8500926, 8500926, 8590602, 8590602, 8595511, ...0131.TA.26-301-j19-1.1.R0(8500926, 8590603, 8587020, 8590615, 8590739, ...132.TA.26-301-j19-1.1.R185870200
4683732.TA.26-301-j19-1.1.R208591133.TA.26-301-j19-1.1.R858702007:38:0007:38:0016:38:0016:38:00126-301-j19-118587020:0:GBus(8500926, 8500926, 8590603, 8590603, 8587020, ...132.TA.26-301-j19-1.1.R1(8500926, 8590603, 8590739, 8595511, 8590614, ...2133.TA.26-301-j19-1.1.R285870200
4683732.TA.26-301-j19-1.1.R8126931.TA.26-301-j19-1.1.R858702007:38:0007:38:0010:38:0010:38:00126-301-j19-118587020:0:GBus(8500926, 8500926, 8590603, 8590603, 8587020, ...132.TA.26-301-j19-1.1.R1(8500926, 8590618, 8590737, 8590739, 8587020, ...331.TA.26-301-j19-1.1.R385870200
208591133.TA.26-301-j19-1.1.R212257140.TA.26-301-j19-1.1.R858702016:38:0016:38:0018:23:0018:23:00126-301-j19-118587020:0:GBus(8500926, 8500926, 8590603, 8590603, 8590739, ...2133.TA.26-301-j19-1.1.R2(8500926, 8590737, 8590603, 8590614, 8590615, ...4140.TA.26-301-j19-1.1.R485870200
\n", "
" ], "text/plain": [ " trip_id stop_id arrival_time departure_time \\\n", "77516 131.TA.26-301-j19-1.1.R 8587020 16:53:00 16:53:00 \n", - "77516 131.TA.26-301-j19-1.1.R 8587020 16:53:00 16:53:00 \n", - "46837 32.TA.26-301-j19-1.1.R 8587020 07:38:00 07:38:00 \n", "46837 32.TA.26-301-j19-1.1.R 8587020 07:38:00 07:38:00 \n", "208591 133.TA.26-301-j19-1.1.R 8587020 16:38:00 16:38:00 \n", + "81269 31.TA.26-301-j19-1.1.R 8587020 10:38:00 10:38:00 \n", + "212257 140.TA.26-301-j19-1.1.R 8587020 18:23:00 18:23:00 \n", "\n", " stop_sequence route_id direction_id stop_id_raw route_desc \\\n", "77516 1 26-301-j19-1 1 8587020:0:G Bus \n", - "77516 1 26-301-j19-1 1 8587020:0:G Bus \n", - "46837 1 26-301-j19-1 1 8587020:0:G Bus \n", "46837 1 26-301-j19-1 1 8587020:0:G Bus \n", "208591 1 26-301-j19-1 1 8587020:0:G Bus \n", + "81269 1 26-301-j19-1 1 8587020:0:G Bus \n", + "212257 1 26-301-j19-1 1 8587020:0:G Bus \n", "\n", " all_stops route_int \\\n", - "77516 (8500926, 8500926, 8590602, 8590602, 8595511, ... 0 \n", - "77516 (8500926, 8500926, 8590602, 8590602, 8595511, ... 0 \n", - "46837 (8500926, 8500926, 8590603, 8590603, 8587020, ... 1 \n", - "46837 (8500926, 8500926, 8590603, 8590603, 8587020, ... 1 \n", - "208591 (8500926, 8500926, 8590603, 8590603, 8590739, ... 2 \n", + "77516 (8500926, 8590602, 8595511, 8590737, 8591829, ... 0 \n", + "46837 (8500926, 8590603, 8587020, 8590615, 8590739, ... 1 \n", + "208591 (8500926, 8590603, 8590739, 8595511, 8590614, ... 2 \n", + "81269 (8500926, 8590618, 8590737, 8590739, 8587020, ... 3 \n", + "212257 (8500926, 8590737, 8590603, 8590614, 8590615, ... 4 \n", "\n", " 0_left trip_int 0_right stop_int \n", "77516 131.TA.26-301-j19-1.1.R 0 8587020 0 \n", - "77516 131.TA.26-301-j19-1.1.R 0 8587020 0 \n", "46837 32.TA.26-301-j19-1.1.R 1 8587020 0 \n", - "46837 32.TA.26-301-j19-1.1.R 1 8587020 0 \n", - "208591 133.TA.26-301-j19-1.1.R 2 8587020 0 " + "208591 133.TA.26-301-j19-1.1.R 2 8587020 0 \n", + "81269 31.TA.26-301-j19-1.1.R 3 8587020 0 \n", + "212257 140.TA.26-301-j19-1.1.R 4 8587020 0 " ] }, - "execution_count": 48, + "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_routes_trip_stop.head(5)" ] }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 69, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1407" ] }, - "execution_count": 49, + "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#check if no stops deleted during manipulation\n", "stop_times_routes_trip_stop.stop_id.nunique()" ] }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 70, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1406" ] }, - "execution_count": 50, + "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_routes_trip_stop.stop_int.max()" ] }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 71, "metadata": {}, "outputs": [], "source": [ "#keep only useful columns \n", "stop_times_int = stop_times_routes_trip_stop[[\"route_int\", \"trip_int\", \"stop_int\", \"stop_sequence\", \"arrival_time\", \"departure_time\",\\\n", " \"route_id\", \"trip_id\", \"stop_id\", \\\n", " \"route_desc\", \"stop_id_raw\"]].sort_values([\"route_int\", \"trip_int\", \"stop_sequence\"])" ] }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 72, "metadata": {}, "outputs": [], "source": [ "stop_times_int = stop_times_int.reset_index(drop=True)" ] }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 73, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", "
route_inttrip_intstop_intstop_sequencearrival_timedeparture_timeroute_idtrip_idstop_idroute_descstop_id_raw
0000116:53:0016:53:0026-301-j19-1131.TA.26-301-j19-1.1.R8587020Bus8587020:0:G
1000116:53:0016:53:00216:55:0016:55:0026-301-j19-1131.TA.26-301-j19-1.1.R85870208595511Bus8587020:0:G8595511
2001216:55:0016:55:00316:57:0016:57:0026-301-j19-1131.TA.26-301-j19-1.1.R85955118590603Bus85955118590603
3001216:55:0016:55:003416:58:0016:58:0026-301-j19-1131.TA.26-301-j19-1.1.R85955118591829Bus85955118591829
4002316:57:0016:57:004516:59:0016:59:0026-301-j19-1131.TA.26-301-j19-1.1.R85906038590602Bus85906038590602
5002316:57:0016:57:005617:00:0017:00:0026-301-j19-1131.TA.26-301-j19-1.1.R85906038590615Bus85906038590615
6003416:58:0016:58:006717:01:0017:01:0026-301-j19-1131.TA.26-301-j19-1.1.R85918298590614Bus85918298590614
7003416:58:0016:58:007817:01:0017:01:0026-301-j19-1131.TA.26-301-j19-1.1.R85918298590618Bus85918298590618
8004516:59:0016:59:008917:03:0017:03:0026-301-j19-1131.TA.26-301-j19-1.1.R85906028500926Bus85906028500926
9004516:59:0016:59:0091017:03:0017:03:0026-301-j19-1131.TA.26-301-j19-1.1.R85906028590737Bus85906028590737
\n", "
" ], "text/plain": [ " route_int trip_int stop_int stop_sequence arrival_time departure_time \\\n", "0 0 0 0 1 16:53:00 16:53:00 \n", - "1 0 0 0 1 16:53:00 16:53:00 \n", - "2 0 0 1 2 16:55:00 16:55:00 \n", - "3 0 0 1 2 16:55:00 16:55:00 \n", - "4 0 0 2 3 16:57:00 16:57:00 \n", - "5 0 0 2 3 16:57:00 16:57:00 \n", - "6 0 0 3 4 16:58:00 16:58:00 \n", - "7 0 0 3 4 16:58:00 16:58:00 \n", - "8 0 0 4 5 16:59:00 16:59:00 \n", - "9 0 0 4 5 16:59:00 16:59:00 \n", + "1 0 0 1 2 16:55:00 16:55:00 \n", + "2 0 0 2 3 16:57:00 16:57:00 \n", + "3 0 0 3 4 16:58:00 16:58:00 \n", + "4 0 0 4 5 16:59:00 16:59:00 \n", + "5 0 0 5 6 17:00:00 17:00:00 \n", + "6 0 0 6 7 17:01:00 17:01:00 \n", + "7 0 0 7 8 17:01:00 17:01:00 \n", + "8 0 0 8 9 17:03:00 17:03:00 \n", + "9 0 0 9 10 17:03:00 17:03:00 \n", "\n", " route_id trip_id stop_id route_desc stop_id_raw \n", "0 26-301-j19-1 131.TA.26-301-j19-1.1.R 8587020 Bus 8587020:0:G \n", - "1 26-301-j19-1 131.TA.26-301-j19-1.1.R 8587020 Bus 8587020:0:G \n", - "2 26-301-j19-1 131.TA.26-301-j19-1.1.R 8595511 Bus 8595511 \n", - "3 26-301-j19-1 131.TA.26-301-j19-1.1.R 8595511 Bus 8595511 \n", - "4 26-301-j19-1 131.TA.26-301-j19-1.1.R 8590603 Bus 8590603 \n", - "5 26-301-j19-1 131.TA.26-301-j19-1.1.R 8590603 Bus 8590603 \n", - "6 26-301-j19-1 131.TA.26-301-j19-1.1.R 8591829 Bus 8591829 \n", - "7 26-301-j19-1 131.TA.26-301-j19-1.1.R 8591829 Bus 8591829 \n", - "8 26-301-j19-1 131.TA.26-301-j19-1.1.R 8590602 Bus 8590602 \n", - "9 26-301-j19-1 131.TA.26-301-j19-1.1.R 8590602 Bus 8590602 " + "1 26-301-j19-1 131.TA.26-301-j19-1.1.R 8595511 Bus 8595511 \n", + "2 26-301-j19-1 131.TA.26-301-j19-1.1.R 8590603 Bus 8590603 \n", + "3 26-301-j19-1 131.TA.26-301-j19-1.1.R 8591829 Bus 8591829 \n", + "4 26-301-j19-1 131.TA.26-301-j19-1.1.R 8590602 Bus 8590602 \n", + "5 26-301-j19-1 131.TA.26-301-j19-1.1.R 8590615 Bus 8590615 \n", + "6 26-301-j19-1 131.TA.26-301-j19-1.1.R 8590614 Bus 8590614 \n", + "7 26-301-j19-1 131.TA.26-301-j19-1.1.R 8590618 Bus 8590618 \n", + "8 26-301-j19-1 131.TA.26-301-j19-1.1.R 8500926 Bus 8500926 \n", + "9 26-301-j19-1 131.TA.26-301-j19-1.1.R 8590737 Bus 8590737 " ] }, - "execution_count": 53, + "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_int.head(10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Transfer: delete transfer to same stop & get stop_int & stop_int2\n", "to finish" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 74, "metadata": {}, "outputs": [ { - "ename": "NameError", - "evalue": "name 'transfers' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m#check number stops transfers\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mtransfers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstop_id\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnunique\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mNameError\u001b[0m: name 'transfers' is not defined" - ] + "data": { + "text/plain": [ + "1497" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "#check number stops transfers\n", "transfers.stop_id.nunique()" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 75, "metadata": {}, "outputs": [ { - "ename": "NameError", - "evalue": "name 'transfers' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtransfers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mNameError\u001b[0m: name 'transfers' is not defined" - ] + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0stop_idstop_id2distanceTransfer_time_secstop_id_rawstop_id2_raw
00850092685906160.12243014685009268590616
11850092685907370.30017536085009268590737
22850218685021860.006762885021868502186:0:1
33850218685021860.0135241685021868502186:0:2
44850218685021860.000000085021868502186P
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 stop_id stop_id2 distance Transfer_time_sec stop_id_raw \\\n", + "0 0 8500926 8590616 0.122430 146 8500926 \n", + "1 1 8500926 8590737 0.300175 360 8500926 \n", + "2 2 8502186 8502186 0.006762 8 8502186 \n", + "3 3 8502186 8502186 0.013524 16 8502186 \n", + "4 4 8502186 8502186 0.000000 0 8502186 \n", + "\n", + " stop_id2_raw \n", + "0 8590616 \n", + "1 8590737 \n", + "2 8502186:0:1 \n", + "3 8502186:0:2 \n", + "4 8502186P " + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "transfers.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We delete transfers to the same stop" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "transfers_df = transfers[transfers['stop_id'] != transfers['stop_id2']]" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1491" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transfers_df.stop_id.nunique()" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 10482 entries, 0 to 12563\n", "Data columns (total 7 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Unnamed: 0 10482 non-null int64 \n", " 1 stop_id 10482 non-null int64 \n", " 2 stop_id2 10482 non-null int64 \n", " 3 distance 10482 non-null float64\n", " 4 Transfer_time_sec 10482 non-null int64 \n", " 5 stop_id_raw 10482 non-null object \n", " 6 stop_id2_raw 10482 non-null object \n", "dtypes: float64(1), int64(4), object(2)\n", "memory usage: 655.1+ KB\n" ] } ], "source": [ "transfers_df.info()" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stop_int
stop_id
85870200
85955111
85906032
85918293
85906024
\n", "
" ], "text/plain": [ " stop_int\n", "stop_id \n", "8587020 0\n", "8595511 1\n", "8590603 2\n", "8591829 3\n", "8590602 4" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_int[[\"stop_id\", \"stop_int\"]].set_index(\"stop_id\").head(5)" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [], "source": [ "#create stop_int column\n", "#this action eliminates stops not in stop_times\n", "transfers_df = transfers_df.merge(stop_times_int[[\"stop_id\", \"stop_int\"]].set_index(\"stop_id\"), how=\"inner\", on = \"stop_id\")" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0stop_idstop_id2distanceTransfer_time_secstop_id_rawstop_id2_rawstop_int
00850092685906160.12243146850092685906168
10850092685906160.12243146850092685906168
20850092685906160.12243146850092685906168
30850092685906160.12243146850092685906168
40850092685906160.12243146850092685906168
\n", "
" ], "text/plain": [ " Unnamed: 0 stop_id stop_id2 distance Transfer_time_sec stop_id_raw \\\n", "0 0 8500926 8590616 0.12243 146 8500926 \n", "1 0 8500926 8590616 0.12243 146 8500926 \n", "2 0 8500926 8590616 0.12243 146 8500926 \n", "3 0 8500926 8590616 0.12243 146 8500926 \n", "4 0 8500926 8590616 0.12243 146 8500926 \n", "\n", " stop_id2_raw stop_int \n", "0 8590616 8 \n", "1 8590616 8 \n", "2 8590616 8 \n", "3 8590616 8 \n", "4 8590616 8 " ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transfers_df.head(5)" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stop_id2stop_int_2
085870200
185955111
285906032
385918293
485906024
\n", "
" ], "text/plain": [ " stop_id2 stop_int_2\n", "0 8587020 0\n", "1 8595511 1\n", "2 8590603 2\n", "3 8591829 3\n", "4 8590602 4" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#create dataframe with stops\n", "df_stop_int2 = stop_times_int[[\"stop_id\", \"stop_int\"]].rename(columns={\"stop_id\": \"stop_id2\", \"stop_int\" : \"stop_int_2\"})\n", "df_stop_int2.head(5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "transfers_df_int = transfers_df.merge(df_stop_int2.set_index(\"stop_id2\"), how=\"inner\", on = \"stop_id2\")" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'transfers' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtransfers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mNameError\u001b[0m: name 'transfers' is not defined" ] } ], "source": [ "transfers.head(5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "transfers_stop_int2 = transfers" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#check number unique stops2 in transfers\n", "transfers.stop_id2.nunique()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "transfers.join(stop_times_curated[[\"stop_id\", \"stop_int\"]].set_index(\"stop_id\"), how=\"inner\", on = \"stop_id\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Raptor specific preprocessing" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### StopTimes: \n", "[[departure_route0_trip0_stop0, arrival_route0_trip0_stop_0], [departure_route0_trip0_stop1, arrival_route0_trip0_stop_1], …], [[departure_route0_trip1_stop0, arrival_route0_trip1_stop_0], …], ….], [[[departure_route1_trip0_stop0, arrival_route1_trip0_stop_0], …], [[departure_route1_trip1_stop0, arrival_route0_trip1_stop_0], …], ….], …]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We start by making sure the order is correct" ] }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 76, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", "
route_inttrip_intstop_intstop_sequencearrival_timedeparture_timeroute_idtrip_idstop_idroute_descstop_id_raw
0000116:53:0016:53:0026-301-j19-1131.TA.26-301-j19-1.1.R8587020Bus8587020:0:G
1000116:53:0016:53:00216:55:0016:55:0026-301-j19-1131.TA.26-301-j19-1.1.R85870208595511Bus8587020:0:G8595511
2001216:55:0016:55:00316:57:0016:57:0026-301-j19-1131.TA.26-301-j19-1.1.R85955118590603Bus85955118590603
3001216:55:0016:55:003416:58:0016:58:0026-301-j19-1131.TA.26-301-j19-1.1.R85955118591829Bus85955118591829
4002316:57:0016:57:004516:59:0016:59:0026-301-j19-1131.TA.26-301-j19-1.1.R85906038590602Bus85906038590602
\n", "
" ], "text/plain": [ " route_int trip_int stop_int stop_sequence arrival_time departure_time \\\n", "0 0 0 0 1 16:53:00 16:53:00 \n", - "1 0 0 0 1 16:53:00 16:53:00 \n", - "2 0 0 1 2 16:55:00 16:55:00 \n", - "3 0 0 1 2 16:55:00 16:55:00 \n", - "4 0 0 2 3 16:57:00 16:57:00 \n", + "1 0 0 1 2 16:55:00 16:55:00 \n", + "2 0 0 2 3 16:57:00 16:57:00 \n", + "3 0 0 3 4 16:58:00 16:58:00 \n", + "4 0 0 4 5 16:59:00 16:59:00 \n", "\n", " route_id trip_id stop_id route_desc stop_id_raw \n", "0 26-301-j19-1 131.TA.26-301-j19-1.1.R 8587020 Bus 8587020:0:G \n", - "1 26-301-j19-1 131.TA.26-301-j19-1.1.R 8587020 Bus 8587020:0:G \n", - "2 26-301-j19-1 131.TA.26-301-j19-1.1.R 8595511 Bus 8595511 \n", - "3 26-301-j19-1 131.TA.26-301-j19-1.1.R 8595511 Bus 8595511 \n", - "4 26-301-j19-1 131.TA.26-301-j19-1.1.R 8590603 Bus 8590603 " + "1 26-301-j19-1 131.TA.26-301-j19-1.1.R 8595511 Bus 8595511 \n", + "2 26-301-j19-1 131.TA.26-301-j19-1.1.R 8590603 Bus 8590603 \n", + "3 26-301-j19-1 131.TA.26-301-j19-1.1.R 8591829 Bus 8591829 \n", + "4 26-301-j19-1 131.TA.26-301-j19-1.1.R 8590602 Bus 8590602 " ] }, - "execution_count": 54, + "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_ordered = stop_times_int.sort_values(by=[\"route_int\", \"trip_int\", \"stop_sequence\"])\n", "stop_times_ordered.head(5)" ] }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 77, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", "
arrival_timedeparture_time
016:53:0016:53:00
116:53:0016:53:0016:55:0016:55:00
216:55:0016:55:0016:57:0016:57:00
316:55:0016:55:0016:58:0016:58:00
416:57:0016:57:0016:59:0016:59:00
\n", "
" ], "text/plain": [ " arrival_time departure_time\n", "0 16:53:00 16:53:00\n", - "1 16:53:00 16:53:00\n", - "2 16:55:00 16:55:00\n", - "3 16:55:00 16:55:00\n", - "4 16:57:00 16:57:00" + "1 16:55:00 16:55:00\n", + "2 16:57:00 16:57:00\n", + "3 16:58:00 16:58:00\n", + "4 16:59:00 16:59:00" ] }, - "execution_count": 55, + "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_ordered[[\"arrival_time\", \"departure_time\"]].head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We add None to first arrival time and last departure time." ] }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 78, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", "
route_inttrip_intstop_intstop_sequencearrival_timedeparture_timeroute_idtrip_idstop_idroute_descstop_id_rawsequence_shift_1
0000116:53:0016:53:0026-301-j19-1131.TA.26-301-j19-1.1.R8587020Bus8587020:0:G12
1000116:53:0016:53:00216:55:0016:55:0026-301-j19-1131.TA.26-301-j19-1.1.R85870208595511Bus8587020:0:G285955113
2001216:55:0016:55:00316:57:0016:57:0026-301-j19-1131.TA.26-301-j19-1.1.R85955118590603Bus8595511285906034
3001216:55:0016:55:003416:58:0016:58:0026-301-j19-1131.TA.26-301-j19-1.1.R85955118591829Bus8595511385918295
4002316:57:0016:57:004516:59:0016:59:0026-301-j19-1131.TA.26-301-j19-1.1.R85906038590602Bus8590603385906026
\n", "
" ], "text/plain": [ " route_int trip_int stop_int stop_sequence arrival_time departure_time \\\n", "0 0 0 0 1 16:53:00 16:53:00 \n", - "1 0 0 0 1 16:53:00 16:53:00 \n", - "2 0 0 1 2 16:55:00 16:55:00 \n", - "3 0 0 1 2 16:55:00 16:55:00 \n", - "4 0 0 2 3 16:57:00 16:57:00 \n", + "1 0 0 1 2 16:55:00 16:55:00 \n", + "2 0 0 2 3 16:57:00 16:57:00 \n", + "3 0 0 3 4 16:58:00 16:58:00 \n", + "4 0 0 4 5 16:59:00 16:59:00 \n", "\n", " route_id trip_id stop_id route_desc stop_id_raw \\\n", "0 26-301-j19-1 131.TA.26-301-j19-1.1.R 8587020 Bus 8587020:0:G \n", - "1 26-301-j19-1 131.TA.26-301-j19-1.1.R 8587020 Bus 8587020:0:G \n", - "2 26-301-j19-1 131.TA.26-301-j19-1.1.R 8595511 Bus 8595511 \n", - "3 26-301-j19-1 131.TA.26-301-j19-1.1.R 8595511 Bus 8595511 \n", - "4 26-301-j19-1 131.TA.26-301-j19-1.1.R 8590603 Bus 8590603 \n", + "1 26-301-j19-1 131.TA.26-301-j19-1.1.R 8595511 Bus 8595511 \n", + "2 26-301-j19-1 131.TA.26-301-j19-1.1.R 8590603 Bus 8590603 \n", + "3 26-301-j19-1 131.TA.26-301-j19-1.1.R 8591829 Bus 8591829 \n", + "4 26-301-j19-1 131.TA.26-301-j19-1.1.R 8590602 Bus 8590602 \n", "\n", " sequence_shift_1 \n", - "0 1 \n", - "1 2 \n", - "2 2 \n", - "3 3 \n", - "4 3 " + "0 2 \n", + "1 3 \n", + "2 4 \n", + "3 5 \n", + "4 6 " ] }, - "execution_count": 56, + "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#adding a shift\n", "stop_times_ordered[\"sequence_shift_1\"] = stop_times_ordered[\"stop_sequence\"].shift(-1, fill_value=0)\n", "stop_times_ordered.head(5)" ] }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 79, "metadata": {}, "outputs": [], "source": [ "stop_times_ordered['departure_time'] = np.where((stop_times_ordered[\"stop_sequence\"] > stop_times_ordered[\"sequence_shift_1\"]), None, stop_times_ordered['departure_time'])" ] }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 80, "metadata": {}, "outputs": [], "source": [ "stop_times_ordered[\"arrival_time\"] = np.where((stop_times_ordered[\"stop_sequence\"] == 1), None, stop_times_ordered['arrival_time'])" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 81, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
arrival_timedeparture_timestop_sequencesequence_shift_1
0None16:53:0012
116:55:0016:55:0023
216:57:0016:57:0034
316:58:0016:58:0045
416:59:0016:59:0056
\n", + "
" + ], + "text/plain": [ + " arrival_time departure_time stop_sequence sequence_shift_1\n", + "0 None 16:53:00 1 2\n", + "1 16:55:00 16:55:00 2 3\n", + "2 16:57:00 16:57:00 3 4\n", + "3 16:58:00 16:58:00 4 5\n", + "4 16:59:00 16:59:00 5 6" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "stop_times_ordered[[\"arrival_time\",\"departure_time\", \"stop_sequence\", \"sequence_shift_1\"]].head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We transform it in datetime as required by the raptor algorithm" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 82, "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'pd' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mstop_times_ordered\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'arrival_time'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_datetime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstop_times_ordered\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'arrival_time'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mstop_times_ordered\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'departure_time'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_datetime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstop_times_ordered\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'departure_time'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mNameError\u001b[0m: name 'pd' is not defined" - ] - } - ], + "outputs": [], "source": [ "stop_times_ordered['arrival_time'] = pd.to_datetime(stop_times_ordered['arrival_time'])\n", "stop_times_ordered['departure_time'] = pd.to_datetime(stop_times_ordered['departure_time'])" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 83, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
arrival_timedeparture_time
0NaT2020-05-21 16:53:00
12020-05-21 16:55:002020-05-21 16:55:00
22020-05-21 16:57:002020-05-21 16:57:00
32020-05-21 16:58:002020-05-21 16:58:00
42020-05-21 16:59:002020-05-21 16:59:00
\n", + "
" + ], + "text/plain": [ + " arrival_time departure_time\n", + "0 NaT 2020-05-21 16:53:00\n", + "1 2020-05-21 16:55:00 2020-05-21 16:55:00\n", + "2 2020-05-21 16:57:00 2020-05-21 16:57:00\n", + "3 2020-05-21 16:58:00 2020-05-21 16:58:00\n", + "4 2020-05-21 16:59:00 2020-05-21 16:59:00" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "stop_times_ordered[[\"arrival_time\", \"departure_time\"]].head(5)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 84, "metadata": {}, "outputs": [], "source": [ "with open('../data/stop_times_df.pkl','wb') as f: pickle.dump(stop_times_ordered, f)" ] }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 85, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", "
route_inttrip_intstop_intstop_sequencearrival_timedeparture_timeroute_idtrip_idstop_idroute_descstop_id_rawsequence_shift_1
00001NaT2020-05-21 16:53:0026-301-j19-1131.TA.26-301-j19-1.1.R8587020Bus8587020:0:G2
100122020-05-21 16:55:002020-05-21 16:55:0026-301-j19-1131.TA.26-301-j19-1.1.R8595511Bus85955113
200232020-05-21 16:57:002020-05-21 16:57:0026-301-j19-1131.TA.26-301-j19-1.1.R8590603Bus85906034
300342020-05-21 16:58:002020-05-21 16:58:0026-301-j19-1131.TA.26-301-j19-1.1.R8591829Bus85918295
400452020-05-21 16:59:002020-05-21 16:59:0026-301-j19-1131.TA.26-301-j19-1.1.R8590602Bus85906026
\n", "
" ], "text/plain": [ " route_int trip_int stop_int stop_sequence arrival_time \\\n", "0 0 0 0 1 NaT \n", "1 0 0 1 2 2020-05-21 16:55:00 \n", "2 0 0 2 3 2020-05-21 16:57:00 \n", "3 0 0 3 4 2020-05-21 16:58:00 \n", "4 0 0 4 5 2020-05-21 16:59:00 \n", "\n", " departure_time route_id trip_id stop_id \\\n", "0 2020-05-21 16:53:00 26-301-j19-1 131.TA.26-301-j19-1.1.R 8587020 \n", "1 2020-05-21 16:55:00 26-301-j19-1 131.TA.26-301-j19-1.1.R 8595511 \n", "2 2020-05-21 16:57:00 26-301-j19-1 131.TA.26-301-j19-1.1.R 8590603 \n", "3 2020-05-21 16:58:00 26-301-j19-1 131.TA.26-301-j19-1.1.R 8591829 \n", "4 2020-05-21 16:59:00 26-301-j19-1 131.TA.26-301-j19-1.1.R 8590602 \n", "\n", - " sequence_shift_1 \n", - "0 2 \n", - "1 3 \n", - "2 4 \n", - "3 5 \n", - "4 6 " + " route_desc stop_id_raw sequence_shift_1 \n", + "0 Bus 8587020:0:G 2 \n", + "1 Bus 8595511 3 \n", + "2 Bus 8590603 4 \n", + "3 Bus 8591829 5 \n", + "4 Bus 8590602 6 " ] }, - "execution_count": 62, + "execution_count": 85, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_ordered = stop_times_ordered.sort_values(by=[\"route_int\", \"trip_int\", \"stop_sequence\"])\n", "stop_times_ordered.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "And we transform it to array, ready ti be used by raptor" ] }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 86, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 'NaT', '2020-05-21T16:53:00.000000000'],\n", " ['2020-05-21T16:55:00.000000000', '2020-05-21T16:55:00.000000000'],\n", " ['2020-05-21T16:57:00.000000000', '2020-05-21T16:57:00.000000000'],\n", " ...,\n", " ['2020-05-21T15:10:00.000000000', 'NaT'],\n", " [ 'NaT', '2020-05-21T16:45:00.000000000'],\n", " ['2020-05-21T17:05:00.000000000', 'NaT']],\n", " dtype='datetime64[ns]')" ] }, - "execution_count": 63, + "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_array = stop_times_ordered[[\"arrival_time\", \"departure_time\"]].to_numpy()\n", "stop_times_array" ] }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 87, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "245738" ] }, - "execution_count": 64, + "execution_count": 87, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.size(stop_times_array,0)" ] }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 88, "metadata": {}, "outputs": [], "source": [ "with open('../data/stop_times_array.pkl','wb') as f: pickle.dump(stop_times_array, f)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Routes: \n", "[[route0_nr.Trips, route0_nr. Stops, route0_pointerRoutes, route0_pointerStops_times],[route1_nr.Trips, route1_nr. Stops,, route1_pointerRoutes, route1_pointerStops_times],…]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We start by getting the number of trips and stops there is for each route" ] }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 89, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
n_Tripsn_stops
route_int
0111
1111
2111
3111
4111
\n", "
" ], "text/plain": [ " n_Trips n_stops\n", "route_int \n", "0 1 11\n", "1 1 11\n", "2 1 11\n", "3 1 11\n", "4 1 11" ] }, - "execution_count": 72, + "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct_trips_stops = stop_times_ordered.groupby([\"route_int\"]).nunique()[[\"trip_int\",\"stop_int\"]].sort_index().rename(columns={\"trip_int\": \"n_Trips\", \"stop_int\": \"n_stops\"})\n", "distinct_trips_stops.head(5)" ] }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 90, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(16210, 2)" ] }, - "execution_count": 73, + "execution_count": 90, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct_trips_stops.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We create the pointer for the route stops, by adding the unique stops for each route" ] }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 91, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
n_Tripsn_stopspointer_routes_stops
route_int
01110
111111
211122
311133
411144
\n", "
" ], "text/plain": [ " n_Trips n_stops pointer_routes_stops\n", "route_int \n", "0 1 11 0\n", "1 1 11 11\n", "2 1 11 22\n", "3 1 11 33\n", "4 1 11 44" ] }, - "execution_count": 74, + "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct_trips_stops['pointer_routes_stops'] = distinct_trips_stops.n_stops.cumsum().shift(1, fill_value=0)\n", "distinct_trips_stops.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We create the pointer for stop_times by adding the number of stops in each route, counting duplicates (due to several trips)" ] }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 92, "metadata": {}, "outputs": [], "source": [ "distinct_trips_stops[\"pointer_stop_times\"] = (stop_times_ordered.groupby([\"route_int\"]).count().stop_id).cumsum().shift(1, fill_value=0)" ] }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 93, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
n_Tripsn_stopspointer_routes_stopspointer_stop_timespointer_routes_stops_shiftpointer_stop_times_shift
route_int
0111001111
111111112222
211122223333
311133334444
411144445555
\n", "
" ], "text/plain": [ " n_Trips n_stops pointer_routes_stops pointer_stop_times \\\n", "route_int \n", "0 1 11 0 0 \n", "1 1 11 11 11 \n", "2 1 11 22 22 \n", "3 1 11 33 33 \n", "4 1 11 44 44 \n", "\n", " pointer_routes_stops_shift pointer_stop_times_shift \n", "route_int \n", "0 11 11 \n", "1 22 22 \n", "2 33 33 \n", "3 44 44 \n", "4 55 55 " ] }, - "execution_count": 76, + "execution_count": 93, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct_trips_stops[\"pointer_routes_stops_shift\"] = distinct_trips_stops['pointer_routes_stops'].shift(-1, fill_value=0)\n", "distinct_trips_stops[\"pointer_stop_times_shift\"] = distinct_trips_stops['pointer_stop_times'].shift(-1, fill_value=0)\n", "distinct_trips_stops.head(5)" ] }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 94, "metadata": {}, "outputs": [], "source": [ "distinct_trips_stops['pointer_routes_stops'] = np.where((distinct_trips_stops[\"pointer_routes_stops\"] == distinct_trips_stops[\"pointer_routes_stops_shift\"]), None, distinct_trips_stops['pointer_routes_stops'])\n", "distinct_trips_stops['pointer_stop_times'] = np.where((distinct_trips_stops[\"pointer_stop_times\"] == distinct_trips_stops[\"pointer_stop_times_shift\"]), None, distinct_trips_stops['pointer_stop_times'])\n" ] }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 95, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "n_Trips False\n", "n_stops False\n", "pointer_routes_stops False\n", "pointer_stop_times False\n", "pointer_routes_stops_shift False\n", "pointer_stop_times_shift False\n", "dtype: bool" ] }, - "execution_count": 78, + "execution_count": 95, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct_trips_stops.isna().any()" ] }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 96, "metadata": {}, "outputs": [], "source": [ "with open('../data/routes_array_df.pkl','wb') as f: pickle.dump(distinct_trips_stops[['n_Trips', 'n_stops', 'pointer_routes_stops', 'pointer_stop_times']], f)" ] }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 97, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 16210 entries, 0 to 16209\n", "Data columns (total 6 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 n_Trips 16210 non-null int64 \n", " 1 n_stops 16210 non-null int64 \n", " 2 pointer_routes_stops 16210 non-null object\n", " 3 pointer_stop_times 16210 non-null object\n", " 4 pointer_routes_stops_shift 16210 non-null int64 \n", " 5 pointer_stop_times_shift 16210 non-null int64 \n", "dtypes: int64(4), object(2)\n", "memory usage: 886.5+ KB\n" ] } ], "source": [ "distinct_trips_stops.info()" ] }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 98, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[1, 11, 0, 0],\n", " [1, 11, 11, 11],\n", " [1, 11, 22, 22],\n", " ...,\n", " [1, 6, 237432, 245713],\n", " [1, 13, 237438, 245719],\n", " [3, 2, 237451, 245732]], dtype=object)" ] }, - "execution_count": 81, + "execution_count": 98, "metadata": {}, "output_type": "execute_result" } ], "source": [ "routes_array = distinct_trips_stops[['n_Trips', 'n_stops', 'pointer_routes_stops', 'pointer_stop_times']].to_numpy()\n", "routes_array" ] }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 99, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "16210" ] }, - "execution_count": 76, + "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.size(routes_array, 0)" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [], "source": [ "with open('../data/routes_array.pkl','wb') as f: pickle.dump(routes_array, f)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "RouteStops: [route0_stop0, route0_stop1,…, route1_stop0, route1_stop1,…, …]\n" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexroute_intstop_int
0000
1101
2202
3303
4404
\n", "
" ], "text/plain": [ " index route_int stop_int\n", "0 0 0 0\n", "1 1 0 1\n", "2 2 0 2\n", "3 3 0 3\n", "4 4 0 4" ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "route_stops = stop_times_ordered.sort_values([\"route_int\", \"stop_sequence\"])\n", "route_stops = route_stops[['route_int', 'stop_int']].drop_duplicates().reset_index()\n", "route_stops.head(5)" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 237453 entries, 0 to 237452\n", "Data columns (total 3 columns):\n", " # Column Non-Null Count Dtype\n", "--- ------ -------------- -----\n", " 0 index 237453 non-null int64\n", " 1 route_int 237453 non-null int64\n", " 2 stop_int 237453 non-null int64\n", "dtypes: int64(3)\n", "memory usage: 5.4 MB\n" ] } ], "source": [ "route_stops.info()" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "16210" ] }, "execution_count": 85, "metadata": {}, "output_type": "execute_result" } ], "source": [ "route_stops.route_int.nunique()" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [], "source": [ "with open('../data/route_stops_df.pkl','wb') as f: pickle.dump(route_stops, f)" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 0, 1, 2, ..., 1187, 573, 778])" ] }, "execution_count": 87, "metadata": {}, "output_type": "execute_result" } ], "source": [ "route_stops_array = route_stops.stop_int.to_numpy()\n", "route_stops_array" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1407" ] }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.size(np.unique(route_stops_array))" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "237453" ] }, "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.size(route_stops_array, 0)" ] }, { "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(237453,)" ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" } ], "source": [ "route_stops_array.shape" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [], "source": [ "with open('../data/route_stops_array.pkl','wb') as f: pickle.dump(route_stops_array, f)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check if pointers are correct\n", "It is fundamental that the indexes, that serve as pointers, in Routes are correct" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We start by looking at where the indexes for stop_times and route_stops diverge. This will allow us to change. We can see that Route stops should have a new route at 318 while stop_times should have it at 348, so we try with that" ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
n_Tripsn_stopspointer_routes_stopspointer_stop_timespointer_routes_stops_shiftpointer_stop_times_shift
route_int
24113315315318348
25114318348332362
26122332362334386
27153334386337431
28103337431340461
\n", "
" ], "text/plain": [ " n_Trips n_stops pointer_routes_stops pointer_stop_times \\\n", "route_int \n", "24 11 3 315 315 \n", "25 1 14 318 348 \n", "26 12 2 332 362 \n", "27 15 3 334 386 \n", "28 10 3 337 431 \n", "\n", " pointer_routes_stops_shift pointer_stop_times_shift \n", "route_int \n", "24 318 348 \n", "25 332 362 \n", "26 334 386 \n", "27 337 431 \n", "28 340 461 " ] }, "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct_trips_stops.loc[24:29].head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can check if the pointer indicates the routes index number. At the pointer_routes should indicate the first stop of a new route. We try with 318 to see if route_stops has a new route at this index. It does so it works" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexroute_intstop_int
3153152425
3163212427
3173162426
3183482525
3193492526
\n", "
" ], "text/plain": [ " index route_int stop_int\n", "315 315 24 25\n", "316 321 24 27\n", "317 316 24 26\n", "318 348 25 25\n", "319 349 25 26" ] }, "execution_count": 93, "metadata": {}, "output_type": "execute_result" } ], "source": [ "route_stops.loc[315:320].head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We go and see if stop_times has a new route at 348. It does, so it works" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
route_inttrip_intstop_intstop_sequencearrival_timedeparture_timeroute_idtrip_idstop_idsequence_shift_1
34624342622020-05-21 18:53:002020-05-21 18:53:0030-57-Y-j19-127.TA.30-57-Y-j19-1.1.H85022093
34724342732020-05-21 18:59:002020-05-21 18:59:0030-57-Y-j19-127.TA.30-57-Y-j19-1.1.H85032023
34825352532020-05-21 13:14:002020-05-21 13:15:0026-24-j19-1662.TA.26-24-j19-1.254.R85022084
34925352642020-05-21 13:17:002020-05-21 13:17:0026-24-j19-1662.TA.26-24-j19-1.254.R85022095
35025352752020-05-21 13:21:002020-05-21 13:21:0026-24-j19-1662.TA.26-24-j19-1.254.R85032026
\n", "
" ], "text/plain": [ " route_int trip_int stop_int stop_sequence arrival_time \\\n", "346 24 34 26 2 2020-05-21 18:53:00 \n", "347 24 34 27 3 2020-05-21 18:59:00 \n", "348 25 35 25 3 2020-05-21 13:14:00 \n", "349 25 35 26 4 2020-05-21 13:17:00 \n", "350 25 35 27 5 2020-05-21 13:21:00 \n", "\n", " departure_time route_id trip_id stop_id \\\n", "346 2020-05-21 18:53:00 30-57-Y-j19-1 27.TA.30-57-Y-j19-1.1.H 8502209 \n", "347 2020-05-21 18:59:00 30-57-Y-j19-1 27.TA.30-57-Y-j19-1.1.H 8503202 \n", "348 2020-05-21 13:15:00 26-24-j19-1 662.TA.26-24-j19-1.254.R 8502208 \n", "349 2020-05-21 13:17:00 26-24-j19-1 662.TA.26-24-j19-1.254.R 8502209 \n", "350 2020-05-21 13:21:00 26-24-j19-1 662.TA.26-24-j19-1.254.R 8503202 \n", "\n", " sequence_shift_1 \n", "346 3 \n", "347 3 \n", "348 4 \n", "349 5 \n", "350 6 " ] }, "execution_count": 94, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_ordered.loc[346:400].head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Stops: [[stop0_pointerRoutes, stop0_pointerTransfer], [stop1_pointerRoutes, stop1_pointerTransfer], …]" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [], "source": [ "#stops_join = route_stops.join(transfers.set_index(\"stop_int\"), how=\"left\", on=\"stop_int\").drop_duplicates()\n", "#stops_join.head(5)" ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexroute_intstop_int
0000
1101
2202
3303
4404
\n", "
" ], "text/plain": [ " index route_int stop_int\n", "0 0 0 0\n", "1 1 0 1\n", "2 2 0 2\n", "3 3 0 3\n", "4 4 0 4" ] }, "execution_count": 98, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stops_join = route_stops\n", "stops_join.head(5)" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1407" ] }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stops_join.stop_int.nunique()" ] }, { "cell_type": "code", "execution_count": 104, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
n_Routesn_Transfers
stop_int
07470
11740
21740
31740
41740
\n", "
" ], "text/plain": [ " n_Routes n_Transfers\n", "stop_int \n", "0 747 0\n", "1 174 0\n", "2 174 0\n", "3 174 0\n", "4 174 0" ] }, "execution_count": 104, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct_route_transfers = stops_join.groupby([\"stop_int\"]).nunique().rename(columns={\"route_int\": \"n_Routes\", \"stop_int2\": \"n_Transfers\"})\n", "distinct_route_transfers[\"n_Transfers\"] = 0\n", "distinct_route_transfers = distinct_route_transfers[[\"n_Routes\", \"n_Transfers\"]].sort_index().rename(columns={\"route_int\": \"n_Trips\", \"stop_int2\": \"n_stops\"})\n", "distinct_route_transfers.head(5)" ] }, { "cell_type": "code", "execution_count": 105, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
n_Routesn_TransfersCumsum_routesCumsum_transfers
stop_int
0747000
117407470
217409210
3174010950
4174012690
\n", "
" ], "text/plain": [ " n_Routes n_Transfers Cumsum_routes Cumsum_transfers\n", "stop_int \n", "0 747 0 0 0\n", "1 174 0 747 0\n", "2 174 0 921 0\n", "3 174 0 1095 0\n", "4 174 0 1269 0" ] }, "execution_count": 105, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct_route_transfers['Cumsum_routes'] = distinct_route_transfers.n_Routes.cumsum().shift(1, fill_value=0)\n", "distinct_route_transfers['Cumsum_transfers'] = distinct_route_transfers.n_Transfers.cumsum().shift(1, fill_value=0)\n", "distinct_route_transfers.head(5)" ] }, { "cell_type": "code", "execution_count": 106, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
n_Routesn_TransfersCumsum_routesCumsum_transfersCumsum_routes_shiftCumsum_transfers_shift
stop_int
07470007470
1174074709210
21740921010950
317401095012690
417401269014430
\n", "
" ], "text/plain": [ " n_Routes n_Transfers Cumsum_routes Cumsum_transfers \\\n", "stop_int \n", "0 747 0 0 0 \n", "1 174 0 747 0 \n", "2 174 0 921 0 \n", "3 174 0 1095 0 \n", "4 174 0 1269 0 \n", "\n", " Cumsum_routes_shift Cumsum_transfers_shift \n", "stop_int \n", "0 747 0 \n", "1 921 0 \n", "2 1095 0 \n", "3 1269 0 \n", "4 1443 0 " ] }, "execution_count": 106, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct_route_transfers[\"Cumsum_routes_shift\"] = distinct_route_transfers['Cumsum_routes'].shift(-1, fill_value=0)\n", "distinct_route_transfers[\"Cumsum_transfers_shift\"] = distinct_route_transfers['Cumsum_transfers'].shift(-1, fill_value=0)\n", "distinct_route_transfers.head(5)" ] }, { "cell_type": "code", "execution_count": 137, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
n_Routesn_TransfersCumsum_routesCumsum_transfersCumsum_routes_shiftCumsum_transfers_shift
stop_int
074700None7470
11740747None9210
21740921None10950
317401095None12690
417401269None14430
\n", "
" ], "text/plain": [ " n_Routes n_Transfers Cumsum_routes Cumsum_transfers \\\n", "stop_int \n", "0 747 0 0 None \n", "1 174 0 747 None \n", "2 174 0 921 None \n", "3 174 0 1095 None \n", "4 174 0 1269 None \n", "\n", " Cumsum_routes_shift Cumsum_transfers_shift \n", "stop_int \n", "0 747 0 \n", "1 921 0 \n", "2 1095 0 \n", "3 1269 0 \n", "4 1443 0 " ] }, "execution_count": 137, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct_route_transfers.head(5)" ] }, { "cell_type": "code", "execution_count": 107, "metadata": {}, "outputs": [], "source": [ "distinct_route_transfers['Cumsum_routes'] = np.where((distinct_route_transfers[\"Cumsum_routes\"] == distinct_route_transfers[\"Cumsum_routes_shift\"]), None, distinct_route_transfers['Cumsum_routes'])\n", "distinct_route_transfers['Cumsum_transfers'] = np.where((distinct_route_transfers[\"Cumsum_transfers\"] == distinct_route_transfers[\"Cumsum_transfers_shift\"]), None, distinct_route_transfers['Cumsum_transfers'])\n" ] }, { "cell_type": "code", "execution_count": 108, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "n_Routes False\n", "n_Transfers False\n", "Cumsum_routes False\n", "Cumsum_transfers True\n", "Cumsum_routes_shift False\n", "Cumsum_transfers_shift False\n", "dtype: bool" ] }, "execution_count": 108, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct_route_transfers.isna().any()" ] }, { "cell_type": "code", "execution_count": 109, "metadata": {}, "outputs": [], "source": [ "with open('../data/stops_df.pkl','wb') as f: pickle.dump(distinct_route_transfers[['Cumsum_routes', 'Cumsum_transfers']], f)" ] }, { "cell_type": "code", "execution_count": 133, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0, None],\n", " [747, None],\n", " [921, None],\n", " ...,\n", " [237334, None],\n", " [237389, None],\n", " [237444, None]], dtype=object)" ] }, "execution_count": 133, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stops_array = distinct_route_transfers[['Cumsum_routes', 'Cumsum_transfers']].to_numpy()\n", "stops_array" ] }, { "cell_type": "code", "execution_count": 134, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1407" ] }, "execution_count": 134, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.size(stops_array, 0)" ] }, { "cell_type": "code", "execution_count": 135, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1407, 2)" ] }, "execution_count": 135, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stops_array.shape" ] }, { "cell_type": "code", "execution_count": 136, "metadata": {}, "outputs": [], "source": [ "with open('../data/stops_array.pkl','wb') as f: pickle.dump(stops_array, f)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "StopRoutes: [stop0_route1, stop0_route3, stop1_route1, stop2_route1, stop1_route4, …]" ] }, { "cell_type": "code", "execution_count": 112, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexroute_intstop_int
0000
1101
2202
3303
4404
\n", "
" ], "text/plain": [ " index route_int stop_int\n", "0 0 0 0\n", "1 1 0 1\n", "2 2 0 2\n", "3 3 0 3\n", "4 4 0 4" ] }, "execution_count": 112, "metadata": {}, "output_type": "execute_result" } ], "source": [ "route_stops.head(5)" ] }, { "cell_type": "code", "execution_count": 126, "metadata": {}, "outputs": [], "source": [ "stop_routes = stop_times_ordered[[\"route_id\", \"route_int\", \"stop_id\", \"stop_int\"]].drop_duplicates().sort_values([\"stop_int\", \"route_int\"])\n", "stop_routes = stop_routes.reset_index()" ] }, { "cell_type": "code", "execution_count": 129, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexroute_idroute_intstop_idstop_int
92024477726-302-j19-11612185955111
921226-301-j19-1085906032
9221326-301-j19-1185906032
9232426-301-j19-1285906032
9243526-301-j19-1385906032
\n", "
" ], "text/plain": [ " index route_id route_int stop_id stop_int\n", "920 244777 26-302-j19-1 16121 8595511 1\n", "921 2 26-301-j19-1 0 8590603 2\n", "922 13 26-301-j19-1 1 8590603 2\n", "923 24 26-301-j19-1 2 8590603 2\n", "924 35 26-301-j19-1 3 8590603 2" ] }, "execution_count": 129, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_routes.loc[920:925].head()" ] }, { "cell_type": "code", "execution_count": 115, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(237810, 4)" ] }, "execution_count": 115, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_routes.shape" ] }, { "cell_type": "code", "execution_count": 116, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "249" ] }, "execution_count": 116, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_times_curated.route_id.nunique()" ] }, { "cell_type": "code", "execution_count": 117, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "16210" ] }, "execution_count": 117, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_routes.route_int.nunique()" ] }, { "cell_type": "code", "execution_count": 118, "metadata": {}, "outputs": [], "source": [ "with open('../data/stop_routes_df.pkl','wb') as f: pickle.dump(stop_routes, f)" ] }, { "cell_type": "code", "execution_count": 119, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 0, 1, 2, ..., 16136, 16146, 16147])" ] }, "execution_count": 119, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_routes_array = stop_routes[\"route_int\"].to_numpy()\n", "stop_routes_array" ] }, { "cell_type": "code", "execution_count": 130, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "237810" ] }, "execution_count": 130, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.size(stop_routes_array, 0)" ] }, { "cell_type": "code", "execution_count": 131, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(237810,)" ] }, "execution_count": 131, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stop_routes_array.shape" ] }, { "cell_type": "code", "execution_count": 132, "metadata": {}, "outputs": [], "source": [ "with open('../data/stop_routes_array.pkl','wb') as f: pickle.dump(stop_routes_array, f)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Transfer: [[[stop0_nameTargetStop1, transferTime1], [stop0_nameTargetStop2, transferTime2],….], [stop1_nameTargetStop1, transferTime1], [stop1_nameTargetStop2, transferTime2],….],…]" ] }, { "cell_type": "code", "execution_count": 123, "metadata": {}, "outputs": [ { "ename": "KeyError", "evalue": "'stop_int'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtransfer_pandas\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtransfers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msort_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"stop_int\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"stop_int2\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Transfer_time_sec\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mtransfer_pandas\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/opt/conda/lib/python3.7/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36msort_values\u001b[0;34m(self, by, axis, ascending, inplace, kind, na_position, ignore_index)\u001b[0m\n\u001b[1;32m 4925\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4926\u001b[0m \u001b[0mby\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mby\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4927\u001b[0;31m \u001b[0mk\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_label_or_level_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mby\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4928\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4929\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mascending\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mtuple\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/opt/conda/lib/python3.7/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m_get_label_or_level_values\u001b[0;34m(self, key, axis)\u001b[0m\n\u001b[1;32m 1690\u001b[0m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maxes\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_level_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_values\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1691\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1692\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1693\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1694\u001b[0m \u001b[0;31m# Check for duplicates\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mKeyError\u001b[0m: 'stop_int'" ] } ], "source": [ "transfer_pandas = transfers.sort_values(\"stop_int\")[[\"stop_int2\", \"Transfer_time_sec\"]]\n", "transfer_pandas.head()" ] }, { "cell_type": "code", "execution_count": 226, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1794" ] }, "execution_count": 226, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transfer_pandas.stop_int2.nunique()" ] }, { "cell_type": "code", "execution_count": 225, "metadata": {}, "outputs": [], "source": [ "with open('../data/transfer_df.pkl','wb') as f: pickle.dump(transfers.sort_values(\"stop_id\"), f)" ] }, { "cell_type": "code", "execution_count": 219, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[1166, 146],\n", " [1270, 360],\n", " [ 2, 8],\n", " ...,\n", " [ 108, 371],\n", " [ 102, 439],\n", " [1739, 519]])" ] }, "execution_count": 219, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transfer_array = transfer_pandas.to_numpy()\n", "transfer_array" ] }, { "cell_type": "code", "execution_count": 220, "metadata": {}, "outputs": [], "source": [ "with open('../data/transfer_array.pkl','wb') as f: pickle.dump(transfer_array, f)" ] }, { "cell_type": "code", "execution_count": 221, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "12564" ] }, "execution_count": 221, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.size(transfer_array, 0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "read files as pickles" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [], "source": [ "with open('../data/stop_times_array.pkl','rb') as f: arrayname1 = pickle.load(f)" ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [], "source": [ "with open('../data/routes_array.pkl','rb') as f: arrayname2 = pickle.load(f)" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [], "source": [ "with open('../data/route_stops_array.pkl','rb') as f: arrayname3 = pickle.load(f)" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 'NaT', '2020-05-21T16:53:00.000000000'],\n", " ['2020-05-21T16:55:00.000000000', '2020-05-21T16:55:00.000000000'],\n", " ['2020-05-21T16:57:00.000000000', '2020-05-21T16:57:00.000000000'],\n", " ...,\n", " ['2020-05-21T15:10:00.000000000', 'NaT'],\n", " [ 'NaT', '2020-05-21T16:45:00.000000000'],\n", " ['2020-05-21T17:05:00.000000000', 'NaT']],\n", " dtype='datetime64[ns]')" ] }, "execution_count": 96, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arrayname1" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[1, 11, 0, 0],\n", " [1, 11, 11, 11],\n", " [1, 11, 22, 22],\n", " ...,\n", " [1, 6, 237432, 245713],\n", " [1, 13, 237438, 245719],\n", " [3, 2, 237451, 245732]], dtype=object)" ] }, "execution_count": 94, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arrayname2" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 0, 1, 2, ..., 1187, 573, 778])" ] }, "execution_count": 100, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arrayname3" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 } diff --git a/notebooks/transfer_to_local.ipynb b/notebooks/transfer_to_local.ipynb index eb577dd..3d812fd 100644 --- a/notebooks/transfer_to_local.ipynb +++ b/notebooks/transfer_to_local.ipynb @@ -1,255 +1,255 @@ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## transfer files from HDFS to local\n", "\n", "
Any application without a proper name would be promptly killed.
" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "Current session configs: {'conf': {'spark.app.name': 'lgptguys_final'}, 'kind': 'pyspark'}
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", - "
IDYARN Application IDKindStateSpark UIDriver logCurrent session?
7272application_1589299642358_1768pysparkidleLinkLink
7281application_1589299642358_1777pysparkidleLinkLink
7283application_1589299642358_1779pysparkidleLinkLink
7292application_1589299642358_1788pysparkidleLinkLink
7298application_1589299642358_1794pysparkidleLinkLink
7301application_1589299642358_1797pysparkbusyLinkLink
7307application_1589299642358_1803pysparkbusyLinkLink
7309application_1589299642358_1805pysparkidleLinkLink
7310application_1589299642358_1806pysparkidleLinkLink
7311application_1589299642358_1807pysparkidleLinkLink
7314application_1589299642358_1810pysparkidleLinkLink
7315application_1589299642358_1811pysparkbusyLinkLink
7317application_1589299642358_1813pysparkidleLinkLink
7318application_1589299642358_1814pysparkidleLinkLink
7320application_1589299642358_1816pysparkidleLinkLink
7322application_1589299642358_1818pysparkidleLinkLink
7323application_1589299642358_1819pysparkidleLinkLink
7325application_1589299642358_1821pysparkbusyLinkLink
7326application_1589299642358_1822pysparkidleLinkLink
7328application_1589299642358_1824pysparkidleLinkLink
7331application_1589299642358_1827pysparkidleLinkLink
7334application_1589299642358_1830pysparkbusyLinkLink
7335application_1589299642358_1831pysparkdeadLinkLink
7336application_1589299642358_1832pysparkidleLinkLink
7337application_1589299642358_1833pysparkidleLinkLink
7338application_1589299642358_1834pysparkbusyLinkLink
7339application_1589299642358_1835pysparkidleLinkLink
7340application_1589299642358_1836pysparkidleLinkLink
7341application_1589299642358_1837pysparkidleLinkLink
7342application_1589299642358_1838pysparkidleLinkLink
" + "IDYARN Application IDKindStateSpark UIDriver logCurrent session?7272application_1589299642358_1768pysparkidleLinkLink7283application_1589299642358_1779pysparkidleLinkLink7292application_1589299642358_1788pysparkidleLinkLink7298application_1589299642358_1794pysparkidleLinkLink7301application_1589299642358_1797pysparkbusyLinkLink7310application_1589299642358_1806pysparkidleLinkLink7311application_1589299642358_1807pysparkidleLinkLink7314application_1589299642358_1810pysparkidleLinkLink7315application_1589299642358_1811pysparkbusyLinkLink7318application_1589299642358_1814pysparkidleLinkLink7325application_1589299642358_1821pysparkidleLinkLink7326application_1589299642358_1822pysparkidleLinkLink7328application_1589299642358_1824pysparkidleLinkLink7331application_1589299642358_1827pysparkidleLinkLink7334application_1589299642358_1830pysparkbusyLinkLink7336application_1589299642358_1832pysparkidleLinkLink7337application_1589299642358_1833pysparkidleLinkLink7338application_1589299642358_1834pysparkbusyLinkLink7339application_1589299642358_1835pysparkidleLinkLink7340application_1589299642358_1836pysparkidleLinkLink7341application_1589299642358_1837pysparkidleLinkLink7342application_1589299642358_1838pysparkidleLinkLink7343application_1589299642358_1839pysparkidleLinkLink7345application_1589299642358_1841pysparkidleLinkLink7346application_1589299642358_1842pysparkidleLinkLink7347application_1589299642358_1843pysparkidleLinkLink7348application_1589299642358_1844pysparkidleLinkLink7349application_1589299642358_1845pysparkbusyLinkLink7350application_1589299642358_1846pysparkidleLinkLink7352application_1589299642358_1848pysparkbusyLinkLink7353application_1589299642358_1849pysparkidleLinkLink7354application_1589299642358_1850pysparkstartingLinkLink" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%%configure\n", "{\"conf\": {\n", " \"spark.app.name\": \"lgptguys_final\"\n", "}}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Start Spark" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Starting Spark application\n" ] }, { "data": { "text/html": [ "\n", - "
IDYARN Application IDKindStateSpark UIDriver logCurrent session?
7343application_1589299642358_1839pysparkidleLinkLink
" + "IDYARN Application IDKindStateSpark UIDriver logCurrent session?7355application_1589299642358_1851pysparkidleLinkLink✔" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "SparkSession available as 'spark'.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "An error was encountered:\n", "unknown magic command '%spark'\n", "UnknownMagic: unknown magic command '%spark'\n", "\n" ] } ], "source": [ "# Initialization\n", "%%spark" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Transfert and save tables from hdfs to local \n", "\n", "Here we describe the process of loading a table on hdfs, saving it to a proper place so that we can load it in local and then save it.\n", "\n", "First, we load the data that is in an otherwise not accessible place in hdfs :" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "username = 'tturner'\n", "\n", "my_files = ['stop_times_curated.csv', 'trips_curated.csv',\n", " 'stops_15km.csv', 'transfers.csv', 'routes_curated.csv']\n", "\n", "for file in my_files:\n", " this_file = spark.read.csv('data/lgpt_guys/{}'.format(file), \\\n", " header = True) \n", " this_file.write.csv(\"/user/{0}/{1}\".format(username, file.replace('.csv','')), \\\n", " header = True, mode = 'overwrite')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/work/final_project/notebooks\n", "stop_times_curated\n", "trips_curated\n", "stops_15km\n", "transfers\n", "routes_curated\n" ] } ], "source": [ "%local\n", "\n", "from hdfs3 import HDFileSystem\n", "import pandas as pd\n", "import numpy as np \n", "import os\n", "print(os.getcwd())\n", "\n", "hdfs = HDFileSystem(host='hdfs://iccluster044.iccluster.epfl.ch', port=8020, user='ebouille')\n", "\n", "username = 'tturner'\n", "\n", "my_folders = ['stop_times_curated', 'trips_curated',\n", " 'stops_15km', 'transfers', 'routes_curated']\n", "\n", "for folder in my_folders:\n", " print(folder)\n", " array_files = hdfs.glob('/user/{0}/{1}/*.csv'.format(username, folder))\n", " array = pd.DataFrame()\n", " for file in array_files:\n", " with hdfs.open(file) as f:\n", " array = array.append(pd.read_csv(f))\n", "\n", " array.to_csv('../data/{}.csv'.format(folder), header=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "PySpark", "language": "", "name": "pysparkkernel" }, "language_info": { "codemirror_mode": { "name": "python", "version": 3 }, "mimetype": "text/x-python", "name": "pyspark", "pygments_lexer": "python3" } }, "nbformat": 4, "nbformat_minor": 4 }