diff --git a/.gitattributes b/.gitattributes index 1efcd35..fb29057 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,7 +1,9 @@ data/distributions.pickle filter=lfs diff=lfs merge=lfs -text data/stop_times_array_version2.csv filter=lfs diff=lfs merge=lfs -text data/transfer_array_version2.csv filter=lfs diff=lfs merge=lfs -text data/routes_array_version2.csv filter=lfs diff=lfs merge=lfs -text data/route_stops_array_version2.csv filter=lfs diff=lfs merge=lfs -text data/stop_routes_array_version3.csv filter=lfs diff=lfs merge=lfs -text data/stops_array_version2.csv filter=lfs diff=lfs merge=lfs -text +object.data filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text diff --git a/notebooks/Arrays_to_pickle.ipynb b/notebooks/Arrays_to_pickle.ipynb new file mode 100644 index 0000000..e36091a --- /dev/null +++ b/notebooks/Arrays_to_pickle.ipynb @@ -0,0 +1,2387 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Transform csv preproccesed files to pickle" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Import packages" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import pickle" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Read files" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 1883 entries, 0 to 1882\n", + "Data columns (total 8 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Unnamed: 0 1883 non-null int64 \n", + " 1 stop_id 1883 non-null object \n", + " 2 stop_name 1883 non-null object \n", + " 3 stop_lat 1883 non-null float64\n", + " 4 stop_lon 1883 non-null float64\n", + " 5 location_type 75 non-null float64\n", + " 6 parent_station 286 non-null object \n", + " 7 stop_int 1883 non-null int64 \n", + "dtypes: float64(3), int64(2), object(3)\n", + "memory usage: 117.8+ KB\n" + ] + } + ], + "source": [ + "#stops_15km\n", + "stops_15km = pd.read_csv(\"../data/stops_15km.csv\")\n", + "stops_15km.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 247920 entries, 0 to 247919\n", + "Data columns (total 13 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Unnamed: 0 247920 non-null int64 \n", + " 1 trip_id 247920 non-null object \n", + " 2 stop_id 247920 non-null object \n", + " 3 arrival_time 247920 non-null object \n", + " 4 departure_time 247920 non-null object \n", + " 5 stop_sequence 247920 non-null int64 \n", + " 6 pickup_type 247920 non-null int64 \n", + " 7 drop_off_type 247920 non-null int64 \n", + " 8 stop_int 247920 non-null int64 \n", + " 9 route_id 247920 non-null object \n", + " 10 sequence_1 247919 non-null float64\n", + " 11 trip_1 247919 non-null object \n", + " 12 route_int 247920 non-null int64 \n", + "dtypes: float64(1), int64(6), object(6)\n", + "memory usage: 24.6+ MB\n" + ] + } + ], + "source": [ + "#stop_times\n", + "stop_times_curated = pd.read_csv(\"../data/stop_times_curated.csv\")\n", + "stop_times_curated.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 12564 entries, 0 to 12563\n", + "Data columns (total 6 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Unnamed: 0 12564 non-null int64 \n", + " 1 stop_id 12564 non-null object\n", + " 2 stop_int 12564 non-null int64 \n", + " 3 stop_id2 12564 non-null object\n", + " 4 stop_int2 12564 non-null int64 \n", + " 5 Transfer_time_sec 12564 non-null int64 \n", + "dtypes: int64(4), object(2)\n", + "memory usage: 589.1+ KB\n" + ] + } + ], + "source": [ + "#transfers\n", + "transfers = pd.read_csv(\"../data/transfers.csv\")\n", + "transfers.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 22807 entries, 0 to 22806\n", + "Data columns (total 8 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Unnamed: 0 22807 non-null int64 \n", + " 1 route_id 22807 non-null object\n", + " 2 direction_id 22807 non-null int64 \n", + " 3 trip_id 22807 non-null object\n", + " 4 service_id 22807 non-null object\n", + " 5 trip_headsign 22807 non-null object\n", + " 6 trip_short_name 22807 non-null int64 \n", + " 7 route_int 22807 non-null int64 \n", + "dtypes: int64(4), object(4)\n", + "memory usage: 1.4+ MB\n" + ] + } + ], + "source": [ + "#trips_curated\n", + "trips_curated = pd.read_csv(\"../data/trips_curated.csv\")\n", + "trips_curated.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Raptor specific preprocessing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "StopTimes: [[departure_route0_trip0_stop0, arrival_route0_trip0_stop_0], [departure_route0_trip0_stop1, arrival_route0_trip0_stop_1], …], [[departure_route0_trip1_stop0, arrival_route0_trip1_stop_0], …], ….], [[[departure_route1_trip0_stop0, arrival_route1_trip0_stop_0], …], [[departure_route1_trip1_stop0, arrival_route0_trip1_stop_0], …], ….], …]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0trip_idstop_idarrival_timedeparture_timestop_sequencepickup_typedrop_off_typestop_introute_idsequence_1trip_1route_int
12941294100.TA.1-1-E-j19-1.12.R857867908:49:0008:49:001007911-1-E-j19-12.0100.TA.1-1-E-j19-1.12.R884763262977
12951295100.TA.1-1-E-j19-1.12.R859031408:50:0008:50:0020010361-1-E-j19-13.0100.TA.1-1-E-j19-1.12.R884763262977
12961296100.TA.1-1-E-j19-1.12.R859031708:51:0008:51:0030010371-1-E-j19-119.0100.TA.1-321-j19-1.6.H884763262977
20232023101.TA.1-1-E-j19-1.12.R857867909:04:0009:04:001007911-1-E-j19-12.0101.TA.1-1-E-j19-1.12.R884763262977
20242024101.TA.1-1-E-j19-1.12.R859031409:05:0009:05:0020010361-1-E-j19-13.0101.TA.1-1-E-j19-1.12.R884763262977
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 trip_id stop_id arrival_time \\\n", + "1294 1294 100.TA.1-1-E-j19-1.12.R 8578679 08:49:00 \n", + "1295 1295 100.TA.1-1-E-j19-1.12.R 8590314 08:50:00 \n", + "1296 1296 100.TA.1-1-E-j19-1.12.R 8590317 08:51:00 \n", + "2023 2023 101.TA.1-1-E-j19-1.12.R 8578679 09:04:00 \n", + "2024 2024 101.TA.1-1-E-j19-1.12.R 8590314 09:05:00 \n", + "\n", + " departure_time stop_sequence pickup_type drop_off_type stop_int \\\n", + "1294 08:49:00 1 0 0 791 \n", + "1295 08:50:00 2 0 0 1036 \n", + "1296 08:51:00 3 0 0 1037 \n", + "2023 09:04:00 1 0 0 791 \n", + "2024 09:05:00 2 0 0 1036 \n", + "\n", + " route_id sequence_1 trip_1 route_int \n", + "1294 1-1-E-j19-1 2.0 100.TA.1-1-E-j19-1.12.R 884763262977 \n", + "1295 1-1-E-j19-1 3.0 100.TA.1-1-E-j19-1.12.R 884763262977 \n", + "1296 1-1-E-j19-1 19.0 100.TA.1-321-j19-1.6.H 884763262977 \n", + "2023 1-1-E-j19-1 2.0 101.TA.1-1-E-j19-1.12.R 884763262977 \n", + "2024 1-1-E-j19-1 3.0 101.TA.1-1-E-j19-1.12.R 884763262977 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stop_times_ordered = stop_times_curated.sort_values(by=[\"route_id\", \"trip_id\", \"stop_sequence\"])\n", + "stop_times_ordered.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "stop_times_ordered['arrival_time'] = pd.to_datetime(stop_times_ordered['arrival_time'])\n", + "stop_times_ordered['departure_time'] = pd.to_datetime(stop_times_ordered['departure_time'])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
arrival_timedeparture_time
12942020-05-19 08:49:002020-05-19 08:49:00
12952020-05-19 08:50:002020-05-19 08:50:00
12962020-05-19 08:51:002020-05-19 08:51:00
20232020-05-19 09:04:002020-05-19 09:04:00
20242020-05-19 09:05:002020-05-19 09:05:00
\n", + "
" + ], + "text/plain": [ + " arrival_time departure_time\n", + "1294 2020-05-19 08:49:00 2020-05-19 08:49:00\n", + "1295 2020-05-19 08:50:00 2020-05-19 08:50:00\n", + "1296 2020-05-19 08:51:00 2020-05-19 08:51:00\n", + "2023 2020-05-19 09:04:00 2020-05-19 09:04:00\n", + "2024 2020-05-19 09:05:00 2020-05-19 09:05:00" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stop_times_ordered[[\"arrival_time\", \"departure_time\"]].head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Adding None to first arrival time and last departure time" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "stop_times_ordered['departure_time'] = np.where((stop_times_ordered[\"stop_sequence\"] > stop_times_ordered[\"sequence_1\"]), None, stop_times_ordered['departure_time'])" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "stop_times_ordered[\"arrival_time\"] = np.where((stop_times_ordered[\"stop_sequence\"] == 1), None, stop_times_ordered['arrival_time'])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
arrival_timedeparture_timestop_sequencesequence_1
1294None158987814000000000012.0
12951589878200000000000158987820000000000023.0
129615898782600000000001589878260000000000319.0
2023None158987904000000000012.0
20241589879100000000000158987910000000000023.0
\n", + "
" + ], + "text/plain": [ + " arrival_time departure_time stop_sequence sequence_1\n", + "1294 None 1589878140000000000 1 2.0\n", + "1295 1589878200000000000 1589878200000000000 2 3.0\n", + "1296 1589878260000000000 1589878260000000000 3 19.0\n", + "2023 None 1589879040000000000 1 2.0\n", + "2024 1589879100000000000 1589879100000000000 2 3.0" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stop_times_ordered[[\"arrival_time\",\"departure_time\", \"stop_sequence\", \"sequence_1\"]].head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "stop_times_ordered['arrival_time'] = pd.to_datetime(stop_times_ordered['arrival_time'])\n", + "stop_times_ordered['departure_time'] = pd.to_datetime(stop_times_ordered['departure_time'])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
arrival_timedeparture_time
1294NaT2020-05-19 08:49:00
12952020-05-19 08:50:002020-05-19 08:50:00
12962020-05-19 08:51:002020-05-19 08:51:00
2023NaT2020-05-19 09:04:00
20242020-05-19 09:05:002020-05-19 09:05:00
\n", + "
" + ], + "text/plain": [ + " arrival_time departure_time\n", + "1294 NaT 2020-05-19 08:49:00\n", + "1295 2020-05-19 08:50:00 2020-05-19 08:50:00\n", + "1296 2020-05-19 08:51:00 2020-05-19 08:51:00\n", + "2023 NaT 2020-05-19 09:04:00\n", + "2024 2020-05-19 09:05:00 2020-05-19 09:05:00" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stop_times_ordered[[\"arrival_time\", \"departure_time\"]].head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 247920 entries, 1294 to 239715\n", + "Data columns (total 13 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Unnamed: 0 247920 non-null int64 \n", + " 1 trip_id 247920 non-null object \n", + " 2 stop_id 247920 non-null object \n", + " 3 arrival_time 231520 non-null datetime64[ns]\n", + " 4 departure_time 212017 non-null datetime64[ns]\n", + " 5 stop_sequence 247920 non-null int64 \n", + " 6 pickup_type 247920 non-null int64 \n", + " 7 drop_off_type 247920 non-null int64 \n", + " 8 stop_int 247920 non-null int64 \n", + " 9 route_id 247920 non-null object \n", + " 10 sequence_1 247919 non-null float64 \n", + " 11 trip_1 247919 non-null object \n", + " 12 route_int 247920 non-null int64 \n", + "dtypes: datetime64[ns](2), float64(1), int64(6), object(4)\n", + "memory usage: 26.5+ MB\n" + ] + } + ], + "source": [ + "stop_times_ordered.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 'NaT', '2020-05-19T08:49:00.000000000'],\n", + " ['2020-05-19T08:50:00.000000000', '2020-05-19T08:50:00.000000000'],\n", + " ['2020-05-19T08:51:00.000000000', '2020-05-19T08:51:00.000000000'],\n", + " ...,\n", + " [ 'NaT', '2020-05-19T16:07:00.000000000'],\n", + " ['2020-05-19T16:11:00.000000000', '2020-05-19T16:15:00.000000000'],\n", + " ['2020-05-19T16:19:00.000000000', 'NaT']],\n", + " dtype='datetime64[ns]')" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stop_times_array = stop_times_ordered[[\"arrival_time\", \"departure_time\"]].to_numpy()\n", + "stop_times_array" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "247920" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.size(stop_times_array,0)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "with open('../data/stop_times_array.pkl','wb') as f: pickle.dump(stop_times_array, f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Routes: [[route0_nr. Stops, route0_nr.Trips, route0_pointerStops, route0_pointerTrip],[ route1_nr. Stops, route1_nr. Trips, route1_pointerStops, route1_pointerTrip],…]" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
n_Tripsn_stops
route_id
1-1-E-j19-11403
1-10-j19-1374
1-11-B-j19-1485
1-16-j19-1248
1-17-A-j19-110619
\n", + "
" + ], + "text/plain": [ + " n_Trips n_stops\n", + "route_id \n", + "1-1-E-j19-1 140 3\n", + "1-10-j19-1 37 4\n", + "1-11-B-j19-1 48 5\n", + "1-16-j19-1 24 8\n", + "1-17-A-j19-1 106 19" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "distinct_trips_stops = stop_times_curated.groupby([\"route_id\"]).nunique()[[\"trip_id\", \"stop_id\"]].sort_index().rename(columns={\"trip_id\": \"n_Trips\", \"stop_id\": \"n_stops\"})\n", + "distinct_trips_stops.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(279, 2)" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "distinct_trips_stops.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
n_Tripsn_stopsCumsum_tripsCumsum_stops
route_id
1-1-E-j19-114031403
1-10-j19-13741777
1-11-B-j19-148522512
1-16-j19-124824920
1-17-A-j19-11061935539
\n", + "
" + ], + "text/plain": [ + " n_Trips n_stops Cumsum_trips Cumsum_stops\n", + "route_id \n", + "1-1-E-j19-1 140 3 140 3\n", + "1-10-j19-1 37 4 177 7\n", + "1-11-B-j19-1 48 5 225 12\n", + "1-16-j19-1 24 8 249 20\n", + "1-17-A-j19-1 106 19 355 39" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "distinct_trips_stops['Cumsum_trips'] = distinct_trips_stops.n_Trips.cumsum()\n", + "distinct_trips_stops['Cumsum_stops'] = distinct_trips_stops.n_stops.cumsum()\n", + "distinct_trips_stops.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 279 entries, 1-1-E-j19-1 to 90-71-Y-j19-1\n", + "Data columns (total 4 columns):\n", + " # Column Non-Null Count Dtype\n", + "--- ------ -------------- -----\n", + " 0 n_Trips 279 non-null int64\n", + " 1 n_stops 279 non-null int64\n", + " 2 Cumsum_trips 279 non-null int64\n", + " 3 Cumsum_stops 279 non-null int64\n", + "dtypes: int64(4)\n", + "memory usage: 10.9+ KB\n" + ] + } + ], + "source": [ + "distinct_trips_stops.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
n_Tripsn_stopsCumsum_tripsCumsum_stopsCumsum_trips_shiftCumsum_stops_shift
route_id
1-1-E-j19-1140314031777
1-10-j19-1374177722512
1-11-B-j19-14852251224920
1-16-j19-12482492035539
1-17-A-j19-1106193553961045
\n", + "
" + ], + "text/plain": [ + " n_Trips n_stops Cumsum_trips Cumsum_stops \\\n", + "route_id \n", + "1-1-E-j19-1 140 3 140 3 \n", + "1-10-j19-1 37 4 177 7 \n", + "1-11-B-j19-1 48 5 225 12 \n", + "1-16-j19-1 24 8 249 20 \n", + "1-17-A-j19-1 106 19 355 39 \n", + "\n", + " Cumsum_trips_shift Cumsum_stops_shift \n", + "route_id \n", + "1-1-E-j19-1 177 7 \n", + "1-10-j19-1 225 12 \n", + "1-11-B-j19-1 249 20 \n", + "1-16-j19-1 355 39 \n", + "1-17-A-j19-1 610 45 " + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "distinct_trips_stops[\"Cumsum_trips_shift\"] = distinct_trips_stops['Cumsum_trips'].shift(-1, fill_value=0)\n", + "distinct_trips_stops[\"Cumsum_stops_shift\"] = distinct_trips_stops['Cumsum_stops'].shift(-1, fill_value=0)\n", + "distinct_trips_stops.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "distinct_trips_stops['Cumsum_trips'] = np.where((distinct_trips_stops[\"Cumsum_trips\"] == distinct_trips_stops[\"Cumsum_trips_shift\"]), None, distinct_trips_stops['Cumsum_trips'])\n", + "distinct_trips_stops['Cumsum_stops'] = np.where((distinct_trips_stops[\"Cumsum_stops\"] == distinct_trips_stops[\"Cumsum_stops_shift\"]), None, distinct_trips_stops['Cumsum_stops'])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "n_Trips False\n", + "n_stops False\n", + "Cumsum_trips False\n", + "Cumsum_stops False\n", + "Cumsum_trips_shift False\n", + "Cumsum_stops_shift False\n", + "dtype: bool" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "distinct_trips_stops.isna().any()" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[140, 3, 140, 3],\n", + " [37, 4, 177, 7],\n", + " [48, 5, 225, 12],\n", + " ...,\n", + " [84, 2, 22709, 3275],\n", + " [85, 2, 22794, 3277],\n", + " [13, 5, 22807, 3282]], dtype=object)" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "routes_array = distinct_trips_stops[['n_Trips', 'n_stops', 'Cumsum_trips', 'Cumsum_stops']].to_numpy()\n", + "routes_array" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "279" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.size(routes_array, 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [], + "source": [ + "with open('../data/routes_array.pkl','wb') as f: pickle.dump(routes_array, f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "RouteStops: [route0_stop0, route0_stop1,…, route1_stop0, route1_stop1,…, …]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
route_intstop_int
1294884763262977791
12958847632629771036
12968847632629771037
1613782576980377611037
1480072576980377611036
\n", + "
" + ], + "text/plain": [ + " route_int stop_int\n", + "1294 884763262977 791\n", + "1295 884763262977 1036\n", + "1296 884763262977 1037\n", + "161378 257698037761 1037\n", + "148007 257698037761 1036" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "route_stops = stop_times_curated.sort_values([\"route_id\", \"stop_sequence\"])\n", + "route_stops = route_stops[['route_int', 'stop_int']].drop_duplicates()\n", + "route_stops.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 5545 entries, 1294 to 64424\n", + "Data columns (total 2 columns):\n", + " # Column Non-Null Count Dtype\n", + "--- ------ -------------- -----\n", + " 0 route_int 5545 non-null int64\n", + " 1 stop_int 5545 non-null int64\n", + "dtypes: int64(2)\n", + "memory usage: 130.0 KB\n" + ] + } + ], + "source": [ + "route_stops.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 791, 1036, 1037, ..., 471, 472, 473])" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "route_stops_array = route_stops.stop_int.to_numpy()\n", + "route_stops_array" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "5545" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.size(route_stops_array, 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [], + "source": [ + "with open('../data/route_stops_array.pkl','wb') as f: pickle.dump(route_stops_array, f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Stops: [[stop0_pointerRoutes, stop0_pointerTransfer], [stop1_pointerRoutes, stop1_pointerTransfer], …]" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
route_intstop_intUnnamed: 0stop_idstop_id2stop_int2Transfer_time_sec
12948847632629777916053857867985903141036331
1613792576980377617916053857867985903141036331
12958847632629771036759585903148578679791331
129588476326297710367596859031485903171037496
1480072576980377611036759585903148578679791331
\n", + "
" + ], + "text/plain": [ + " route_int stop_int Unnamed: 0 stop_id stop_id2 stop_int2 \\\n", + "1294 884763262977 791 6053 8578679 8590314 1036 \n", + "161379 257698037761 791 6053 8578679 8590314 1036 \n", + "1295 884763262977 1036 7595 8590314 8578679 791 \n", + "1295 884763262977 1036 7596 8590314 8590317 1037 \n", + "148007 257698037761 1036 7595 8590314 8578679 791 \n", + "\n", + " Transfer_time_sec \n", + "1294 331 \n", + "161379 331 \n", + "1295 331 \n", + "1295 496 \n", + "148007 331 " + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stops_join = route_stops.join(transfers.set_index(\"stop_int\"), how=\"inner\", on=\"stop_int\").drop_duplicates()\n", + "stops_join.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
n_Routesn_Transfers
stop_idstop_int
8500926012
8502186:0:1217
8502186:0:2317
8502187:0:1613
8502187:0:2713
\n", + "
" + ], + "text/plain": [ + " n_Routes n_Transfers\n", + "stop_id stop_int \n", + "8500926 0 1 2\n", + "8502186:0:1 2 1 7\n", + "8502186:0:2 3 1 7\n", + "8502187:0:1 6 1 3\n", + "8502187:0:2 7 1 3" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "distinct_route_transfers = stops_join.groupby([\"stop_id\", \"stop_int\"]).nunique().rename(columns={\"route_int\": \"n_Routes\", \"stop_int2\": \"n_Transfers\"})\n", + "distinct_route_transfers = distinct_route_transfers[[\"n_Routes\", \"n_Transfers\"]].sort_index().rename(columns={\"route_int\": \"n_Trips\", \"stop_int2\": \"n_stops\"})\n", + "distinct_route_transfers.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
n_Routesn_TransfersCumsum_routesCumsum_transfers
stop_idstop_int
850092601212
8502186:0:121729
8502186:0:2317316
8502187:0:1613419
8502187:0:2713522
\n", + "
" + ], + "text/plain": [ + " n_Routes n_Transfers Cumsum_routes Cumsum_transfers\n", + "stop_id stop_int \n", + "8500926 0 1 2 1 2\n", + "8502186:0:1 2 1 7 2 9\n", + "8502186:0:2 3 1 7 3 16\n", + "8502187:0:1 6 1 3 4 19\n", + "8502187:0:2 7 1 3 5 22" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "distinct_route_transfers['Cumsum_routes'] = distinct_route_transfers.n_Routes.cumsum()\n", + "distinct_route_transfers['Cumsum_transfers'] = distinct_route_transfers.n_Transfers.cumsum()\n", + "distinct_route_transfers.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
n_Routesn_TransfersCumsum_routesCumsum_transfersCumsum_routes_shiftCumsum_transfers_shift
stop_idstop_int
85009260121229
8502186:0:121729316
8502186:0:2317316419
8502187:0:1613419522
8502187:0:2713522634
\n", + "
" + ], + "text/plain": [ + " n_Routes n_Transfers Cumsum_routes Cumsum_transfers \\\n", + "stop_id stop_int \n", + "8500926 0 1 2 1 2 \n", + "8502186:0:1 2 1 7 2 9 \n", + "8502186:0:2 3 1 7 3 16 \n", + "8502187:0:1 6 1 3 4 19 \n", + "8502187:0:2 7 1 3 5 22 \n", + "\n", + " Cumsum_routes_shift Cumsum_transfers_shift \n", + "stop_id stop_int \n", + "8500926 0 2 9 \n", + "8502186:0:1 2 3 16 \n", + "8502186:0:2 3 4 19 \n", + "8502187:0:1 6 5 22 \n", + "8502187:0:2 7 6 34 " + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "distinct_route_transfers[\"Cumsum_routes_shift\"] = distinct_route_transfers['Cumsum_routes'].shift(-1, fill_value=0)\n", + "distinct_route_transfers[\"Cumsum_transfers_shift\"] = distinct_route_transfers['Cumsum_transfers'].shift(-1, fill_value=0)\n", + "distinct_route_transfers.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [], + "source": [ + "distinct_route_transfers['Cumsum_routes'] = np.where((distinct_route_transfers[\"Cumsum_routes\"] == distinct_route_transfers[\"Cumsum_routes_shift\"]), None, distinct_route_transfers['Cumsum_routes'])\n", + "distinct_route_transfers['Cumsum_transfers'] = np.where((distinct_route_transfers[\"Cumsum_transfers\"] == distinct_route_transfers[\"Cumsum_transfers_shift\"]), None, distinct_route_transfers['Cumsum_transfers'])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "n_Routes False\n", + "n_Transfers False\n", + "Cumsum_routes False\n", + "Cumsum_transfers False\n", + "Cumsum_routes_shift False\n", + "Cumsum_transfers_shift False\n", + "dtype: bool" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "distinct_route_transfers.isna().any()" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[1, 2],\n", + " [2, 9],\n", + " [3, 16],\n", + " ...,\n", + " [5387, 10654],\n", + " [5389, 10669],\n", + " [5398, 10702]], dtype=object)" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stops_array = distinct_route_transfers[['Cumsum_routes', 'Cumsum_transfers']].to_numpy()\n", + "stops_array" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1512" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.size(stops_array, 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [], + "source": [ + "with open('../data/stops_array.pkl','wb') as f: pickle.dump(stops_array, f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "StopRoutes: [stop0_route1, stop0_route3, stop1_route1, stop2_route1, stop1_route4, …]" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
route_intstop_int
1294884763262977791
12958847632629771036
12968847632629771037
1613782576980377611037
1480072576980377611036
\n", + "
" + ], + "text/plain": [ + " route_int stop_int\n", + "1294 884763262977 791\n", + "1295 884763262977 1036\n", + "1296 884763262977 1037\n", + "161378 257698037761 1037\n", + "148007 257698037761 1036" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "route_stops.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
route_idroute_intstop_idstop_int
3451226-301-j19-1146028888064185009260
840181-17-A-j19-11803886264328502186:0:12
20341-17-A-j19-14810363371538502186:0:23
840151-17-A-j19-11803886264328502187:0:16
20371-17-A-j19-14810363371538502187:0:27
\n", + "
" + ], + "text/plain": [ + " route_id route_int stop_id stop_int\n", + "34512 26-301-j19-1 1460288880641 8500926 0\n", + "84018 1-17-A-j19-1 180388626432 8502186:0:1 2\n", + "2034 1-17-A-j19-1 481036337153 8502186:0:2 3\n", + "84015 1-17-A-j19-1 180388626432 8502187:0:1 6\n", + "2037 1-17-A-j19-1 481036337153 8502187:0:2 7" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stop_routes = stop_times_curated[[\"route_id\", \"route_int\", \"stop_id\", \"stop_int\"]].drop_duplicates().sort_values([\"stop_id\", \"route_id\"])\n", + "stop_routes.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(5545, 4)" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stop_routes.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([['26-301-j19-1', 1460288880641, '8500926', 0],\n", + " ['1-17-A-j19-1', 180388626432, '8502186:0:1', 2],\n", + " ['1-17-A-j19-1', 481036337153, '8502186:0:2', 3],\n", + " ...,\n", + " ['6-E01-j19-1', 317827579907, '8596126', 1882],\n", + " ['6-E02-j19-1', 317827579906, '8596126', 1882],\n", + " ['6-E02-j19-1', 618475290627, '8596126', 1882]], dtype=object)" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stop_routes_array = stop_routes.to_numpy()\n", + "stop_routes_array" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "5545" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.size(stop_routes_array, 0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open('../data/stop_routes_array.pkl','wb') as f: pickle.dump(stop_routes_array, f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Transfer: [[[stop0_nameTargetStop1, transferTime1], [stop0_nameTargetStop2, transferTime2],….], [stop1_nameTargetStop1, transferTime1], [stop1_nameTargetStop2, transferTime2],….],…]" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stop_int2Transfer_time_sec
01166146
11270360
228
3316
440
\n", + "
" + ], + "text/plain": [ + " stop_int2 Transfer_time_sec\n", + "0 1166 146\n", + "1 1270 360\n", + "2 2 8\n", + "3 3 16\n", + "4 4 0" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transfer_pandas = transfers.sort_values(\"stop_id\")[[\"stop_int2\", \"Transfer_time_sec\"]]\n", + "transfer_pandas.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[1166, 146],\n", + " [1270, 360],\n", + " [ 2, 8],\n", + " ...,\n", + " [ 108, 371],\n", + " [ 102, 439],\n", + " [1739, 519]])" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transfer_array = transfer_pandas.to_numpy()\n", + "transfer_array" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [], + "source": [ + "with open('../data/transfer_array.pkl','wb') as f: pickle.dump(transfer_array, f)" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "12564" + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.size(transfer_array, 0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "read files as pickles" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [], + "source": [ + "with open('../data/stop_times_array.pkl','rb') as f: arrayname1 = pickle.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [], + "source": [ + "with open('../data/transfer_array.pkl','rb') as f: arrayname2 = pickle.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 'NaT', '2020-05-19T08:49:00.000000000'],\n", + " ['2020-05-19T08:50:00.000000000', '2020-05-19T08:50:00.000000000'],\n", + " ['2020-05-19T08:51:00.000000000', '2020-05-19T08:51:00.000000000'],\n", + " ...,\n", + " [ 'NaT', '2020-05-19T16:07:00.000000000'],\n", + " ['2020-05-19T16:11:00.000000000', '2020-05-19T16:15:00.000000000'],\n", + " ['2020-05-19T16:19:00.000000000', 'NaT']],\n", + " dtype='datetime64[ns]')" + ] + }, + "execution_count": 88, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "arrayname1" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[1166, 146],\n", + " [1270, 360],\n", + " [ 2, 8],\n", + " ...,\n", + " [ 108, 371],\n", + " [ 102, 439],\n", + " [1739, 519]])" + ] + }, + "execution_count": 89, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "arrayname2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/to_begin_with.ipynb b/notebooks/to_begin_with.ipynb index 71b2e9f..5880da3 100644 --- a/notebooks/to_begin_with.ipynb +++ b/notebooks/to_begin_with.ipynb @@ -1,4439 +1,4453 @@ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# To Begin With...\n", "\n", "### Name your spark application as `GASPAR_final` or `GROUP_NAME_final`.\n", "\n", "
Any application without a proper name would be promptly killed.
" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "Current session configs: {'conf': {'spark.app.name': 'lgptguys_final'}, 'kind': 'pyspark'}
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", - "
IDYARN Application IDKindStateSpark UIDriver logCurrent session?
6663application_1589299642358_1152pysparkidleLinkLink
6666application_1589299642358_1155pysparkidleLinkLink
6667application_1589299642358_1156pysparkidleLinkLink
6668application_1589299642358_1157pysparkidleLinkLink
6670application_1589299642358_1159pysparkidleLinkLink
" + "IDYARN Application IDKindStateSpark UIDriver logCurrent session?6692application_1589299642358_1181pysparkidleLinkLink6708application_1589299642358_1199pysparkidleLinkLink6709application_1589299642358_1200pysparkidleLinkLink6710application_1589299642358_1201pysparkidleLinkLink6712application_1589299642358_1203pysparkidleLinkLink6714application_1589299642358_1205pysparkidleLinkLink6719application_1589299642358_1210pysparkidleLinkLink6721application_1589299642358_1212pysparkidleLinkLink6722application_1589299642358_1213pysparkidleLinkLink6724application_1589299642358_1215pysparkbusyLinkLink6725application_1589299642358_1216pysparkidleLinkLink6726application_1589299642358_1217pysparkidleLinkLink6727application_1589299642358_1218pysparkidleLinkLink6728application_1589299642358_1219pysparkidleLinkLink6729application_1589299642358_1220pysparkidleLinkLink6730application_1589299642358_1221pysparkidleLinkLink6733application_1589299642358_1224pysparkidleLinkLink6734application_1589299642358_1225pysparkidleLinkLink" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%%configure\n", "{\"conf\": {\n", " \"spark.app.name\": \"lgptguys_final\"\n", "}}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Start Spark" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Starting Spark application\n" ] }, { "data": { "text/html": [ "\n", - "
IDYARN Application IDKindStateSpark UIDriver logCurrent session?
6671application_1589299642358_1160pysparkidleLinkLink
" + "IDYARN Application IDKindStateSpark UIDriver logCurrent session?6736application_1589299642358_1227pysparkidleLinkLink✔" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "SparkSession available as 'spark'.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "An error was encountered:\n", "unknown magic command '%spark'\n", "UnknownMagic: unknown magic command '%spark'\n", "\n" ] } ], "source": [ "# Initialization\n", "%%spark" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Preprocessing" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from geopy.distance import great_circle\n", "from pyspark.sql.functions import *\n", "import numpy as np\n", "import pandas as pd\n", "from geopy.distance import great_circle\n", "from pyspark.sql.types import DoubleType\n", "from pyspark.sql.types import DateType\n", "\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create list of stops within 15 km" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Zurich HB coordinates\n", "zurich_geo = (47.378177, 8.540192)" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-------+--------------------+----------------+----------------+-------------+--------------+\n", "|stop_id| stop_name| stop_lat| stop_lon|location_type|parent_station|\n", "+-------+--------------------+----------------+----------------+-------------+--------------+\n", "|1322000| Altoggio|46.1672513851495| 8.345807131427| null| null|\n", "|1322001| Antronapiana| 46.060121674738|8.11361957990831| null| null|\n", "|1322002| Anzola|45.9898698225697|8.34571729989858| null| null|\n", "|1322003| Baceno|46.2614983591677|8.31925293162473| null| null|\n", "|1322004|Beura Cardezza, c...|46.0790618438814|8.29927439970313| null| null|\n", "+-------+--------------------+----------------+----------------+-------------+--------------+\n", "only showing top 5 rows" ] } ], "source": [ "stops = spark.read.csv(\"/data/sbb/timetables/csv/stops/2019/05/14/stops.txt\", header=True, sep = \",\")\n", "stops.show(5)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#defining udf function\n", "@udf(\"float\")\n", "def great_circle_udf(x, y):\n", " return great_circle(x, y).kilometers" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-----------+--------------------+----------------+----------------+-------------+--------------+\n", "| stop_id| stop_name| stop_lat| stop_lon|location_type|parent_station|\n", "+-----------+--------------------+----------------+----------------+-------------+--------------+\n", "| 8500926|Oetwil a.d.L., Sc...|47.4236270123012| 8.4031825286317| null| null|\n", "| 8502186|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| null| 8502186P|\n", "|8502186:0:1|Dietikon Stoffelbach|47.3934666445388|8.39894248049007| null| 8502186P|\n", "|8502186:0:2|Dietikon Stoffelbach|47.3935274568464|8.39894248049007| null| 8502186P|\n", "| 8502186P|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| 1| null|\n", "+-----------+--------------------+----------------+----------------+-------------+--------------+\n", "only showing top 5 rows" ] } ], "source": [ "#transforming zurich coordinates in columsn\n", "zurich_geo_col = struct(lit(zurich_geo[0]), lit(zurich_geo[1]))\n", "#applying filter function based on distance\n", "stops_15km = stops.filter(great_circle_udf(zurich_geo_col, struct(stops.stop_lat, stops.stop_lon)) < 15)\n", "stops_15km.show(5)" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-----------+--------------------+----------------+----------------+-------------+--------------+--------+\n", "| stop_id| stop_name| stop_lat| stop_lon|location_type|parent_station|stop_int|\n", "+-----------+--------------------+----------------+----------------+-------------+--------------+--------+\n", "| 8500926|Oetwil a.d.L., Sc...|47.4236270123012| 8.4031825286317| null| null| 0|\n", "| 8502186|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| null| 8502186P| 1|\n", "|8502186:0:1|Dietikon Stoffelbach|47.3934666445388|8.39894248049007| null| 8502186P| 2|\n", "|8502186:0:2|Dietikon Stoffelbach|47.3935274568464|8.39894248049007| null| 8502186P| 3|\n", "| 8502186P|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| 1| null| 4|\n", "+-----------+--------------------+----------------+----------------+-------------+--------------+--------+\n", "only showing top 5 rows" ] } ], "source": [ "stops_15km = stops_15km.withColumn(\"stop_int\", monotonically_increasing_id())\n", "stops_15km.show(5)" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stops_15km.write.csv('data/lgpt_guys/stops_15km.csv', header = True, mode=\"overwrite\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stops_15km = spark.read.csv('data/lgpt_guys/stops_15km.csv', header = True)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Precompute walking times between stations under 500m distance" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- stop_id: string (nullable = true)\n", " |-- stop_name: string (nullable = true)\n", " |-- stop_lat: string (nullable = true)\n", " |-- stop_lon: string (nullable = true)\n", " |-- location_type: string (nullable = true)\n", " |-- parent_station: string (nullable = true)\n", " |-- stop_int: long (nullable = false)" ] } ], "source": [ "stops_15km.printSchema()" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stops_15km2 = stops_15km.select(stops_15km.stop_id.alias(\"stop_id2\"), stops_15km.stop_int.alias(\"stop_int2\"), stops_15km.stop_lat.alias(\"stop_lat2\"),\\\n", " stops_15km.stop_lon.alias(\"stop_lon2\") )" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-------+--------------------+----------------+---------------+-------------+--------------+--------+-----------+---------+----------------+----------------+\n", "|stop_id| stop_name| stop_lat| stop_lon|location_type|parent_station|stop_int| stop_id2|stop_int2| stop_lat2| stop_lon2|\n", "+-------+--------------------+----------------+---------------+-------------+--------------+--------+-----------+---------+----------------+----------------+\n", "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 0| 8500926| 0|47.4236270123012| 8.4031825286317|\n", "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 0| 8502186| 1|47.3934058321612|8.39894248049007|\n", "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 0|8502186:0:1| 2|47.3934666445388|8.39894248049007|\n", "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 0|8502186:0:2| 3|47.3935274568464|8.39894248049007|\n", "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 0| 8502186P| 4|47.3934058321612|8.39894248049007|\n", "+-------+--------------------+----------------+---------------+-------------+--------------+--------+-----------+---------+----------------+----------------+\n", "only showing top 5 rows" ] } ], "source": [ "#creating all combinations of transfers\n", "combination_stops = stops_15km.crossJoin(stops_15km2)\n", "combination_stops.show(5)" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-------+--------------------+----------------+---------------+-------------+--------------+--------+-----------+---------+----------------+----------------+\n", "|stop_id| stop_name| stop_lat| stop_lon|location_type|parent_station|stop_int| stop_id2|stop_int2| stop_lat2| stop_lon2|\n", "+-------+--------------------+----------------+---------------+-------------+--------------+--------+-----------+---------+----------------+----------------+\n", "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 0| 8502186| 1|47.3934058321612|8.39894248049007|\n", "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 0|8502186:0:1| 2|47.3934666445388|8.39894248049007|\n", "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 0|8502186:0:2| 3|47.3935274568464|8.39894248049007|\n", "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 0| 8502186P| 4|47.3934058321612|8.39894248049007|\n", "|8500926|Oetwil a.d.L., Sc...|47.4236270123012|8.4031825286317| null| null| 0| 8502187| 5|47.3646945560768|8.37709545277724|\n", "+-------+--------------------+----------------+---------------+-------------+--------------+--------+-----------+---------+----------------+----------------+\n", "only showing top 5 rows" ] } ], "source": [ "#eliminating transfer to same station\n", "combination_stops = combination_stops.filter(combination_stops.stop_id!=combination_stops.stop_id2)\n", "combination_stops.show(5)" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-------+--------------------+----------------+----------------+-------------+--------------+--------+-----------+---------+----------------+----------------+------------+\n", "|stop_id| stop_name| stop_lat| stop_lon|location_type|parent_station|stop_int| stop_id2|stop_int2| stop_lat2| stop_lon2| distance|\n", "+-------+--------------------+----------------+----------------+-------------+--------------+--------+-----------+---------+----------------+----------------+------------+\n", "|8500926|Oetwil a.d.L., Sc...|47.4236270123012| 8.4031825286317| null| null| 0| 8590616| 1166|47.4228794433749|8.40437728795975| 0.122429974|\n", "|8500926|Oetwil a.d.L., Sc...|47.4236270123012| 8.4031825286317| null| null| 0| 8590737| 1270|47.4253712985116|8.40013723981811| 0.30017462|\n", "|8502186|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| null| 8502186P| 1|8502186:0:1| 2|47.3934666445388|8.39894248049007|0.0067620375|\n", "|8502186|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| null| 8502186P| 1|8502186:0:2| 3|47.3935274568464|8.39894248049007| 0.013524067|\n", "|8502186|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| null| 8502186P| 1| 8502186P| 4|47.3934058321612|8.39894248049007| 0.0|\n", "+-------+--------------------+----------------+----------------+-------------+--------------+--------+-----------+---------+----------------+----------------+------------+\n", "only showing top 5 rows" ] } ], "source": [ "combination_stops = combination_stops.withColumn(\"distance\", great_circle_udf(struct(combination_stops.stop_lat, combination_stops.stop_lon), struct(combination_stops.stop_lat2, combination_stops.stop_lon2)))\n", "combination_stops = combination_stops.filter(combination_stops.distance < 0.5)\n", "combination_stops.show(5)" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-------+--------+-----------+---------+-----------------+\n", "|stop_id|stop_int| stop_id2|stop_int2|Transfer_time_sec|\n", "+-------+--------+-----------+---------+-----------------+\n", "|8500926| 0| 8590616| 1166| 146|\n", "|8500926| 0| 8590737| 1270| 360|\n", "|8502186| 1|8502186:0:1| 2| 8|\n", "|8502186| 1|8502186:0:2| 3| 16|\n", "|8502186| 1| 8502186P| 4| 0|\n", "+-------+--------+-----------+---------+-----------------+\n", "only showing top 5 rows" ] } ], "source": [ "transfer = combination_stops.select(combination_stops.stop_id, combination_stops.stop_int, combination_stops.stop_id2,\\\n", " combination_stops.stop_int2, ((combination_stops.distance/0.05*60).alias(\"Transfer_time_sec\")).cast(\"integer\"))\n", "transfer.show(5)" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- stop_id: string (nullable = true)\n", " |-- stop_int: long (nullable = false)\n", " |-- stop_id2: string (nullable = true)\n", " |-- stop_int2: long (nullable = false)\n", " |-- Transfer_time_sec: integer (nullable = true)" ] } ], "source": [ "transfer.printSchema()" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "transfer.write.csv('data/lgpt_guys/transfers.csv', header = True, mode=\"overwrite\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Clean timetable data\n", "At the end of each step a file reader to be see the dataframe at this step without having to recompute" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Calendar.txt\n", "- drop services that are not on all business days" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "calendar = spark.read.csv(\"/data/sbb/timetables/csv/calendar/2019/05/14/calendar.txt\", header=True, sep = \",\")\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+----------+------+-------+---------+--------+------+--------+------+----------+--------+\n", "|service_id|monday|tuesday|wednesday|thursday|friday|saturday|sunday|start_date|end_date|\n", "+----------+------+-------+---------+--------+------+--------+------+----------+--------+\n", "| TA+b0nx9| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b03bf| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b0008| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b0nxg| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b08k4| 1| 0| 0| 0| 0| 0| 0| 20181209|20191214|\n", "+----------+------+-------+---------+--------+------+--------+------+----------+--------+\n", "only showing top 5 rows" ] } ], "source": [ "calendar.show(5)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+----------+------+-------+---------+--------+------+--------+------+----------+--------+\n", "|service_id|monday|tuesday|wednesday|thursday|friday|saturday|sunday|start_date|end_date|\n", "+----------+------+-------+---------+--------+------+--------+------+----------+--------+\n", "| TA+b0nx9| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b03bf| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b0008| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b0nxg| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "| TA+b0nxn| 1| 1| 1| 1| 1| 0| 0| 20181209|20191214|\n", "+----------+------+-------+---------+--------+------+--------+------+----------+--------+\n", "only showing top 5 rows" ] } ], "source": [ "calendar_business_days = calendar.filter((calendar.monday==1) & (calendar.tuesday==1) & (calendar.wednesday==1) & (calendar.thursday==1) & (calendar.friday==1))\n", "calendar_business_days.show(5)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+----------+\n", "|service_id|\n", "+----------+\n", "| TA+b0nx9|\n", "| TA+b03bf|\n", "| TA+b0008|\n", "| TA+b0nxg|\n", "| TA+b0nxn|\n", "+----------+\n", "only showing top 5 rows" ] } ], "source": [ "service_id_business_days = calendar_business_days.select(calendar_business_days.service_id)\n", "service_id_business_days.show(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### trips.txt\n", "- Drop trips not in business days" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-----------+----------+--------------------+------------------+---------------+------------+\n", "| route_id|service_id| trip_id| trip_headsign|trip_short_name|direction_id|\n", "+-----------+----------+--------------------+------------------+---------------+------------+\n", "|1-1-C-j19-1| TA+b0001|5.TA.1-1-C-j19-1.3.R|Zofingen, Altachen| 108| 1|\n", "|1-1-C-j19-1| TA+b0001|7.TA.1-1-C-j19-1.3.R|Zofingen, Altachen| 112| 1|\n", "|1-1-C-j19-1| TA+b0001|9.TA.1-1-C-j19-1.3.R|Zofingen, Altachen| 116| 1|\n", "|1-1-C-j19-1| TA+b0001|11.TA.1-1-C-j19-1...|Zofingen, Altachen| 120| 1|\n", "|1-1-C-j19-1| TA+b0001|13.TA.1-1-C-j19-1...|Zofingen, Altachen| 124| 1|\n", "+-----------+----------+--------------------+------------------+---------------+------------+\n", "only showing top 5 rows" ] } ], "source": [ "trips = spark.read.csv(\"/data/sbb/timetables/csv/trips/2019/05/14/trips.txt\", header=True, sep = \",\")\n", "trips.show(5)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+----------+-----------+--------------------+--------------------+---------------+------------+\n", "|service_id| route_id| trip_id| trip_headsign|trip_short_name|direction_id|\n", "+----------+-----------+--------------------+--------------------+---------------+------------+\n", "| TA+b0001|1-1-C-j19-1|46.TA.1-1-C-j19-1...|Aarburg-Oftringen...| 113| 0|\n", "| TA+b0001|1-1-C-j19-1|59.TA.1-1-C-j19-1...|Aarburg-Oftringen...| 139| 0|\n", "| TA+b0001|1-340-j19-1|2.TA.1-340-j19-1.1.H| Wohlen AG, Bahnhof| 105| 0|\n", "| TA+b0001|1-354-j19-1|36.TA.1-354-j19-1...|Kaiserstuhl AG, B...| 35435| 0|\n", "| TA+b0001|1-354-j19-1|47.TA.1-354-j19-1...|Kaiserstuhl AG, B...| 35467| 0|\n", "+----------+-----------+--------------------+--------------------+---------------+------------+\n", "only showing top 5 rows" ] } ], "source": [ "trips_businessdays = service_id_business_days.join(trips, how=\"inner\", on=\"service_id\").dropDuplicates()\n", "trips_businessdays.show(5)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "528368" ] } ], "source": [ "trips_businessdays.count()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "trips_businessdays.write.csv('data/lgpt_guys/trips_businessdays.csv', header = True, mode=\"overwrite\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "trips_businessdays = spark.read.csv('data/lgpt_guys/trips_businessdays.csv', header = True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Stop_times.txt\n", "- Drop trips not in trips_business_days\n", "- Drop stops not inside 15 km radius\n", "- Drop departures before 7 am and after 9 pm\n", "- Add stop_int" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+------------+--------------+-----------+-------------+-----------+-------------+\n", "| trip_id|arrival_time|departure_time| stop_id|stop_sequence|pickup_type|drop_off_type|\n", "+--------------------+------------+--------------+-----------+-------------+-----------+-------------+\n", "|1.TA.1-1-B-j19-1.1.R| 04:20:00| 04:20:00|8500010:0:3| 1| 0| 0|\n", "|1.TA.1-1-B-j19-1.1.R| 04:24:00| 04:24:00|8500020:0:3| 2| 0| 0|\n", "|1.TA.1-1-B-j19-1.1.R| 04:28:00| 04:28:00|8500021:0:5| 3| 0| 0|\n", "|1.TA.1-1-B-j19-1.1.R| 04:30:00| 04:30:00|8517131:0:2| 4| 0| 0|\n", "|1.TA.1-1-B-j19-1.1.R| 04:32:00| 04:32:00|8500300:0:5| 5| 0| 0|\n", "+--------------------+------------+--------------+-----------+-------------+-----------+-------------+\n", "only showing top 5 rows" ] } ], "source": [ "stop_times = spark.read.csv(\"/data/sbb/timetables/csv/stop_times/2019/05/14/stop_times.txt\", header=True, sep = \",\")\n", "stop_times.show(5)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+\n", "| trip_id|\n", "+--------------------+\n", "|65.TA.1-1-C-j19-1...|\n", "|34.TA.1-135-j19-1...|\n", "|50.TA.1-136-j19-1...|\n", "|42.TA.1-139-j19-1...|\n", "|39.TA.1-141-j19-1...|\n", "+--------------------+\n", "only showing top 5 rows" ] } ], "source": [ "trip_id_trips_businessdays = trips_businessdays.select(trips_businessdays.trip_id)\n", "trip_id_trips_businessdays.show(5)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+------------+--------------+-----------+-------------+-----------+-------------+\n", "| trip_id|arrival_time|departure_time| stop_id|stop_sequence|pickup_type|drop_off_type|\n", "+--------------------+------------+--------------+-----------+-------------+-----------+-------------+\n", "|93.TA.1-1-B-j19-1...| 17:24:00| 17:24:00|8500020:0:3| 2| 0| 0|\n", "|143.TA.1-1-B-j19-...| 07:28:00| 07:28:00|8517131:0:1| 9| 0| 0|\n", "|164.TA.1-1-B-j19-...| 10:21:00| 10:22:00|8500301:0:4| 6| 0| 0|\n", "|31.TA.1-1-C-j19-1...| 19:39:00| 19:39:00| 8502384| 19| 0| 0|\n", "|34.TA.1-1-C-j19-1...| 20:39:00| 20:39:00| 8502384| 19| 0| 0|\n", "+--------------------+------------+--------------+-----------+-------------+-----------+-------------+\n", "only showing top 5 rows" ] } ], "source": [ "#drop trips not in business days\n", "stop_times_business_days = stop_times.join(trip_id_trips_businessdays, how=\"inner\", on = \"trip_id\").dropDuplicates()\n", "stop_times_business_days.show(5)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-----------+--------+\n", "| stop_id|stop_int|\n", "+-----------+--------+\n", "| 8500926| 0|\n", "| 8502186| 1|\n", "|8502186:0:1| 2|\n", "|8502186:0:2| 3|\n", "| 8502186P| 4|\n", "+-----------+--------+\n", "only showing top 5 rows" ] } ], "source": [ "stop_id_stops_15km = stops_15km.select(stops_15km.stop_id, stops_15km.stop_int)\n", "stop_id_stops_15km.show(5)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+------------+--------------------+------------+--------------+-------------+-----------+-------------+--------+\n", "| stop_id| trip_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|stop_int|\n", "+------------+--------------------+------------+--------------+-------------+-----------+-------------+--------+\n", "|8503000:0:10|113.TA.21-75-j19-...| 07:35:00| 07:35:00| 1| 0| 0| 96|\n", "| 8591361|2290.TA.26-31-j19...| 24:49:00| 24:49:00| 8| 0| 0| 1721|\n", "| 8590318|2289.TA.26-31-j19...| 24:52:00| 24:52:00| 7| 0| 0| 1038|\n", "| 8591219|2629.TA.26-31-j19...| 20:45:00| 20:45:00| 12| 0| 0| 1588|\n", "| 8591137|2629.TA.26-31-j19...| 21:01:00| 21:01:00| 23| 0| 0| 1510|\n", "+------------+--------------------+------------+--------------+-------------+-----------+-------------+--------+\n", "only showing top 5 rows" ] } ], "source": [ "#drop trips not in 15 km radius\n", "stop_times_zurich = stop_times_business_days.join(stop_id_stops_15km, how=\"inner\", on = \"stop_id\").dropDuplicates()\n", "stop_times_zurich.show(5)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-------+--------------------+------------+--------------+-------------+-----------+-------------+--------+\n", "|stop_id| trip_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|stop_int|\n", "+-------+--------------------+------------+--------------+-------------+-----------+-------------+--------+\n", "|8591219|2629.TA.26-31-j19...| 20:45:00| 20:45:00| 12| 0| 0| 1588|\n", "|8591262|2631.TA.26-31-j19...| 20:30:00| 20:30:00| 14| 0| 0| 1630|\n", "|8591072|2638.TA.26-31-j19...| 19:59:00| 19:59:00| 30| 0| 0| 1452|\n", "|8591445|2687.TA.26-31-j19...| 13:55:00| 13:55:00| 32| 0| 0| 1801|\n", "|8591186|2695.TA.26-31-j19...| 12:21:00| 12:21:00| 7| 0| 0| 1555|\n", "+-------+--------------------+------------+--------------+-------------+-----------+-------------+--------+\n", "only showing top 5 rows" ] } ], "source": [ "#drop departures before 7 am and after 9 pm\n", "stop_times_zurich_day = stop_times_zurich.filter(hour(stop_times_zurich.departure_time) < 21).filter(hour(stop_times_zurich.departure_time) > 7)\n", "stop_times_zurich_day.show(5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_zurich_day.write.csv('data/lgpt_guys/stop_times_zurich_day.csv', header = True, mode=\"overwrite\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_zurich_day = spark.read.csv('data/lgpt_guys/stop_times_zurich_day.csv', header = True)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "296583" ] } ], "source": [ "stop_times_zurich_day.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### trips.txt\n", "- drop trips not inside zurich area\n", "- drop trips active only before 7am and 9pm\n", "- Add route_int" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+\n", "| trip_id|\n", "+--------------------+\n", "|247.TA.16-13-j19-...|\n", "|246.TA.16-5-j19-1...|\n", "|67.TA.17-4-j19-1....|\n", "|32.TA.21-75-j19-1...|\n", "|2667.TA.26-31-j19...|\n", "+--------------------+\n", "only showing top 5 rows" ] } ], "source": [ "#trips inside zurich sarea and add route_int\n", "trips_zurich = stop_times_zurich_day.select(stop_times_zurich_day.trip_id)\n", "trips_zurich.show(5)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+----------+-------------+------------------+---------------+------------+\n", "| trip_id|service_id| route_id| trip_headsign|trip_short_name|direction_id|\n", "+--------------------+----------+-------------+------------------+---------------+------------+\n", "|1349.TA.26-13-j19...| TA+b0003| 26-13-j19-1|Zürich, Frankental| 3781| 1|\n", "|902.TA.26-14-A-j1...| TA+b0003|26-14-A-j19-1| Zürich, Seebach| 3535| 1|\n", "|25.TA.26-25-A-j19...| TA+b0003|26-25-A-j19-1| Zürich, Dolder| 108| 1|\n", "|335.TA.26-25-A-j1...| TA+b0003|26-25-A-j19-1| Zürich, Römerhof| 54| 0|\n", "|161.TA.26-302-j19...| TA+b0003| 26-302-j19-1| Dietikon, Bahnhof| 8727| 1|\n", "+--------------------+----------+-------------+------------------+---------------+------------+\n", "only showing top 5 rows" ] } ], "source": [ "#drop trips not in zurich and during the day\n", "trips_zurich_day = trips_businessdays.join(trips_zurich, how=\"inner\", on=\"trip_id\").dropDuplicates()\n", "trips_zurich_day.show(5)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "trips_zurich_day.write.csv('data/lgpt_guys/trips_zurich_day.csv', header = True, mode=\"overwrite\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "trips_zurich_day = spark.read.csv('data/lgpt_guys/trips_zurich_day.csv', header = True)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "24386" ] } ], "source": [ "trips_zurich_day.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### stop_times.txt\n", "- drop day speficic trip \n", "- Drop trips that are trimmed in 2 by 15 km radius" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+------------+\n", "| trip_id|stop_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|stop_int| route_id|\n", "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+------------+\n", "|2680.TA.26-31-j19...|8588078| 14:28:00| 14:28:00| 16| 0| 0| 975| 26-31-j19-1|\n", "|253.TA.26-62-j19-...|8591118| 12:26:00| 12:26:00| 7| 0| 0| 1493| 26-62-j19-1|\n", "|71.TA.26-650-j19-...|8575937| 09:56:00| 09:56:00| 3| 0| 0| 642|26-650-j19-1|\n", "|37.TA.26-720-j19-...|8576133| 12:26:00| 12:26:00| 9| 0| 0| 672|26-720-j19-1|\n", "|68.TA.26-721-j19-...|8503155| 15:52:00| 15:52:00| 7| 0| 0| 271|26-721-j19-1|\n", "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+------------+\n", "only showing top 5 rows" ] } ], "source": [ "#drop day specific trips\n", "stop_times_w_routes = stop_times_zurich_day.join(trips_zurich_day.select(trips_zurich_day.route_id, trips_zurich_day.trip_id), \\\n", " how=\"inner\", on=\"trip_id\").dropDuplicates()\n", "stop_times_w_routes.show(5)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "unique_route_time = stop_times_w_routes.dropDuplicates(['route_id', 'arrival_time', 'departure_time', 'stop_sequence', 'stop_id', 'stop_int'])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Dropping routes if routes have been trimmed in 2" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#creating pandas dataframe to use shift function\n", "stop_times_route_pandas1 = unique_route_time.orderBy(\"trip_id\", \"stop_sequence\").toPandas()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_route_pandas = stop_times_route_pandas1" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_route_pandas[\"sequence_1\"] = stop_times_route_pandas[\"stop_sequence\"].shift(-1)\n" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ " stop_sequence sequence_1\n", "0 1 10\n", "1 10 11\n", "2 11 12\n", "3 12 13\n", "4 13 14" ] } ], "source": [ "stop_times_route_pandas[[\"stop_sequence\", \"sequence_1\"]].head(5)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_route_pandas[\"stop_sequence\"] = pd.to_numeric(stop_times_route_pandas[\"stop_sequence\"])\n", "stop_times_route_pandas[\"sequence_1\"] = pd.to_numeric(stop_times_route_pandas[\"sequence_1\"])" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_route_pandas[\"trip_1\"] = stop_times_route_pandas[\"trip_id\"].shift(-1)\n" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ " trip_id trip_1 stop_sequence sequence_1\n", "0 1.TA.1-231-j19-1.1.H 1.TA.1-231-j19-1.1.H 1 10.0\n", "1 1.TA.1-231-j19-1.1.H 1.TA.1-231-j19-1.1.H 10 11.0\n", "2 1.TA.1-231-j19-1.1.H 1.TA.1-231-j19-1.1.H 11 12.0\n", "3 1.TA.1-231-j19-1.1.H 1.TA.1-231-j19-1.1.H 12 13.0\n", "4 1.TA.1-231-j19-1.1.H 1.TA.1-231-j19-1.1.H 13 14.0" ] } ], "source": [ "stop_times_route_pandas[[\"trip_id\",\"trip_1\",\"stop_sequence\", \"sequence_1\"]].head(5)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "trip_id_drop = np.where(((stop_times_route_pandas[\"trip_id\"] == stop_times_route_pandas[\"trip_1\"]) &\\\n", " ((stop_times_route_pandas[\"stop_sequence\"]+1) < stop_times_route_pandas[\"sequence_1\"])),\\\n", " stop_times_route_pandas[\"trip_id\"] , None)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_route_pandas_clean = stop_times_route_pandas[~stop_times_route_pandas[\"trip_id\"].isin(trip_id_drop)]" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_route_pandas = stop_times_route_pandas.astype(str)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_route = spark.createDataFrame(stop_times_route_pandas)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+-----------+----------+--------------------+\n", "| trip_id|stop_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|stop_int| route_id|sequence_1| trip_1|\n", "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+-----------+----------+--------------------+\n", "|1.TA.1-231-j19-1.1.H|8572747| 09:37:00| 09:37:00| 1| 0| 0| 500|1-231-j19-1| 10.0|1.TA.1-231-j19-1.1.H|\n", "|1.TA.1-231-j19-1.1.H|8573721| 09:50:00| 09:50:00| 10| 0| 0| 599|1-231-j19-1| 11.0|1.TA.1-231-j19-1.1.H|\n", "|1.TA.1-231-j19-1.1.H|8503598| 09:53:00| 09:53:00| 11| 0| 0| 401|1-231-j19-1| 12.0|1.TA.1-231-j19-1.1.H|\n", "|1.TA.1-231-j19-1.1.H|8573720| 09:55:00| 09:59:00| 12| 0| 0| 598|1-231-j19-1| 13.0|1.TA.1-231-j19-1.1.H|\n", "|1.TA.1-231-j19-1.1.H|8503598| 10:00:00| 10:00:00| 13| 0| 0| 401|1-231-j19-1| 14.0|1.TA.1-231-j19-1.1.H|\n", "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+-----------+----------+--------------------+\n", "only showing top 5 rows" ] } ], "source": [ "stop_times_route.show(5)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_route.write.csv('data/lgpt_guys/stop_times_route.csv', header = True, mode=\"overwrite\")" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_route = spark.read.csv('data/lgpt_guys/stop_times_route.csv', header = True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Trips.txt\n", "- dropping trips who have been trimmed" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+\n", "| trip_id|\n", "+--------------------+\n", "|292.TA.26-304-j19...|\n", "|3.TA.26-23-j19-1.3.H|\n", "|302.TA.26-652-j19...|\n", "|304.TA.26-61-j19-...|\n", "|305.TA.26-912-j19...|\n", "+--------------------+\n", "only showing top 5 rows" ] } ], "source": [ "trips_zurich_day_clean = stop_times_route.select(\"trip_id\").dropDuplicates()\n", "trips_zurich_day_clean.show(5)\n" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+----------+-------------+--------------------+---------------+------------+\n", "| trip_id|service_id| route_id| trip_headsign|trip_short_name|direction_id|\n", "+--------------------+----------+-------------+--------------------+---------------+------------+\n", "|644.TA.26-67-j19-...| TA+b0003| 26-67-j19-1|Zürich, Dunkelhölzli| 4388| 1|\n", "|1223.TA.26-72-j19...| TA+b0003| 26-72-j19-1| Zürich, Milchbuck| 4912| 1|\n", "|337.TA.26-145-j19...| TA+b0fal| 26-145-j19-1| Thalwil, Zentrum| 3900| 1|\n", "|3717.TA.26-8-C-j1...| TA+b0fe2| 26-8-C-j19-1| Zürich, Klusplatz| 105| 0|\n", "|607.TA.26-75-A-j1...| TA+b0d7h|26-75-A-j19-1|Zürich, Seebacher...| 3298| 1|\n", "+--------------------+----------+-------------+--------------------+---------------+------------+\n", "only showing top 5 rows" ] } ], "source": [ "trips_zurich_day_clean_complete = trips_zurich_day.join(trips_zurich_day_clean, on=\"trip_id\", how=\"inner\" ).dropDuplicates()\n", "trips_zurich_day_clean_complete.show(5)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "22807" ] } ], "source": [ "trips_zurich_day_clean_complete.count()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "trips_zurich_day_clean_complete.write.csv('data/lgpt_guys/trips_zurich_day_clean_complete.csv', header = True, mode=\"overwrite\")" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "trips_zurich_day_clean_complete = spark.read.csv('data/lgpt_guys/trips_zurich_day_clean_complete.csv', header = True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### routes.txt\n", "- Drop routes not used during all business days and outside radius 15 km \n", "- drop routes trimmed\n", "- Give directionality to routes\n" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-----------+---------+----------------+---------------+----------+----------+\n", "| route_id|agency_id|route_short_name|route_long_name|route_desc|route_type|\n", "+-----------+---------+----------------+---------------+----------+----------+\n", "|11-40-j19-1| 801| 040| null| Bus| 700|\n", "|11-61-j19-1| 7031| 061| null| Bus| 700|\n", "|11-62-j19-1| 7031| 062| null| Bus| 700|\n", "|24-64-j19-1| 801| 064| null| Bus| 700|\n", "|11-83-j19-1| 801| 083| null| Bus| 700|\n", "+-----------+---------+----------------+---------------+----------+----------+\n", "only showing top 5 rows" ] } ], "source": [ "routes = spark.read.csv(\"/data/sbb/timetables/csv/routes/2019/05/14/routes.txt\", header=True, sep = \",\")\n", "routes.show(5)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+------------+------------+\n", "| route_id|direction_id|\n", "+------------+------------+\n", "| 26-69-j19-1| 0|\n", "|26-3-A-j19-1| 1|\n", "|26-301-j19-1| 0|\n", "|26-140-j19-1| 0|\n", "| 26-80-j19-1| 0|\n", "+------------+------------+\n", "only showing top 5 rows" ] } ], "source": [ "#select routes within 15 km, active on business days and not trimmed\n", "route_zurich_clean = trips_zurich_day_clean_complete.select(trips_zurich_day_clean_complete.route_id, trips_zurich_day_clean_complete.direction_id)\n", "route_zurich_clean.show(5)" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+------------+---------+----------------+---------------+----------+----------+------------+\n", "| route_id|agency_id|route_short_name|route_long_name|route_desc|route_type|direction_id|\n", "+------------+---------+----------------+---------------+----------+----------+------------+\n", "|26-6-A-j19-1| 11| 6| null| S-Bahn| 400| 1|\n", "|54-4-Y-j19-1| 11| 4-Y| null| ICE| 102| 0|\n", "|26-842-j19-1| 838| 842| null| Bus| 700| 0|\n", "|12-E03-j19-1| 767| E03| null| Bus| 700| 1|\n", "|26-912-j19-1| 849| 912| null| Bus| 700| 0|\n", "+------------+---------+----------------+---------------+----------+----------+------------+\n", "only showing top 5 rows" ] } ], "source": [ "#Drop routes not within 15km and active on business days\n", "routes_non_directional = routes.join(route_zurich_clean, how=\"inner\", on=\"route_id\").dropDuplicates()\n", "routes_non_directional.show(5)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-------------+------------+--------------------+---------+----------+----------------------------+---------------------------+\n", "| route_id|direction_id|route_id_directional|agency_id|route_type|route_short_name_directional|route_long_name_directional|\n", "+-------------+------------+--------------------+---------+----------+----------------------------+---------------------------+\n", "| 26-6-A-j19-1| 1| 26-6-A-j19-1-1| 11| 400| 6-1| null|\n", "| 54-4-Y-j19-1| 0| 54-4-Y-j19-1-0| 11| 102| 4-Y-0| null|\n", "| 26-842-j19-1| 0| 26-842-j19-1-0| 838| 700| 842-0| null|\n", "| 26-912-j19-1| 0| 26-912-j19-1-0| 849| 700| 912-0| null|\n", "|90-71-Y-j19-1| 0| 90-71-Y-j19-1-0| 180| 1000| 71-Y-0| null|\n", "+-------------+------------+--------------------+---------+----------+----------------------------+---------------------------+\n", "only showing top 5 rows" ] } ], "source": [ "#Give directionality to routes\n", "routes_directional = routes_non_directional.select(routes_non_directional.route_id, routes_non_directional.direction_id, concat(routes_non_directional.route_id, lit(\"-\"), routes_non_directional.direction_id).alias(\"route_id_directional\"),\\\n", " routes_non_directional.agency_id, routes_non_directional.route_type,\\\n", " concat(routes_non_directional.route_short_name, lit(\"-\"), routes_non_directional.direction_id).alias(\"route_short_name_directional\"), \\\n", " concat(routes_non_directional.route_long_name, lit(\"-\"), routes_non_directional.direction_id).alias(\"route_long_name_directional\"))\n", "routes_directional.show(5)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-------------+------------+--------------------+---------+----------+----------------------------+---------------------------+----------+\n", "| route_id|direction_id|route_id_directional|agency_id|route_type|route_short_name_directional|route_long_name_directional| route_int|\n", "+-------------+------------+--------------------+---------+----------+----------------------------+---------------------------+----------+\n", "| 26-6-A-j19-1| 1| 26-6-A-j19-1-1| 11| 400| 6-1| null| 0|\n", "| 54-4-Y-j19-1| 0| 54-4-Y-j19-1-0| 11| 102| 4-Y-0| null| 1|\n", "| 26-842-j19-1| 0| 26-842-j19-1-0| 838| 700| 842-0| null|8589934592|\n", "| 26-912-j19-1| 0| 26-912-j19-1-0| 849| 700| 912-0| null|8589934593|\n", "|90-71-Y-j19-1| 0| 90-71-Y-j19-1-0| 180| 1000| 71-Y-0| null|8589934594|\n", "+-------------+------------+--------------------+---------+----------+----------------------------+---------------------------+----------+\n", "only showing top 5 rows" ] } ], "source": [ "routes_curated = routes_directional.withColumn(\"route_int\", monotonically_increasing_id() )\n", "routes_curated.orderBy(\"route_int\").show(5)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "routes_curated.write.csv('data/lgpt_guys/routes_curated.csv', header = True, mode=\"overwrite\")" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "routes_curated = spark.read.csv('data/lgpt_guys/routes_curated.csv', header = True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### trip.txt\n", "- add route_int" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+------------+------------+--------------------+----------+--------------------+---------------+-------------+\n", "| route_id|direction_id| trip_id|service_id| trip_headsign|trip_short_name| route_int|\n", "+------------+------------+--------------------+----------+--------------------+---------------+-------------+\n", "| 26-69-j19-1| 0|1201.TA.26-69-j19...| TA+b0003| Zürich, Milchbuck| 1255|1451698946048|\n", "|26-3-A-j19-1| 1|1513.TA.26-3-A-j1...| TA+b0j3n| Zürich, Albisrieden| 3206| 386547056640|\n", "|26-301-j19-1| 0|246.TA.26-301-j19...| TA+b0003| Dietikon, Bahnhof| 3608|1477468749824|\n", "|26-140-j19-1| 0|988.TA.26-140-j19...| TA+b0nfk|Langnau am Albis,...| 542|1176821039104|\n", "| 26-80-j19-1| 0|529.TA.26-80-j19-...| TA+b0003|Zürich, Triemlisp...| 637|1554778161152|\n", "+------------+------------+--------------------+----------+--------------------+---------------+-------------+\n", "only showing top 5 rows" ] } ], "source": [ "trips_curated = trips_zurich_day_clean_complete.join(routes_curated.select(\"route_id\", \"direction_id\", \"route_int\"), how=\"left\", on=[\"route_id\", \"direction_id\"])\n", "trips_curated.show(5)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "trips_curated.write.csv('data/lgpt_guys/trips_curated.csv', header = True, mode=\"overwrite\")" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "trips_curated = spark.read.csv('data/lgpt_guys/trips_curated.csv', header = True)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "22807" ] } ], "source": [ "trips_curated.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Stop_times.txt\n", "- add route_int" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+-----------+----------+--------------------+------------+\n", "| trip_id|stop_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|stop_int| route_id|sequence_1| trip_1| route_int|\n", "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+-----------+----------+--------------------+------------+\n", "|1.TA.1-231-j19-1.1.H|8572747| 09:37:00| 09:37:00| 1| 0| 0| 500|1-231-j19-1| 10.0|1.TA.1-231-j19-1.1.H|592705486850|\n", "|1.TA.1-231-j19-1.1.H|8573721| 09:50:00| 09:50:00| 10| 0| 0| 599|1-231-j19-1| 11.0|1.TA.1-231-j19-1.1.H|592705486850|\n", "|1.TA.1-231-j19-1.1.H|8503598| 09:53:00| 09:53:00| 11| 0| 0| 401|1-231-j19-1| 12.0|1.TA.1-231-j19-1.1.H|592705486850|\n", "|1.TA.1-231-j19-1.1.H|8573720| 09:55:00| 09:59:00| 12| 0| 0| 598|1-231-j19-1| 13.0|1.TA.1-231-j19-1.1.H|592705486850|\n", "|1.TA.1-231-j19-1.1.H|8503598| 10:00:00| 10:00:00| 13| 0| 0| 401|1-231-j19-1| 14.0|1.TA.1-231-j19-1.1.H|592705486850|\n", "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+-----------+----------+--------------------+------------+\n", "only showing top 5 rows" ] } ], "source": [ "stop_times_curated = stop_times_route.join(trips_curated.select(\"trip_id\", \"route_int\"), how=\"left\", on=\"trip_id\")\n", "stop_times_curated.show(5)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_curated.write.csv('data/lgpt_guys/stop_times_curated.csv', header = True, mode=\"overwrite\")" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_curated = spark.read.csv('data/lgpt_guys/stop_times_curated.csv', header = True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read preprocessed files" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Transfer time walking distance between zurich stops" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-------+--------+-----------+---------+-----------------+\n", "|stop_id|stop_int| stop_id2|stop_int2|Transfer_time_sec|\n", "+-------+--------+-----------+---------+-----------------+\n", "|8500926| 0| 8590616| 1166| 146|\n", "|8500926| 0| 8590737| 1270| 360|\n", "|8502186| 1|8502186:0:1| 2| 8|\n", "|8502186| 1|8502186:0:2| 3| 16|\n", "|8502186| 1| 8502186P| 4| 0|\n", "+-------+--------+-----------+---------+-----------------+\n", "only showing top 5 rows" ] } ], "source": [ "transfer = spark.read.csv('data/lgpt_guys/transfers.csv', header=True)\n", "transfer.show(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Stops within a 15 km radius of Zurich" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-----------+--------------------+----------------+----------------+-------------+--------------+--------+\n", "| stop_id| stop_name| stop_lat| stop_lon|location_type|parent_station|stop_int|\n", "+-----------+--------------------+----------------+----------------+-------------+--------------+--------+\n", "| 8500926|Oetwil a.d.L., Sc...|47.4236270123012| 8.4031825286317| null| null| 0|\n", "| 8502186|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| null| 8502186P| 1|\n", "|8502186:0:1|Dietikon Stoffelbach|47.3934666445388|8.39894248049007| null| 8502186P| 2|\n", "|8502186:0:2|Dietikon Stoffelbach|47.3935274568464|8.39894248049007| null| 8502186P| 3|\n", "| 8502186P|Dietikon Stoffelbach|47.3934058321612|8.39894248049007| 1| null| 4|\n", "+-----------+--------------------+----------------+----------------+-------------+--------------+--------+\n", "only showing top 5 rows" ] } ], "source": [ "stops_15km = spark.read.csv('data/lgpt_guys/stops_15km.csv', header = True)\n", "stops_15km.show(5)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "1883" ] } ], "source": [ "stops_15km.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Trips during business days and in 15 km radius of zurich" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+------------+------------+--------------------+----------+--------------------+---------------+-------------+\n", "| route_id|direction_id| trip_id|service_id| trip_headsign|trip_short_name| route_int|\n", "+------------+------------+--------------------+----------+--------------------+---------------+-------------+\n", "| 26-69-j19-1| 0|1201.TA.26-69-j19...| TA+b0003| Zürich, Milchbuck| 1255|1451698946048|\n", "|26-3-A-j19-1| 1|1513.TA.26-3-A-j1...| TA+b0j3n| Zürich, Albisrieden| 3206| 386547056640|\n", "|26-301-j19-1| 0|246.TA.26-301-j19...| TA+b0003| Dietikon, Bahnhof| 3608|1477468749824|\n", "|26-140-j19-1| 0|988.TA.26-140-j19...| TA+b0nfk|Langnau am Albis,...| 542|1176821039104|\n", "| 26-80-j19-1| 0|529.TA.26-80-j19-...| TA+b0003|Zürich, Triemlisp...| 637|1554778161152|\n", "+------------+------------+--------------------+----------+--------------------+---------------+-------------+\n", "only showing top 5 rows" ] } ], "source": [ "trips_curated = spark.read.csv('data/lgpt_guys/trips_curated.csv', header = True)\n", "trips_curated.show(5)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "22807" ] } ], "source": [ "trips_curated.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Stop_times of stops inside 15 km and with trips on business days" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+-----------+----------+--------------------+------------+\n", "| trip_id|stop_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|stop_int| route_id|sequence_1| trip_1| route_int|\n", "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+-----------+----------+--------------------+------------+\n", "|1.TA.1-231-j19-1.1.H|8572747| 09:37:00| 09:37:00| 1| 0| 0| 500|1-231-j19-1| 10.0|1.TA.1-231-j19-1.1.H|592705486850|\n", "|1.TA.1-231-j19-1.1.H|8573721| 09:50:00| 09:50:00| 10| 0| 0| 599|1-231-j19-1| 11.0|1.TA.1-231-j19-1.1.H|592705486850|\n", "|1.TA.1-231-j19-1.1.H|8503598| 09:53:00| 09:53:00| 11| 0| 0| 401|1-231-j19-1| 12.0|1.TA.1-231-j19-1.1.H|592705486850|\n", "|1.TA.1-231-j19-1.1.H|8573720| 09:55:00| 09:59:00| 12| 0| 0| 598|1-231-j19-1| 13.0|1.TA.1-231-j19-1.1.H|592705486850|\n", "|1.TA.1-231-j19-1.1.H|8503598| 10:00:00| 10:00:00| 13| 0| 0| 401|1-231-j19-1| 14.0|1.TA.1-231-j19-1.1.H|592705486850|\n", "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+-----------+----------+--------------------+------------+\n", "only showing top 5 rows" ] } ], "source": [ "stop_times_curated = spark.read.csv('data/lgpt_guys/stop_times_curated.csv', header = True)\n", "stop_times_curated.show(5)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- trip_id: string (nullable = true)\n", " |-- stop_id: string (nullable = true)\n", " |-- arrival_time: string (nullable = true)\n", " |-- departure_time: string (nullable = true)\n", " |-- stop_sequence: string (nullable = true)\n", " |-- pickup_type: string (nullable = true)\n", " |-- drop_off_type: string (nullable = true)\n", " |-- stop_int: string (nullable = true)\n", " |-- route_id: string (nullable = true)\n", " |-- sequence_1: string (nullable = true)\n", " |-- trip_1: string (nullable = true)\n", " |-- route_int: string (nullable = true)" ] } ], "source": [ "stop_times_curated.printSchema()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "247920" ] } ], "source": [ "stop_times_curated.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Routes inside 15 km and active during business days" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+------------+------------+--------------------+---------+----------+----------------------------+---------------------------+-------------+\n", "| route_id|direction_id|route_id_directional|agency_id|route_type|route_short_name_directional|route_long_name_directional| route_int|\n", "+------------+------------+--------------------+---------+----------+----------------------------+---------------------------+-------------+\n", "|26-33E-j19-1| 0| 26-33E-j19-1-0| 849| 700| 33E-0| null|1657857376256|\n", "|26-734-j19-1| 1| 26-734-j19-1-1| 773| 700| 734-1| null|1657857376257|\n", "|26-132-j19-1| 0| 26-132-j19-1-0| 807| 700| 132-0| null|1657857376258|\n", "| 26-91-j19-1| 0| 26-91-j19-1-0| 849| 700| 91-0| null|1657857376259|\n", "|26-142-j19-1| 1| 26-142-j19-1-1| 807| 700| 142-1| null|1657857376260|\n", "+------------+------------+--------------------+---------+----------+----------------------------+---------------------------+-------------+\n", "only showing top 5 rows" ] } ], "source": [ "routes_curated = spark.read.csv('data/lgpt_guys/routes_curated.csv', header = True)\n", "routes_curated.show(5)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "507" ] } ], "source": [ "routes_curated.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Raptor specific preprocessing" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "StopTimes: [[departure_route0_trip0_stop0, arrival_route0_trip0_stop_0], [departure_route0_trip0_stop1, arrival_route0_trip0_stop_1], …], [[departure_route0_trip1_stop0, arrival_route0_trip1_stop_0], …], ….], [[[departure_route1_trip0_stop0, arrival_route1_trip0_stop_0], …], [[departure_route1_trip1_stop0, arrival_route0_trip1_stop_0], …], ….], …]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_curated = stop_times_curated.withColumn(\"arrival_time_stamp\", to_timestamp(stop_times_curated.arrival_time, 'HH:mm:ss')).withColumn(\"departure_time_stamp\", to_timestamp(stop_times_curated.departure_time, 'HH:mm:ss'))" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+-----------+----------+--------------------+------------+-------------------+--------------------+\n", "| trip_id|stop_id|arrival_time|departure_time|stop_sequence|pickup_type|drop_off_type|stop_int| route_id|sequence_1| trip_1| route_int| arrival_time_stamp|departure_time_stamp|\n", "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+-----------+----------+--------------------+------------+-------------------+--------------------+\n", "|100.TA.1-1-E-j19-...|8578679| 08:49:00| 08:49:00| 1| 0| 0| 791|1-1-E-j19-1| 2.0|100.TA.1-1-E-j19-...|884763262977|1970-01-01 08:49:00| 1970-01-01 08:49:00|\n", "|100.TA.1-1-E-j19-...|8590314| 08:50:00| 08:50:00| 2| 0| 0| 1036|1-1-E-j19-1| 3.0|100.TA.1-1-E-j19-...|884763262977|1970-01-01 08:50:00| 1970-01-01 08:50:00|\n", "|100.TA.1-1-E-j19-...|8590317| 08:51:00| 08:51:00| 3| 0| 0| 1037|1-1-E-j19-1| 19.0|100.TA.1-321-j19-...|884763262977|1970-01-01 08:51:00| 1970-01-01 08:51:00|\n", "|101.TA.1-1-E-j19-...|8578679| 09:04:00| 09:04:00| 1| 0| 0| 791|1-1-E-j19-1| 2.0|101.TA.1-1-E-j19-...|884763262977|1970-01-01 09:04:00| 1970-01-01 09:04:00|\n", "|101.TA.1-1-E-j19-...|8590314| 09:05:00| 09:05:00| 2| 0| 0| 1036|1-1-E-j19-1| 3.0|101.TA.1-1-E-j19-...|884763262977|1970-01-01 09:05:00| 1970-01-01 09:05:00|\n", "+--------------------+-------+------------+--------------+-------------+-----------+-------------+--------+-----------+----------+--------------------+------------+-------------------+--------------------+\n", "only showing top 5 rows" ] } ], "source": [ "stop_times_ordered = stop_times_curated.orderBy(\"route_id\", \"trip_id\", \"stop_sequence\")\n", "stop_times_ordered.show(5)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_ordered.write.csv('data/lgpt_guys/stop_times_ordered_like_array.csv', header = True, mode=\"overwrite\")" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_ordered = stop_times_ordered.toPandas()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_ordered_pandas = stop_times_ordered" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ " arrival_time arrival_time_stamp\n", "0 08:49:00 1970-01-01 08:49:00\n", "1 08:50:00 1970-01-01 08:50:00\n", "2 08:51:00 1970-01-01 08:51:00\n", "3 09:04:00 1970-01-01 09:04:00\n", "4 09:05:00 1970-01-01 09:05:00" ] } ], "source": [ "stop_times_ordered_pandas[[\"arrival_time\", \"arrival_time_stamp\"]].head(5)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_ordered_pandas['arrival_time'] = pd.to_datetime(stop_times_ordered_pandas['arrival_time'])\n", "stop_times_ordered_pandas['departure_time'] = pd.to_datetime(stop_times_ordered_pandas['departure_time'])" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ " arrival_time arrival_time_stamp\n", "0 2020-05-19 08:49:00 1970-01-01 08:49:00\n", "1 2020-05-19 08:50:00 1970-01-01 08:50:00\n", "2 2020-05-19 08:51:00 1970-01-01 08:51:00\n", "3 2020-05-19 09:04:00 1970-01-01 09:04:00\n", "4 2020-05-19 09:05:00 1970-01-01 09:05:00" ] } ], "source": [ "stop_times_ordered_pandas[[\"arrival_time\", \"arrival_time_stamp\"]].head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Adding None to first arrival time and last departure time" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_ordered_pandas['departure_time'] = np.where((stop_times_ordered_pandas[\"stop_sequence\"] > stop_times_ordered_pandas[\"sequence_1\"]), None, stop_times_ordered_pandas['departure_time'])" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_ordered_pandas[\"arrival_time\"] = np.where((stop_times_ordered_pandas[\"stop_sequence\"] == 1), None, stop_times_ordered_pandas['arrival_time'])" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ " arrival_time departure_time stop_sequence sequence_1\n", "0 1589878140000000000 1589878140000000000 1 2.0\n", "1 1589878200000000000 1589878200000000000 2 3.0\n", "2 1589878260000000000 None 3 19.0\n", "3 1589879040000000000 1589879040000000000 1 2.0\n", "4 1589879100000000000 1589879100000000000 2 3.0" ] } ], "source": [ "stop_times_ordered_pandas[[\"arrival_time\",\"departure_time\", \"stop_sequence\", \"sequence_1\"]].head(5)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "stop_times_ordered_pandas['arrival_time'] = pd.to_datetime(stop_times_ordered_pandas['arrival_time'])\n", "stop_times_ordered_pandas['departure_time'] = pd.to_datetime(stop_times_ordered_pandas['departure_time'])" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ " arrival_time arrival_time_stamp\n", "0 2020-05-19 08:49:00 1970-01-01 08:49:00\n", "1 2020-05-19 08:50:00 1970-01-01 08:50:00\n", "2 2020-05-19 08:51:00 1970-01-01 08:51:00\n", "3 2020-05-19 09:04:00 1970-01-01 09:04:00\n", "4 2020-05-19 09:05:00 1970-01-01 09:05:00" ] } ], "source": [ "stop_times_ordered_pandas[[\"arrival_time\", \"arrival_time_stamp\"]].head(5)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 247920 entries, 0 to 247919\n", "Data columns (total 14 columns):\n", "trip_id 247920 non-null object\n", "stop_id 247920 non-null object\n", "arrival_time 247920 non-null datetime64[ns]\n", "departure_time 226565 non-null datetime64[ns]\n", "stop_sequence 247920 non-null object\n", "pickup_type 247920 non-null object\n", "drop_off_type 247920 non-null object\n", "stop_int 247920 non-null object\n", "route_id 247920 non-null object\n", "sequence_1 247920 non-null object\n", "trip_1 247920 non-null object\n", "route_int 247920 non-null object\n", "arrival_time_stamp 247920 non-null datetime64[ns]\n", "departure_time_stamp 247920 non-null datetime64[ns]\n", "dtypes: datetime64[ns](4), object(10)\n", "memory usage: 26.5+ MB" ] } ], "source": [ "stop_times_ordered.info()" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "array([['2020-05-19T08:49:00.000000000', '2020-05-19T08:49:00.000000000'],\n", " ['2020-05-19T08:50:00.000000000', '2020-05-19T08:50:00.000000000'],\n", " ['2020-05-19T08:51:00.000000000', 'NaT'],\n", " ...,\n", " ['2020-05-19T16:07:00.000000000', '2020-05-19T16:07:00.000000000'],\n", " ['2020-05-19T16:11:00.000000000', '2020-05-19T16:15:00.000000000'],\n", " ['2020-05-19T16:19:00.000000000', 'NaT']],\n", " dtype='datetime64[ns]')" ] } ], "source": [ "stop_times_array = stop_times_ordered_pandas[[\"arrival_time\", \"departure_time\"]].to_numpy()\n", "stop_times_array" ] }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "An error was encountered:\n", - "An error occurred while calling o257.saveAsTextFile.\n", - ": org.apache.hadoop.mapred.FileAlreadyExistsException: Output directory hdfs://iccluster044.iccluster.epfl.ch:8020/user/ebouille/data/lgpt_guys/stop_times_array_version3.txt already exists\n", - "\tat org.apache.hadoop.mapred.FileOutputFormat.checkOutputSpecs(FileOutputFormat.java:131)\n", - "\tat org.apache.spark.internal.io.HadoopMapRedWriteConfigUtil.assertConf(SparkHadoopWriter.scala:283)\n", - "\tat org.apache.spark.internal.io.SparkHadoopWriter$.write(SparkHadoopWriter.scala:71)\n", - "\tat org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1096)\n", - "\tat org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1094)\n", - "\tat org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1094)\n", - "\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n", - "\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\n", - "\tat org.apache.spark.rdd.RDD.withScope(RDD.scala:363)\n", - "\tat org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1094)\n", - "\tat org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:1067)\n", - "\tat org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1032)\n", - "\tat org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1032)\n", - "\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n", - "\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\n", - "\tat org.apache.spark.rdd.RDD.withScope(RDD.scala:363)\n", - "\tat org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1032)\n", - "\tat org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:958)\n", - "\tat org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:958)\n", - "\tat org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:958)\n", - "\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n", - "\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\n", - "\tat org.apache.spark.rdd.RDD.withScope(RDD.scala:363)\n", - "\tat org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:957)\n", - "\tat org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1493)\n", - "\tat org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1472)\n", - "\tat org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1472)\n", - "\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n", - "\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\n", - "\tat org.apache.spark.rdd.RDD.withScope(RDD.scala:363)\n", - "\tat org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1472)\n", - "\tat org.apache.spark.api.java.JavaRDDLike$class.saveAsTextFile(JavaRDDLike.scala:550)\n", - "\tat org.apache.spark.api.java.AbstractJavaRDDLike.saveAsTextFile(JavaRDDLike.scala:45)\n", - "\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n", - "\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n", - "\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n", - "\tat java.lang.reflect.Method.invoke(Method.java:498)\n", - "\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n", - "\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n", - "\tat py4j.Gateway.invoke(Gateway.java:282)\n", - "\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n", - "\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n", - "\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\n", - "\tat java.lang.Thread.run(Thread.java:745)\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/hdata/sdc/hadoop/yarn/local/usercache/ebouille/appcache/application_1589299642358_1160/container_e06_1589299642358_1160_01_000001/pyspark.zip/pyspark/rdd.py\", line 1585, in saveAsTextFile\n", - " keyed._jrdd.map(self.ctx._jvm.BytesToString()).saveAsTextFile(path)\n", - " File \"/hdata/sdc/hadoop/yarn/local/usercache/ebouille/appcache/application_1589299642358_1160/container_e06_1589299642358_1160_01_000001/py4j-0.10.7-src.zip/py4j/java_gateway.py\", line 1257, in __call__\n", - " answer, self.gateway_client, self.target_id, self.name)\n", - " File \"/hdata/sdc/hadoop/yarn/local/usercache/ebouille/appcache/application_1589299642358_1160/container_e06_1589299642358_1160_01_000001/pyspark.zip/pyspark/sql/utils.py\", line 63, in deco\n", - " return f(*a, **kw)\n", - " File \"/hdata/sdc/hadoop/yarn/local/usercache/ebouille/appcache/application_1589299642358_1160/container_e06_1589299642358_1160_01_000001/py4j-0.10.7-src.zip/py4j/protocol.py\", line 328, in get_return_value\n", - " format(target_id, \".\", name), value)\n", - "Py4JJavaError: An error occurred while calling o257.saveAsTextFile.\n", - ": org.apache.hadoop.mapred.FileAlreadyExistsException: Output directory hdfs://iccluster044.iccluster.epfl.ch:8020/user/ebouille/data/lgpt_guys/stop_times_array_version3.txt already exists\n", - "\tat org.apache.hadoop.mapred.FileOutputFormat.checkOutputSpecs(FileOutputFormat.java:131)\n", - "\tat org.apache.spark.internal.io.HadoopMapRedWriteConfigUtil.assertConf(SparkHadoopWriter.scala:283)\n", - "\tat org.apache.spark.internal.io.SparkHadoopWriter$.write(SparkHadoopWriter.scala:71)\n", - "\tat org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1096)\n", - "\tat org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1094)\n", - "\tat org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1094)\n", - "\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n", - "\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\n", - "\tat org.apache.spark.rdd.RDD.withScope(RDD.scala:363)\n", - "\tat org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1094)\n", - "\tat org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:1067)\n", - "\tat org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1032)\n", - "\tat org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1032)\n", - "\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n", - "\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\n", - "\tat org.apache.spark.rdd.RDD.withScope(RDD.scala:363)\n", - "\tat org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1032)\n", - "\tat org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:958)\n", - "\tat org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:958)\n", - "\tat org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:958)\n", - "\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n", - "\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\n", - "\tat org.apache.spark.rdd.RDD.withScope(RDD.scala:363)\n", - "\tat org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:957)\n", - "\tat org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1493)\n", - "\tat org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1472)\n", - "\tat org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1472)\n", - "\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n", - "\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\n", - "\tat org.apache.spark.rdd.RDD.withScope(RDD.scala:363)\n", - "\tat org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1472)\n", - "\tat org.apache.spark.api.java.JavaRDDLike$class.saveAsTextFile(JavaRDDLike.scala:550)\n", - "\tat org.apache.spark.api.java.AbstractJavaRDDLike.saveAsTextFile(JavaRDDLike.scala:45)\n", - "\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n", - "\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n", - "\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n", - "\tat java.lang.reflect.Method.invoke(Method.java:498)\n", - "\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n", - "\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n", - "\tat py4j.Gateway.invoke(Gateway.java:282)\n", - "\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n", - "\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n", - "\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\n", - "\tat java.lang.Thread.run(Thread.java:745)\n", - "\n", - "\n" + "247920" ] } ], + "source": [ + "np.size(stop_times_array,0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "sc.parallelize(stop_times_array).saveAsTextFile('data/lgpt_guys/stop_times_array_version3.txt')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Routes: [[route0_nr. Stops, route0_nr.Trips, route0_pointerStops, route0_pointerTrip],[ route1_nr. Stops, route1_nr. Trips, route1_pointerStops, route1_pointerTrip],…]" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+------------+-------+-------+\n", "| route_id|n_Trips|n_stops|\n", "+------------+-------+-------+\n", "| 1-1-E-j19-1| 140| 3|\n", "| 1-10-j19-1| 37| 4|\n", "|1-11-B-j19-1| 48| 5|\n", "| 1-16-j19-1| 24| 8|\n", "|1-17-A-j19-1| 106| 19|\n", "+------------+-------+-------+\n", "only showing top 5 rows" ] } ], "source": [ "distinct_trips_stops = stop_times_curated.groupBy(\"route_id\").agg(countDistinct(\"trip_id\").alias(\"n_Trips\"), countDistinct(\"stop_id\").alias(\"n_stops\")).orderBy(\"route_id\")\n", "distinct_trips_stops.show(5)" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "distinct_trips_stops_pandas = distinct_trips_stops.toPandas()" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ " route_id n_Trips n_stops Cumsum_trips Cumsum_stops\n", "0 1-1-E-j19-1 140 3 140 3\n", "1 1-10-j19-1 37 4 177 7\n", "2 1-11-B-j19-1 48 5 225 12\n", "3 1-16-j19-1 24 8 249 20\n", "4 1-17-A-j19-1 106 19 355 39" ] } ], "source": [ "distinct_trips_stops_pandas['Cumsum_trips'] = distinct_trips_stops_pandas.n_Trips.cumsum()\n", "distinct_trips_stops_pandas['Cumsum_stops'] = distinct_trips_stops_pandas.n_stops.cumsum()\n", "distinct_trips_stops_pandas.head(5)" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "array([[ 140, 3, 140, 3],\n", " [ 37, 4, 177, 7],\n", " [ 48, 5, 225, 12],\n", " ...,\n", " [ 84, 2, 22709, 3275],\n", " [ 85, 2, 22794, 3277],\n", " [ 13, 5, 22807, 3282]])" ] } ], "source": [ "routes_array = distinct_trips_stops_pandas[['n_Trips', 'n_stops', 'Cumsum_trips', 'Cumsum_stops']].to_numpy()\n", "routes_array" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "279" + ] } ], + "source": [ + "np.size(routes_array, 0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "sc.parallelize(routes_array).saveAsTextFile('data/lgpt_guys/routes_array_version3.txt')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "RouteStops: [route0_stop0, route0_stop1,…, route1_stop0, route1_stop1,…, …]\n" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-------------+--------+\n", "| route_int|stop_int|\n", "+-------------+--------+\n", - "| 523986010115| 1015|\n", - "| 25769803776| 1678|\n", + "|1460288880640| 783|\n", "| 25769803776| 476|\n", - "|1640677507072| 1482|\n", - "|1348619730946| 706|\n", + "|1176821039104| 510|\n", + "|1297080123392| 1094|\n", + "|1297080123392| 1817|\n", "+-------------+--------+\n", "only showing top 5 rows" ] } ], "source": [ "route_stops = stop_times_curated.orderBy(\"route_id\", \"stop_sequence\").select('route_int', 'stop_int').dropDuplicates()\n", "route_stops.show(5)" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "array([[1015],\n", - " [1035],\n", - " [ 476],\n", + "array([[1678],\n", + " [ 510],\n", + " [1482],\n", " ...,\n", - " [ 723],\n", + " [1551],\n", " [ 718],\n", - " [ 916]])" + " [ 963]])" ] } ], "source": [ "route_stops_array = np.array(route_stops.select(route_stops.stop_int.cast(\"integer\")).collect())\n", "route_stops_array" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 48, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "5545" + ] + } + ], + "source": [ + "np.size(route_stops_array, 0)" + ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "sc.parallelize(route_stops_array).saveAsTextFile('data/lgpt_guys/route_stops_array_version3.txt')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Stops: [[stop0_pointerRoutes, stop0_pointerTransfer], [stop1_pointerRoutes, stop1_pointerTransfer], …]" ] }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------+-------------+-------+--------+---------+-----------------+\n", "|stop_int| route_int|stop_id|stop_id2|stop_int2|Transfer_time_sec|\n", "+--------+-------------+-------+--------+---------+-----------------+\n", - "| 783|1460288880640|8576480| 8576498| 785| 569|\n", - "| 783|1460288880640|8576480| 8576496| 784| 304|\n", - "| 783|1460288880640|8576480| 8576164| 685| 431|\n", - "| 1454|1245540515840|8591074| 8591398| 1757| 524|\n", - "| 1454|1245540515840|8591074| 8591374| 1734| 431|\n", + "| 1015| 523986010115|8590204| 8590203| 1014| 277|\n", + "| 1015| 523986010115|8590204| 8590201| 1012| 296|\n", + "| 1015| 523986010115|8590204| 8590200| 1011| 167|\n", + "| 1482|1640677507072|8591105| 8591384| 1744| 495|\n", + "| 1482|1640677507072|8591105| 8591299| 1665| 367|\n", "+--------+-------------+-------+--------+---------+-----------------+\n", "only showing top 5 rows" ] } ], "source": [ "stops_join = route_stops.join(transfer, how=\"inner\", on=\"stop_int\").dropDuplicates()\n", "stops_join.show(5)" ] }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-----------+--------+--------+-----------+\n", "| stop_id|stop_int|n_Routes|n_Transfers|\n", "+-----------+--------+--------+-----------+\n", "| 8500926| 0| 1| 2|\n", "|8502186:0:1| 2| 1| 7|\n", "|8502186:0:2| 3| 1| 7|\n", "|8502187:0:1| 6| 1| 3|\n", "|8502187:0:2| 7| 1| 3|\n", "+-----------+--------+--------+-----------+\n", "only showing top 5 rows" ] } ], "source": [ "distinct_route_transfers = stops_join.groupBy(\"stop_id\", \"stop_int\").agg(countDistinct(\"route_int\").alias(\"n_Routes\"), countDistinct(\"stop_int2\").alias(\"n_Transfers\")).orderBy(\"stop_id\")\n", "distinct_route_transfers.show(5)" ] }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "distinct_route_transfers_pandas = distinct_route_transfers.toPandas()" ] }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ " stop_id stop_int n_Routes n_Transfers Cumsum_routes Cumsum_transfers\n", "0 8500926 0 1 2 1 2\n", "1 8502186:0:1 2 1 7 2 9\n", "2 8502186:0:2 3 1 7 3 16\n", "3 8502187:0:1 6 1 3 4 19\n", "4 8502187:0:2 7 1 3 5 22" ] } ], "source": [ "distinct_route_transfers_pandas['Cumsum_routes'] = distinct_route_transfers_pandas.n_Routes.cumsum()\n", "distinct_route_transfers_pandas['Cumsum_transfers'] = distinct_route_transfers_pandas.n_Transfers.cumsum()\n", "distinct_route_transfers_pandas.head(5)" ] }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "array([[ 1, 2],\n", " [ 2, 9],\n", " [ 3, 16],\n", " ...,\n", " [ 5387, 10654],\n", " [ 5389, 10669],\n", " [ 5398, 10702]])" ] } ], "source": [ "stops_array = distinct_route_transfers_pandas[['Cumsum_routes', 'Cumsum_transfers']].to_numpy()\n", "stops_array" ] }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1512" + ] + } + ], + "source": [ + "np.size(stops_array, 0)" + ] + }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "sc.parallelize(stops_array).saveAsTextFile('data/lgpt_guys/stops_array_version3.txt')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "StopRoutes: [stop0_route1, stop0_route3, stop1_route1, stop2_route1, stop1_route4, …]" ] }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "array([[1460288880641],\n", " [ 180388626432],\n", " [ 481036337153],\n", " ...,\n", " [ 317827579907],\n", - " [ 317827579906],\n", - " [ 618475290627]])" + " [ 618475290627],\n", + " [ 317827579906]])" ] } ], "source": [ "stop_routes_array = np.array(route_stops.orderBy(\"stop_id\", \"route_id\").select(route_stops.route_int).collect()).astype(int)\n", "stop_routes_array" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "sc.parallelize(stop_routes_array).saveAsTextFile('data/lgpt_guys/stop_routes_array_version4.txt')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Transfer: [[[stop0_nameTargetStop1, transferTime1], [stop0_nameTargetStop2, transferTime2],….], [stop1_nameTargetStop1, transferTime1], [stop1_nameTargetStop2, transferTime2],….],…]" ] }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- stop_id: string (nullable = true)\n", " |-- stop_int: string (nullable = true)\n", " |-- stop_id2: string (nullable = true)\n", " |-- stop_int2: string (nullable = true)\n", " |-- Transfer_time_sec: string (nullable = true)" ] } ], "source": [ "transfer.printSchema()" ] }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "array([[1166, 146],\n", " [1270, 360],\n", " [ 2, 8],\n", " ...,\n", " [1650, 255],\n", " [1728, 161],\n", " [1739, 519]])" ] } ], "source": [ "transfer_array = np.array(transfer.orderBy(\"stop_id\").select(transfer.stop_int2.cast(\"integer\"), transfer.Transfer_time_sec.cast(\"integer\")).collect())\n", "transfer_array" ] }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "12564" + ] + } + ], + "source": [ + "np.size(transfer_array, 0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "sc.parallelize(transfer_array).saveAsTextFile('data/lgpt_guys/transfer_array_version3.txt')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Open question\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "PySpark", "language": "", "name": "pysparkkernel" }, "language_info": { "codemirror_mode": { "name": "python", "version": 3 }, "mimetype": "text/x-python", "name": "pyspark", "pygments_lexer": "python3" } }, "nbformat": 4, "nbformat_minor": 4 } diff --git a/notebooks/transfer_to_local.ipynb b/notebooks/transfer_to_local.ipynb index 8815f93..f41d412 100644 --- a/notebooks/transfer_to_local.ipynb +++ b/notebooks/transfer_to_local.ipynb @@ -1,244 +1,247 @@ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## transfer files from HDFS to local\n", "\n", "
Any application without a proper name would be promptly killed.
" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "Current session configs: {'conf': {'spark.app.name': 'lgptguys_final'}, 'kind': 'pyspark'}
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", - "
IDYARN Application IDKindStateSpark UIDriver logCurrent session?
6512application_1589299642358_1001pysparkidleLinkLink
6531application_1589299642358_1020pysparkidleLinkLink
6532application_1589299642358_1021pysparkbusyLinkLink
6543application_1589299642358_1032pysparkidleLinkLink
6545application_1589299642358_1034pysparkidleLinkLink
6550application_1589299642358_1039pysparkidleLinkLink
6552application_1589299642358_1041pysparkidleLinkLink
6555application_1589299642358_1044pysparkidleLinkLink
6558application_1589299642358_1047pysparkidleLinkLink
6560application_1589299642358_1049pysparkidleLinkLink
6561application_1589299642358_1050pysparkidleLinkLink
6562application_1589299642358_1051pysparkbusyLinkLink
6563application_1589299642358_1052pysparkidleLinkLink
6564application_1589299642358_1053pysparkidleLinkLink
6567application_1589299642358_1056pysparkidleLinkLink
6568application_1589299642358_1057pysparkidleLinkLink
6575application_1589299642358_1064pysparkidleLinkLink
6577application_1589299642358_1066pysparkidleLinkLink
6578application_1589299642358_1067pysparkidleLinkLink
6579application_1589299642358_1068pysparkidleLinkLink
" + "IDYARN Application IDKindStateSpark UIDriver logCurrent session?6692application_1589299642358_1181pysparkidleLinkLink6708application_1589299642358_1199pysparkidleLinkLink6712application_1589299642358_1203pysparkidleLinkLink6719application_1589299642358_1210pysparkidleLinkLink6721application_1589299642358_1212pysparkidleLinkLink6725application_1589299642358_1216pysparkidleLinkLink6726application_1589299642358_1217pysparkidleLinkLink6727application_1589299642358_1218pysparkidleLinkLink6728application_1589299642358_1219pysparkidleLinkLink6729application_1589299642358_1220pysparkidleLinkLink6730application_1589299642358_1221pysparkbusyLinkLink6733application_1589299642358_1224pysparkidleLinkLink6734application_1589299642358_1225pysparkidleLinkLink6736application_1589299642358_1227pysparkidleLinkLink6737application_1589299642358_1228pysparkbusyLinkLink6740application_1589299642358_1231pysparkidleLinkLink6741application_1589299642358_1232pysparkidleLinkLink6742application_1589299642358_1233pysparkidleLinkLink6743application_1589299642358_1234pysparkidleLinkLink6744application_1589299642358_1235pysparkidleLinkLink6745application_1589299642358_1236pysparkidleLinkLink6746application_1589299642358_1237pysparkidleLinkLink" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%%configure\n", "{\"conf\": {\n", " \"spark.app.name\": \"lgptguys_final\"\n", "}}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Start Spark" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Starting Spark application\n" ] }, { "data": { "text/html": [ "\n", - "
IDYARN Application IDKindStateSpark UIDriver logCurrent session?
6580application_1589299642358_1069pysparkidleLinkLink
" + "IDYARN Application IDKindStateSpark UIDriver logCurrent session?6747application_1589299642358_1238pysparkidleLinkLink✔" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "SparkSession available as 'spark'.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "An error was encountered:\n", "unknown magic command '%spark'\n", "UnknownMagic: unknown magic command '%spark'\n", "\n" ] } ], "source": [ "# Initialization\n", "%%spark" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Transfert and save tables from hdfs to local \n", "\n", "Here we describe the process of loading a table on hdfs, saving it to a proper place so that we can load it in local and then save it.\n", "\n", "First, we load the data that is in an otherwise not accessible place in hdfs :" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ - "username = 'acoudray'\n", + "username = 'tturner'\n", "\n", - "my_files = ['transfer_array_version2.txt', 'stop_routes_array_version3.txt',\n", - " 'stops_array_version2.txt', 'route_stops_array_version2.txt',\n", - " 'routes_array_version2.txt', 'stop_times_array_version2.txt']\n", + "my_files = ['stop_times_curated.csv', 'trips_curated.csv',\n", + " 'stops_15km.csv', 'transfers.csv']\n", "\n", "for file in my_files:\n", " this_file = spark.read.csv('data/lgpt_guys/{}'.format(file), \\\n", - " header = False) \n", - " this_file.write.csv(\"/user/{0}/{1}\".format(username, file.replace('.txt','')), \\\n", + " header = True) \n", + " this_file.write.csv(\"/user/{0}/{1}\".format(username, file.replace('.csv','')), \\\n", " header = True, mode = 'overwrite')" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/work/final_project/notebooks\n", - "transfer_array_version2\n", - "stop_routes_array_version3\n", - "stops_array_version2\n", - "route_stops_array_version2\n", - "routes_array_version2\n", - "stop_times_array_version2\n" + "stop_times_curated\n", + "trips_curated\n", + "stops_15km\n", + "transfers\n" ] } ], "source": [ "%local\n", "\n", "from hdfs3 import HDFileSystem\n", "import pandas as pd\n", "import numpy as np \n", "import os\n", "print(os.getcwd())\n", "\n", "hdfs = HDFileSystem(host='hdfs://iccluster044.iccluster.epfl.ch', port=8020, user='ebouille')\n", "\n", - "username = 'acoudray'\n", + "username = 'tturner'\n", "\n", - "my_folders = ['transfer_array_version2', 'stop_routes_array_version3',\n", - " 'stops_array_version2', 'route_stops_array_version2',\n", - " 'routes_array_version2', 'stop_times_array_version2']\n", + "my_folders = ['stop_times_curated', 'trips_curated',\n", + " 'stops_15km', 'transfers']\n", "\n", "for folder in my_folders:\n", " print(folder)\n", " array_files = hdfs.glob('/user/{0}/{1}/*.csv'.format(username, folder))\n", " array = pd.DataFrame()\n", " for file in array_files:\n", " with hdfs.open(file) as f:\n", " array = array.append(pd.read_csv(f))\n", "\n", " array.to_csv('../data/{}.csv'.format(folder), header=True)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { "display_name": "PySpark", "language": "", "name": "pysparkkernel" }, "language_info": { "codemirror_mode": { "name": "python", "version": 3 }, "mimetype": "text/x-python", "name": "pyspark", "pygments_lexer": "python3" } }, "nbformat": 4, "nbformat_minor": 4 }