diff --git a/notebooks/proba_functions.ipynb b/notebooks/proba_functions.ipynb index bcee49c..68945bb 100644 --- a/notebooks/proba_functions.ipynb +++ b/notebooks/proba_functions.ipynb @@ -1,355 +1,361 @@ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Make distribution tables to calculate probabilities of transfer\n", "\n", "
Any application without a proper name would be promptly killed.
" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/html": [ "Current session configs: {'conf': {'spark.app.name': 'lgptguys_final'}, 'kind': 'pyspark'}
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "
IDYARN Application IDKindStateSpark UIDriver logCurrent session?
6800application_1589299642358_1295pysparkbusyLinkLink
6803application_1589299642358_1298pysparkidleLinkLink
6806application_1589299642358_1301pysparkbusyLinkLink
6807application_1589299642358_1302pysparkidleLinkLink
6808application_1589299642358_1303pysparkidleLinkLink
6809application_1589299642358_1305pysparkidleLinkLink
6810application_1589299642358_1306pysparkidleLinkLink
6811application_1589299642358_1308pysparkidleLinkLink
6815application_1589299642358_1312pysparkidleLinkLink
6816application_1589299642358_1313pysparkidleLinkLink
6818application_1589299642358_1315pysparkbusyLinkLink
6819application_1589299642358_1316pysparkidleLinkLink
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%%configure\n", "{\"conf\": {\n", " \"spark.app.name\": \"lgptguys_final\"\n", "}}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Start Spark" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Starting Spark application\n" ] }, { "data": { "text/html": [ "\n", "
IDYARN Application IDKindStateSpark UIDriver logCurrent session?
6820application_1589299642358_1317pysparkidleLinkLink
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "SparkSession available as 'spark'.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "An error was encountered:\n", "unknown magic command '%spark'\n", "UnknownMagic: unknown magic command '%spark'\n", "\n" ] } ], "source": [ "# Initialization\n", "%%spark" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Compute probability of missing a transfer from delays distributions\n", "\n", "Let's first have a look at a slice of the dictionnary of distribution" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('1286.TA.26-32-j19-1.12.H__8591182',\n", " array([ 0, 1158, 306, 162, 94, 24, 28, 21, 3, 2, 0,\n", " 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])),\n", " ('1286.TA.26-32-j19-1.12.H__8591184',\n", " array([ 1, 762, 552, 292, 118, 48, 13, 8, 0, 1, 1, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0])),\n", " ('1286.TA.26-32-j19-1.12.H__8591195',\n", " array([ 0, 1083, 444, 143, 64, 35, 16, 9, 3, 1, 0,\n", " 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])),\n", " ('1286.TA.26-32-j19-1.12.H__8591200',\n", " array([ 2, 239, 227, 228, 212, 128, 74, 42, 29, 17, 3, 3, 2,\n", " 0, 0, 0, 0, 0, 0, 0, 
1, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 1])),\n", " ('1286.TA.26-32-j19-1.12.H__8591209',\n", " array([ 0, 1151, 308, 169, 94, 24, 29, 16, 4, 3, 1,\n", " 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%local\n", "\n", "import pickle \n", "import gzip\n", "from itertools import islice\n", "\n", "with gzip.open(\"../data/distributions.pickle\", \"rb\") as input_file:\n", " d = pickle.load(input_file)\n", "\n", "# Functon to take a slice from a dictionnary - head equivalent\n", "def take(n, iterable):\n", " \"Return first n items of the iterable as a list\"\n", " return list(islice(iterable, n))\n", "\n", "# display a slice of it\n", "take(5, d.items())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Poisson cumulative distribution\n", "\n", - "The Poisson distribution is popular for modeling the number of times an event occurs in an interval of time or space. We modeled a poisson distribution for delays assuming k is the time (as it was done [here](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0126137)).\n", + "The Poisson distribution is popular for modeling the number of times an event occurs in an interval of time or space. We modeled a poisson distribution for delays assuming parameter $k$ is the time in minutes (as it was done [here](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0126137), formulas $(4),(5),(6)$).\n", "\n", "A discrete random variable X is said to have a Poisson distribution with parameter λ > 0, if, for k = 0, 1, 2, ..., the probability mass function of X is given by:\n", "\n", "$${\\displaystyle \\!f(k;\\lambda )=\\Pr(X=k)={\\frac {\\lambda ^{k}e^{-\\lambda }}{k!}},}$$\n", "where\n", "\n", "e is Euler's number (e = 2.71828...)\n", "k! 
is the factorial of k.\n", - "The positive real number λ is equal to the expected value of X __and__ also to its variance.\n", + "The positive real number λ is equal to the expected value of X __and__ to its variance.\n", "\n", "$${\\displaystyle \\lambda =\\operatorname {E} (X)=\\operatorname {Var} (X)}$$\n", "\n", + "We can approximate $\\operatorname{E}[X]$ by the mean $\\mu_i$ of our data $X_i$, computed on a sample of size $N$, provided we assume that $X_i \\sim X$ (i.e. that our observations follow that distribution).\n", + "\n", "The Poisson distribution can be applied to systems with a large number of possible events, each of which is rare. The number of such events that occur during a fixed time interval is, under the right circumstances, a random number with a Poisson distribution.\n", "\n", "Poisson-related __assumptions__ :\n", - "- k is the number of times an event occurs in an interval and k can take values 0, 1, 2, ... \n", - " - __k is therefore our delay in minutes = number of times the event (=delay) occurs__\n", - "- The occurrence of one event does not affect the probability that a second event will occur. That is, events occur independently.\n", + "- $k$ is the __delay time in minutes__ and can take values 0, 1, 2, ... (non-negative and discrete)\n", + "- We assume our sample $X_i$ of $X$ is good enough to approximate $\\operatorname{E}[X]$ by $\\mu_i$\n", + "- The occurrence of one event does not affect the probability of the others. That is, events occur independently.\n", " - __We assume being late one day is not affecting the delay of the day after__ \n", "- The average rate at which events occur is independent of any occurrences. 
For simplicity, this is usually assumed to be constant, but may in practice vary with time.\n", - " - __we assumes delays occurs with a constant rate__\n", + " - __we assume delays occur at a constant rate over time__\n", "- Two events cannot occur at exactly the same instant\n", "\n", - "We made a function _poisson_proba_ that takes a trip_id, a stop_id, a arrival time and a departure time and a dictionnary {key : distribution} to compute a __probability to be at least 2 minutes before departure of next trip__. \n", + "We made a function `poisson_proba` that takes a `trip_id`, a `stop_id`, an `arrival time`, a `departure time` and a dictionary {key : distribution} to compute the __probability of arriving at least 2 minutes before the departure of the next trip__. \n", "\n", - "We make a few __sssumptions__ on our side :\n", + "We make a few __assumptions__ on our side :\n", "- We assume that if we have less than 2 minutes for the transfer, we miss it.\n", "- We assume the next train is on time.\n", - "- As for poisson distribution $k$ is strictly positive, we assume trains ahead of schedule were on time ($k=0$)" + "- Since $k$ cannot be negative in a Poisson distribution, we assume trains ahead of schedule were on time ($k=0$)\n", + "\n", + "\n", + "_Question we should address :_\n", + "- _Is the Poisson distribution a reasonable approximation of the binomial distribution in our case ?_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Let's first test the poisson distribution and compare it with our distribution to see our well it fits the data. We will compute $Pr(X = k)$ for each values of k and look at the shape of the poisson distribution compared to the shape of our scaled data. Then, we will " + "Let's first test the Poisson distribution and compare it with our distribution to see how well it fits the data. We will compute $Pr(X = k)$ for each value of $k$ and look at the shape of the Poisson distribution compared to the shape of our scaled data. 
Then, we will compare $\\sum_{k=0}^T Pr(X = k)$ with the cumulative distribution function which directly gives $Pr(k \\leq X)$" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "lambda (expectation given distribution): 1.0200445434298442 \n", "\n", "Probability of success for transfer time = 3.0 minutes : 0.9797581319055967\n" ] } ], "source": [ "################################# POISSON FUNCTIONS ########################################\n", "%local\n", "\n", "import pickle \n", "import gzip\n", "import time\n", "import math \n", "import datetime\n", "import time\n", "from scipy.stats import poisson\n", "\n", "with gzip.open(\"../data/distributions.pickle\", \"rb\") as input_file:\n", " d = pickle.load(input_file)\n", " \n", "def get_distrib(key, dico):\n", " if key in dico:\n", " return dico[key]\n", " else:\n", " raise ValueError(\"KEY ERROR: {} not found un distribution dictionnary\".format(key))\n", " \n", "def evaluate_lamda(distrib):\n", " # First calculate total number of measures N\n", " N = -1 # by starting at -1 we ignore trains ahead of schedule\n", " for x in distrib:\n", " N += x\n", "\n", " lambda_p = 0 # expectation - we want to calculate it\n", " t = -1 # time = index - 1\n", "\n", " for x in distrib:\n", " if t>0:\n", " lambda_p += t*x\n", " t += 1\n", "\n", " # calculate lambda - the expectation of x\n", " if N > 0:\n", " lambda_p /= N \n", " print('lambda (expectation given distribution): ',lambda_p, '\\n')\n", " return lambda_p\n", " else : \n", " raise ValueError(\"ERROR : {} distribution has 0 counts\".format(key))\n", " #print('Returning 1 to avoid later problem... 
\\n')\n", " return 1\n", "\n", "def process_time(str_time):\n", " x = time.strptime(str_time,'%H:%M')\n", " return datetime.timedelta(hours=x.tm_hour,minutes=x.tm_min,seconds=x.tm_sec).total_seconds()\n", "\n", "def get_transfer_time(arr_time, dep_time, delta=2.0):\n", " diff_time_min = ( process_time(dep_time) - process_time(arr_time) ) / 60\n", " return diff_time_min - delta\n", "\n", "def poisson_proba(trip_id, stop_id, arr_time, dep_time, dico):\n", " # Generate key from trip_id / stop_id \n", " key = str(trip_id) + '__' + str(stop_id[0:7]) # 7 first char to be sbb-compatible\n", "\n", " # Get distribution from dictionnary\n", " distrib = get_distrib(key, dico)\n", " \n", " # Calculate transfer time at disposal \n", " T = get_transfer_time(arr_time, dep_time)\n", " \n", " # Get lambda value to calculate proba\n", " lambda_p = evaluate_lamda(distrib)\n", "\n", " # Get proba\n", " poisson_p = poisson.cdf(T, lambda_p)\n", " print('Probability of success for transfer time = {} minutes : '.format(T),poisson_p)\n", "\n", " return poisson_p\n", "\n", "# input data :\n", "trip_id = '1286.TA.26-32-j19-1.12.H'\n", "stop_id = '8591184'\n", "arrival_time = '07:45'\n", "departure_time = '07:50'\n", "Pr = poisson_proba(trip_id, stop_id, arrival_time, departure_time, d)" ] } ], "metadata": { "kernelspec": { "display_name": "PySpark", "language": "", "name": "pysparkkernel" }, "language_info": { "codemirror_mode": { "name": "python", "version": 3 }, "mimetype": "text/x-python", "name": "pyspark", "pygments_lexer": "python3" } }, "nbformat": 4, "nbformat_minor": 4 }