diff --git a/notebooks/proba_functions.ipynb b/notebooks/proba_functions.ipynb
index bcee49c..68945bb 100644
--- a/notebooks/proba_functions.ipynb
+++ b/notebooks/proba_functions.ipynb
@@ -1,355 +1,361 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Make distribution tables to calculate probabilities of transfer\n",
"\n",
"
Any application without a proper name would be promptly killed.
"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"Current session configs: {'conf': {'spark.app.name': 'lgptguys_final'}, 'kind': 'pyspark'}
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"ID | YARN Application ID | Kind | State | Spark UI | Driver log | Current session? |
---|
6800 | application_1589299642358_1295 | pyspark | busy | Link | Link | |
6803 | application_1589299642358_1298 | pyspark | idle | Link | Link | |
6806 | application_1589299642358_1301 | pyspark | busy | Link | Link | |
6807 | application_1589299642358_1302 | pyspark | idle | Link | Link | |
6808 | application_1589299642358_1303 | pyspark | idle | Link | Link | |
6809 | application_1589299642358_1305 | pyspark | idle | Link | Link | |
6810 | application_1589299642358_1306 | pyspark | idle | Link | Link | |
6811 | application_1589299642358_1308 | pyspark | idle | Link | Link | |
6815 | application_1589299642358_1312 | pyspark | idle | Link | Link | |
6816 | application_1589299642358_1313 | pyspark | idle | Link | Link | |
6818 | application_1589299642358_1315 | pyspark | busy | Link | Link | |
6819 | application_1589299642358_1316 | pyspark | idle | Link | Link | |
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%%configure\n",
"{\"conf\": {\n",
" \"spark.app.name\": \"lgptguys_final\"\n",
"}}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Start Spark"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Starting Spark application\n"
]
},
{
"data": {
"text/html": [
"\n",
"ID | YARN Application ID | Kind | State | Spark UI | Driver log | Current session? |
---|
6820 | application_1589299642358_1317 | pyspark | idle | Link | Link | ✔ |
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"SparkSession available as 'spark'.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"An error was encountered:\n",
"unknown magic command '%spark'\n",
"UnknownMagic: unknown magic command '%spark'\n",
"\n"
]
}
],
"source": [
"# Initialization\n",
"%%spark"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Compute probability of missing a transfer from delays distributions\n",
"\n",
"Let's first have a look at a slice of the dictionnary of distribution"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('1286.TA.26-32-j19-1.12.H__8591182',\n",
" array([ 0, 1158, 306, 162, 94, 24, 28, 21, 3, 2, 0,\n",
" 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])),\n",
" ('1286.TA.26-32-j19-1.12.H__8591184',\n",
" array([ 1, 762, 552, 292, 118, 48, 13, 8, 0, 1, 1, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0])),\n",
" ('1286.TA.26-32-j19-1.12.H__8591195',\n",
" array([ 0, 1083, 444, 143, 64, 35, 16, 9, 3, 1, 0,\n",
" 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])),\n",
" ('1286.TA.26-32-j19-1.12.H__8591200',\n",
" array([ 2, 239, 227, 228, 212, 128, 74, 42, 29, 17, 3, 3, 2,\n",
" 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 1])),\n",
" ('1286.TA.26-32-j19-1.12.H__8591209',\n",
" array([ 0, 1151, 308, 169, 94, 24, 29, 16, 4, 3, 1,\n",
" 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%local\n",
"\n",
"import pickle \n",
"import gzip\n",
"from itertools import islice\n",
"\n",
"with gzip.open(\"../data/distributions.pickle\", \"rb\") as input_file:\n",
" d = pickle.load(input_file)\n",
"\n",
"# Functon to take a slice from a dictionnary - head equivalent\n",
"def take(n, iterable):\n",
" \"Return first n items of the iterable as a list\"\n",
" return list(islice(iterable, n))\n",
"\n",
"# display a slice of it\n",
"take(5, d.items())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Poisson cumulative distribution\n",
"\n",
- "The Poisson distribution is popular for modeling the number of times an event occurs in an interval of time or space. We modeled a poisson distribution for delays assuming k is the time (as it was done [here](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0126137)).\n",
+ "The Poisson distribution is popular for modeling the number of times an event occurs in an interval of time or space. We modeled a poisson distribution for delays assuming parameter $k$ is the time in minutes (as it was done [here](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0126137), formulas $(4),(5),(6)$).\n",
"\n",
"A discrete random variable X is said to have a Poisson distribution with parameter λ > 0, if, for k = 0, 1, 2, ..., the probability mass function of X is given by:\n",
"\n",
"$${\\displaystyle \\!f(k;\\lambda )=\\Pr(X=k)={\\frac {\\lambda ^{k}e^{-\\lambda }}{k!}},}$$\n",
"where\n",
"\n",
"e is Euler's number (e = 2.71828...)\n",
"k! is the factorial of k.\n",
- "The positive real number λ is equal to the expected value of X __and__ also to its variance.\n",
+ "The positive real number λ is equal to the expected value of X __and__ to its variance.\n",
"\n",
"$${\\displaystyle \\lambda =\\operatorname {E} (X)=\\operatorname {Var} (X)}$$\n",
"\n",
+ "We can approximate E[𝑋]∼$\\mu_i$ for our data $X_i$ if performed on a sample of size N from that distribution if we estimate that $X_i$∼$X$.\n",
+ "\n",
"The Poisson distribution can be applied to systems with a large number of possible events, each of which is rare. The number of such events that occur during a fixed time interval is, under the right circumstances, a random number with a Poisson distribution.\n",
"\n",
"Poisson-related __assumptions__ :\n",
- "- k is the number of times an event occurs in an interval and k can take values 0, 1, 2, ... \n",
- " - __k is therefore our delay in minutes = number of times the event (=delay) occurs__\n",
- "- The occurrence of one event does not affect the probability that a second event will occur. That is, events occur independently.\n",
+ "- $k$ is the __delay time in minutes__ and can take values 0, 1, 2, ... (strictly positive and discrete)\n",
+ "- We assume our sampling $X_i$ of $X$ is good enough to approximate E[X] ~ $\\mu_i$\n",
+ "- The occurrence of one event does not affect probability of others. That is, events occur independently.\n",
" - __We assume being late one day is not affecting the delay of the day after__ \n",
"- The average rate at which events occur is independent of any occurrences. For simplicity, this is usually assumed to be constant, but may in practice vary with time.\n",
- " - __we assumes delays occurs with a constant rate__\n",
+ " - __we assumes delays occurs with a constant rate over time__\n",
"- Two events cannot occur at exactly the same instant\n",
"\n",
- "We made a function _poisson_proba_ that takes a trip_id, a stop_id, a arrival time and a departure time and a dictionnary {key : distribution} to compute a __probability to be at least 2 minutes before departure of next trip__. \n",
+ "We made a function `poisson_proba` that takes a `trip_id`, a `stop_id`, an `arrival time` and a `departure time` and a dictionnary {key : distribution} to compute a __probability to be at least 2 minutes before departure of next trip__. \n",
"\n",
- "We make a few __sssumptions__ on our side :\n",
+ "We make a few __assumptions__ on our side :\n",
"- We assume that if we have less than 2 minutes for the transfer, we miss it.\n",
"- We assume the next train is on time.\n",
- "- As for poisson distribution $k$ is strictly positive, we assume trains ahead of schedule were on time ($k=0$)"
+ "- As for poisson distribution $k$ is strictly positive, we assume trains ahead of schedule were on time ($k=0$)\n",
+ "\n",
+ "\n",
+ "_Question we should address :_\n",
+ "- _Is the poisson a reasonable approximation of the binomial distribution in our case ?_"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "Let's first test the poisson distribution and compare it with our distribution to see our well it fits the data. We will compute $Pr(X = k)$ for each values of k and look at the shape of the poisson distribution compared to the shape of our scaled data. Then, we will "
+ "Let's first test the poisson distribution and compare it with our distribution to see how well it fits the data. We will compute $Pr(X = k)$ for each values of k and look at the shape of the poisson distribution compared to the shape of our scaled data. Then, we will compare $\\sum_{k=0}^T Pr(X = k)$ with the cumulative distribution function which directly gives $Pr(k \\leq X)$"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"lambda (expectation given distribution): 1.0200445434298442 \n",
"\n",
"Probability of success for transfer time = 3.0 minutes : 0.9797581319055967\n"
]
}
],
"source": [
"################################# POISSON FUNCTIONS ########################################\n",
"%local\n",
"\n",
"import pickle \n",
"import gzip\n",
"import time\n",
"import math \n",
"import datetime\n",
"import time\n",
"from scipy.stats import poisson\n",
"\n",
"with gzip.open(\"../data/distributions.pickle\", \"rb\") as input_file:\n",
" d = pickle.load(input_file)\n",
" \n",
"def get_distrib(key, dico):\n",
" if key in dico:\n",
" return dico[key]\n",
" else:\n",
" raise ValueError(\"KEY ERROR: {} not found un distribution dictionnary\".format(key))\n",
" \n",
"def evaluate_lamda(distrib):\n",
" # First calculate total number of measures N\n",
" N = -1 # by starting at -1 we ignore trains ahead of schedule\n",
" for x in distrib:\n",
" N += x\n",
"\n",
" lambda_p = 0 # expectation - we want to calculate it\n",
" t = -1 # time = index - 1\n",
"\n",
" for x in distrib:\n",
" if t>0:\n",
" lambda_p += t*x\n",
" t += 1\n",
"\n",
" # calculate lambda - the expectation of x\n",
" if N > 0:\n",
" lambda_p /= N \n",
" print('lambda (expectation given distribution): ',lambda_p, '\\n')\n",
" return lambda_p\n",
" else : \n",
" raise ValueError(\"ERROR : {} distribution has 0 counts\".format(key))\n",
" #print('Returning 1 to avoid later problem... \\n')\n",
" return 1\n",
"\n",
"def process_time(str_time):\n",
" x = time.strptime(str_time,'%H:%M')\n",
" return datetime.timedelta(hours=x.tm_hour,minutes=x.tm_min,seconds=x.tm_sec).total_seconds()\n",
"\n",
"def get_transfer_time(arr_time, dep_time, delta=2.0):\n",
" diff_time_min = ( process_time(dep_time) - process_time(arr_time) ) / 60\n",
" return diff_time_min - delta\n",
"\n",
"def poisson_proba(trip_id, stop_id, arr_time, dep_time, dico):\n",
" # Generate key from trip_id / stop_id \n",
" key = str(trip_id) + '__' + str(stop_id[0:7]) # 7 first char to be sbb-compatible\n",
"\n",
" # Get distribution from dictionnary\n",
" distrib = get_distrib(key, dico)\n",
" \n",
" # Calculate transfer time at disposal \n",
" T = get_transfer_time(arr_time, dep_time)\n",
" \n",
" # Get lambda value to calculate proba\n",
" lambda_p = evaluate_lamda(distrib)\n",
"\n",
" # Get proba\n",
" poisson_p = poisson.cdf(T, lambda_p)\n",
" print('Probability of success for transfer time = {} minutes : '.format(T),poisson_p)\n",
"\n",
" return poisson_p\n",
"\n",
"# input data :\n",
"trip_id = '1286.TA.26-32-j19-1.12.H'\n",
"stop_id = '8591184'\n",
"arrival_time = '07:45'\n",
"departure_time = '07:50'\n",
"Pr = poisson_proba(trip_id, stop_id, arrival_time, departure_time, d)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "PySpark",
"language": "",
"name": "pysparkkernel"
},
"language_info": {
"codemirror_mode": {
"name": "python",
"version": 3
},
"mimetype": "text/x-python",
"name": "pyspark",
"pygments_lexer": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}