{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Central Limit Theorem\n",
    "# One of the most important, powerful theorems in statistics (arguably THE most important)\n",
    "# Independent of the PDF of a distribution, when one computes the average of a large enough number of samples,\n",
    "# these averaged experiments will follow a normal distribution with the same mean as the original, non-normal distribution!\n",
    "options(repr.plot.width=20, repr.plot.height=12.5)   # this command just formats the size of the figures. Adapt to view them nicelyin your browser.\n",
    "\n",
    "\n",
    "\n",
    "# Create a non-uniform population of 100,000 numbers between 1 and 100\n",
    "pop1 <- rnorm(20000, mean = 10, sd = 3)\n",
    "pop2 <- rnorm(80000, mean = 70, sd = 10)\n",
    "pop <- c(pop1, pop2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Lets look at the real underlying probability distribution function.\n",
    "#In real life, this is unknown, and it is what you want to figure out from experiments\n",
    "#with finite numbers of samples.\n",
    "\n",
    "mu <- mean(pop) #calculate the population mean\n",
    "sigma <- sd(pop) #calculate the population standard deviation\n",
    "hist(pop,xlab=\"X\", main=\"PDF of the real distribution\")\n",
    "abline(v=mu,col=\"red\",lwd=2)\n",
    "abline(v=mu+sigma,col=\"blue\",lwd=2)\n",
    "abline(v=mu-sigma,col=\"blue\",lwd=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Finite sample: draw n random numbers from this distribution\n",
    "n=10\n",
    "k=length(pop)\n",
    "rdu<-function(n,k) sample(1:k,n,replace=T) # generates n random samples of numbers 1:k\n",
    "selection <- rdu(n,k)\n",
    "hist(pop[selection],breaks = 20,xlab=\"X\", main=\"Histogram of finite sample\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Repeat this experiment: you draw m-times n random numbers, and compute the sample average over the n numbers.\n",
    "# output will then be m averaged results. Total of n*m measurements!\n",
    "\n",
    "m=1000000  #number of experiments\n",
    "n=10  #number of samples per experiment\n",
    "averaged_data <- numeric(length = m)\n",
    "\n",
    "for (i in seq(1,m,by=1)) {\n",
    "  selection <- rdu(n,k)\n",
    "  averaged_data[i]=sum(pop[selection])/n\n",
    "}\n",
    "\n",
    "hist(averaged_data,breaks = seq(0,150,by=2))\n",
    "abline(v=mu,col=\"red\",lwd=2)\n",
    "abline(v=mu+sigma/sqrt(n),col=\"blue\",lwd=2)\n",
    "abline(v=mu-sigma/sqrt(n),col=\"blue\",lwd=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Compare to original distribution\n",
    "p1 <- hist(averaged_data,plot = \"FALSE\")       \n",
    "p2 <- hist(pop,plot = \"FALSE\")                \n",
    "plot( p2, col=rgb(1,0,0,1/4), xlim=c(0,100), main=\"Comparison of experimental sample and theoretical distribution\", xlab=\"X\") \n",
    "plot( p1, col=rgb(0,0,1,1/4), xlim=c(0,100), add=T) \n",
    "abline(v=mu,col=\"red\",lwd=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "R",
   "language": "R",
   "name": "ir"
  },
  "language_info": {
   "codemirror_mode": "r",
   "file_extension": ".r",
   "mimetype": "text/x-r-source",
   "name": "R",
   "pygments_lexer": "r",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}