{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Central Limit Theorem\n", "# One of the most important, powerful theorems in statistics (arguably THE most important)\n", "# Independent of the PDF of a distribution, when one computes the average of a large enough number of samples,\n", "# these averaged experiments will follow a normal distribution with the same mean as the original, non-normal distribution!\n", "options(repr.plot.width=20, repr.plot.height=12.5) # this command just formats the size of the figures. Adapt to view them nicelyin your browser.\n", "\n", "\n", "\n", "# Create a non-uniform population of 100,000 numbers between 1 and 100\n", "pop1 <- rnorm(20000, mean = 10, sd = 3)\n", "pop2 <- rnorm(80000, mean = 70, sd = 10)\n", "pop <- c(pop1, pop2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Lets look at the real underlying probability distribution function.\n", "#In real life, this is unknown, and it is what you want to figure out from experiments\n", "#with finite numbers of samples.\n", "\n", "mu <- mean(pop) #calculate the population mean\n", "sigma <- sd(pop) #calculate the population standard deviation\n", "hist(pop,xlab=\"X\", main=\"PDF of the real distribution\")\n", "abline(v=mu,col=\"red\",lwd=2)\n", "abline(v=mu+sigma,col=\"blue\",lwd=2)\n", "abline(v=mu-sigma,col=\"blue\",lwd=2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Finite sample: draw n random numbers from this distribution\n", "n=10\n", "k=length(pop)\n", "rdu<-function(n,k) sample(1:k,n,replace=T) # generates n random samples of numbers 1:k\n", "selection <- rdu(n,k)\n", "hist(pop[selection],breaks = 20,xlab=\"X\", main=\"Histogram of finite sample\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Repeat this experiment: you draw m-times n random numbers, and compute the sample average over the n numbers.\n", "# output will then be m averaged results. Total of n*m measurements!\n", "\n", "m=1000000 #number of experiments\n", "n=10 #number of samples per experiment\n", "averaged_data <- numeric(length = m)\n", "\n", "for (i in seq(1,m,by=1)) {\n", " selection <- rdu(n,k)\n", " averaged_data[i]=sum(pop[selection])/n\n", "}\n", "\n", "hist(averaged_data,breaks = seq(0,150,by=2))\n", "abline(v=mu,col=\"red\",lwd=2)\n", "abline(v=mu+sigma/sqrt(n),col=\"blue\",lwd=2)\n", "abline(v=mu-sigma/sqrt(n),col=\"blue\",lwd=2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Compare to original distribution\n", "p1 <- hist(averaged_data,plot = \"FALSE\") \n", "p2 <- hist(pop,plot = \"FALSE\") \n", "plot( p2, col=rgb(1,0,0,1/4), xlim=c(0,100), main=\"Comparison of experimental sample and theoretical distribution\", xlab=\"X\") \n", "plot( p1, col=rgb(0,0,1,1/4), xlim=c(0,100), add=T) \n", "abline(v=mu,col=\"red\",lwd=2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "R", "language": "R", "name": "ir" }, "language_info": { "codemirror_mode": "r", "file_extension": ".r", "mimetype": "text/x-r-source", "name": "R", "pygments_lexer": "r", "version": "3.6.2" } }, "nbformat": 4, "nbformat_minor": 4 }