Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F84143268
dataanalysis.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Fri, Sep 20, 23:44
Size
22 KB
Mime Type
text/x-python
Expires
Sun, Sep 22, 23:44 (2 d)
Engine
blob
Format
Raw Data
Handle
20890655
Attached To
rNOTOEXPGRPA noto-experiment-groupa
dataanalysis.py
View Options
import
numpy
as
np
from
ipywidgets
import
interact
,
interactive
,
fixed
,
interact_manual
from
ipywidgets
import
HBox
,
VBox
,
Label
,
Layout
import
ipywidgets
as
widgets
from
IPython.display
import
IFrame
from
IPython.display
import
set_matplotlib_formats
,
display
,
Math
,
Markdown
,
Latex
,
HTML
set_matplotlib_formats
(
'svg'
)
# Enable interactive backend for matplotlib
from
IPython
import
get_ipython
get_ipython
()
.
run_line_magic
(
'matplotlib'
,
'inline'
)
import
matplotlib.pyplot
as
plt
import
matplotlib.patches
as
pat
import
matplotlib.ticker
as
ticker
plt
.
style
.
use
(
'seaborn-whitegrid'
)
# global style for plotting
from
matplotlib.ticker
import
MultipleLocator
import
scipy.stats
as
stats
def visualize_ttest(sample_size, alpha, t):
    """Plot a two-tailed t-test: null distribution, rejection zones and t.

    Args:
        sample_size: Number of observations in the sample. The Student t
            distribution is drawn with ``sample_size - 1`` degrees of
            freedom, matching ``visualize_ttest_pvalue``.
        alpha: Significance level; each tail is shaded up to ``alpha / 2``.
        t: Observed t statistic, marked with a dashed vertical line.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    _fig, ax = plt.subplots(figsize=(12, 4))
    ax.set_title("Probability distribution of all possible sample means if $H_0$ is true")

    # A one-sample t statistic follows a t distribution with n - 1 degrees
    # of freedom (the original used n, which disagreed with its siblings).
    tdist = stats.t(df=sample_size - 1, loc=0, scale=1)
    x = np.linspace(tdist.ppf(0.0001), tdist.ppf(0.9999), 100)
    ax.plot(x, tdist.pdf(x), color='black', linewidth=1)

    # Polish the look of the graph: keep only the bottom axis line.
    ax.get_yaxis().set_visible(False)
    ax.set_ylim(bottom=0)
    ax.grid(False)
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)

    # Shade the two-tailed rejection zone; only the first patch carries the
    # legend label so the legend shows a single entry.
    x_zone_1 = np.linspace(tdist.ppf(0.0001), tdist.ppf(alpha / 2), 100)
    x_zone_2 = np.linspace(tdist.ppf(1 - alpha / 2), tdist.ppf(0.9999), 100)
    ax.fill_between(x_zone_1, tdist.pdf(x_zone_1), 0, alpha=0.3, color='red',
                    label=r'rejection of $H_0$ with $\alpha={}$'.format(alpha))
    ax.fill_between(x_zone_2, tdist.pdf(x_zone_2), 0, alpha=0.3, color='red')

    # Mark the observed t statistic.
    ax.axvline(x=t, color='firebrick', linestyle='dashed', linewidth=1)
    ax.annotate('t-test $t$={:.3f}'.format(t), xy=(t, 0), xytext=(-10, 130),
                textcoords='offset points',
                bbox=dict(boxstyle="round", facecolor="white", edgecolor="firebrick", alpha=0.8))

    ax.legend()
    plt.show()
def visualize_ttest_pvalue(sample_size, alpha, t, p):
    """Draw the null t distribution with rejection zones, t and a hatched tail.

    Args:
        sample_size: Sample size; the curve uses ``sample_size - 1`` degrees
            of freedom.
        alpha: Significance level, split evenly between the two shaded tails.
        t: Observed t statistic (dashed vertical line).
        p: p-value, shown only in the legend label of the hatched area.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    _, axes = plt.subplots(figsize=(12, 4))
    axes.set_title("Probability distribution of all possible sample means if $H_0$ is true")

    # Null distribution and the quantiles bounding the drawn range.
    null_dist = stats.t(df=sample_size - 1, loc=0, scale=1)
    far_left = null_dist.ppf(0.0001)
    far_right = null_dist.ppf(0.9999)
    grid_t = np.linspace(far_left, far_right, 100)
    axes.plot(grid_t, null_dist.pdf(grid_t), color='black', linewidth=1)

    # Strip everything but the bottom axis line.
    axes.get_yaxis().set_visible(False)
    axes.set_ylim(bottom=0)
    axes.grid(False)
    for side in ('top', 'right', 'left'):
        axes.spines[side].set_visible(False)

    # Two-tailed rejection zone.
    lower_zone = np.linspace(far_left, null_dist.ppf(alpha / 2), 100)
    upper_zone = np.linspace(null_dist.ppf(1 - alpha / 2), far_right, 100)
    axes.fill_between(lower_zone, null_dist.pdf(lower_zone), 0, alpha=0.3, color='red',
                      label=r'rejection of $H_0$ with $\alpha={}$'.format(alpha))
    axes.fill_between(upper_zone, null_dist.pdf(upper_zone), 0, alpha=0.3, color='red')

    # Observed statistic.
    label_box = dict(boxstyle="round", facecolor="white", edgecolor="firebrick", alpha=0.8)
    axes.axvline(x=t, color='firebrick', linestyle='dashed', linewidth=1)
    axes.annotate('t-test $t$={:.3f}'.format(t), xy=(t, 0), xytext=(-10, 130),
                  textcoords='offset points', bbox=label_box)

    # Hatch the tail beyond t, on the side t falls on.
    if t >= 0:
        tail_t = np.linspace(t, far_right, 100)
    else:
        tail_t = np.linspace(far_left, t, 100)
    axes.fill_between(tail_t, null_dist.pdf(tail_t), 0, facecolor="none", edgecolor="firebrick",
                      hatch="///", linewidth=0.0, label=r'p-value $p$={:.3f}'.format(p))

    axes.legend()
    plt.show()
def visualize_ttest2():
    """Return ``True`` unconditionally; takes no arguments and draws nothing."""
    outcome = True
    return outcome
def draw_sample(sample_size, mu=5.552, sigma=0.56068):
    """Draw ``sample_size`` values from a normal distribution N(mu, sigma).

    Standard-normal draws from NumPy's global random state are scaled by
    ``sigma`` and shifted by ``mu``.

    Returns:
        A NumPy array holding the drawn values.
    """
    standard_draws = np.random.randn(sample_size)
    return mu + standard_draws * sigma
def plot_sample_histogram(sample, mu, title, color="grey"):
    """Draw a histogram of ``sample`` with population and sample mean lines.

    The plot goes onto the current pyplot axes; the caller decides when to
    show it.

    Args:
        sample: Values to histogram; their mean is marked with a vertical line.
        mu: Population mean, marked with a black vertical line.
        title: Name of the sample, used in the plot title and in the legend
            label of the sample-mean line.
        color: Colour of the sample-mean line. The bars are always light grey.
    """
    plt.title("Histogram of " + title)
    plt.hist(sample, color="lightgrey")

    # Raw string: "\m" is not a valid escape sequence, and recent Python
    # versions warn about it in a normal string literal.
    plt.axvline(x=mu, color='black', linestyle='-.', linewidth=2,
                label=r"population mean $\mu$")

    # Vertical line for the mean of the sample itself.
    plt.axvline(x=np.mean(sample), color=color, linestyle='-.', linewidth=2,
                label=title + " mean $m$")

    plt.legend()
def plot_t_distribution(df, alpha=None, tail="two", loc=0, scale=1):
    """Plot the density of a Student t distribution, optionally with cutoffs.

    Args:
        df: Degrees of freedom of the distribution.
        alpha: Significance level. When ``None`` only the density curve is
            drawn; otherwise the rejection zone(s) are shaded and the cutoff
            t values annotated.
        tail: ``"two"``, ``"lower"`` or ``"upper"``; selects which tail(s)
            receive the rejection zone. Any other value draws no zone.
        loc: Location parameter passed to ``scipy.stats.t``.
        scale: Scale parameter passed to ``scipy.stats.t``.

    Returns:
        The matplotlib axes the distribution was drawn on.
    """
    _fig, ax = plt.subplots(figsize=(10, 4))
    plt.title('t distribution for {:} degrees of freedom'.format(df))
    plt.xlabel('t')
    plt.ylabel('Probability density')

    tdist = stats.t(df=df, loc=loc, scale=scale)

    # 100 points between the 0.0001 and 0.9999 quantiles of the distribution.
    x = np.linspace(tdist.ppf(0.0001), tdist.ppf(0.9999), 100)
    ax.plot(x, tdist.pdf(x), color="firebrick", linestyle='-', lw=1, alpha=1)

    # Passing None toggles the major y grid lines. It is given positionally
    # because the keyword was renamed from ``b`` to ``visible`` in Matplotlib
    # and the old spelling is no longer accepted.
    ax.grid(None, which='major', axis='y')
    ax.set_ylim(bottom=0)

    if alpha is not None:
        if tail == "two":
            _annotate_t_cutoff(ax, tdist, alpha / 2,
                               "Cutoff $t$={:.3f}\nfor $\\alpha/2$={:.3f}", (-80, 10))
            _annotate_t_cutoff(ax, tdist, 1 - alpha / 2,
                               "Cutoff $t$={:.3f}\nfor $1-\\alpha/2$={:.3f}", (5, 10))
            _shade_t_zone(ax, tdist, 0.0001, alpha / 2)
            _shade_t_zone(ax, tdist, 1 - alpha / 2, 0.9999)
        elif tail == "lower":
            _annotate_t_cutoff(ax, tdist, alpha,
                               "Cutoff $t$={:.3f}\nfor $\\alpha$={:.3f}", (-80, 10))
            _shade_t_zone(ax, tdist, 0.0001, alpha)
        elif tail == "upper":
            _annotate_t_cutoff(ax, tdist, 1 - alpha,
                               "Cutoff $t$={:.3f}\nfor $1-\\alpha$={:.3f}", (5, 10))
            _shade_t_zone(ax, tdist, 1 - alpha, 0.9999)

    return ax


def _annotate_t_cutoff(ax, tdist, quantile, label_template, text_offset):
    """Draw a dash-dot line at the ``quantile`` cutoff of ``tdist`` and label it."""
    cutoff = tdist.ppf(quantile)
    ax.axvline(x=cutoff, color='firebrick', linestyle='-.', linewidth=1)
    ax.annotate(label_template.format(cutoff, quantile),
                xy=(cutoff, tdist.pdf(cutoff)),
                xytext=text_offset, textcoords='offset points',
                bbox=dict(boxstyle="round", facecolor="white", edgecolor="firebrick", alpha=0.8))


def _shade_t_zone(ax, tdist, from_quantile, to_quantile):
    """Shade the area under ``tdist`` between two quantiles in translucent red."""
    xs = np.linspace(tdist.ppf(from_quantile), tdist.ppf(to_quantile), 100)
    ax.fill_between(xs, tdist.pdf(xs), 0, alpha=0.3, color='red', zorder=10)
def plot_t_test(sample, mu, alpha=None, tail="two"):
    """Plot a one-sample t-test of ``sample`` against the population mean ``mu``.

    The t distribution is drawn by ``plot_t_distribution``; this function adds
    the observed t statistic, hatches the area corresponding to the p-value
    and annotates both values.

    Args:
        sample: Observations; ``np.size(sample) - 1`` degrees of freedom are used.
        mu: Population mean under the null hypothesis.
        alpha: Significance level forwarded to ``plot_t_distribution``. May be
            ``None``, in which case no rejection zone is drawn.
        tail: ``"two"``, ``"lower"`` or ``"upper"``.
    """
    df = np.size(sample) - 1
    ax = plot_t_distribution(df, alpha=alpha, tail=tail)
    t, p = stats.ttest_1samp(sample, mu)
    tdist = stats.t(df=df, loc=0, scale=1)

    # ttest_1samp reports the two-sided p-value; convert it for one-tailed
    # tests. The inclusive comparisons cover t == 0, where the one-tailed
    # p-value is 0.5 on either side.
    if tail == "lower":
        p = p / 2 if t <= 0 else 1 - p / 2
    elif tail == "upper":
        p = p / 2 if t >= 0 else 1 - p / 2

    # Hatch the tail area(s) that make up the p-value. For a two-tailed test
    # both tails beyond |t| are hatched.
    hatch_style = dict(facecolor="none", edgecolor="firebrick", hatch="///", linewidth=0.0,
                       label=r'p-value $p$={:.3f}'.format(p))
    if tail in ("two", "upper"):
        upper_start = abs(t) if tail == "two" else t
        x_t = np.linspace(upper_start, tdist.ppf(0.9999), 100)
        ax.fill_between(x_t, tdist.pdf(x_t), 0, **hatch_style)
    if tail in ("two", "lower"):
        lower_stop = -abs(t) if tail == "two" else t
        x_t = np.linspace(tdist.ppf(0.0001), lower_stop, 100)
        ax.fill_between(x_t, tdist.pdf(x_t), 0, **hatch_style)

    ax.axvline(x=t, color='firebrick', linestyle='-', linewidth=1)
    if tail == "two":
        # Close the hatched area on the side opposite to t with a short
        # vertical segment from the axis up to the curve.
        ax.plot([-t, -t], [0, tdist.pdf(-t)], color="firebrick", linestyle='-', linewidth=1)

    # Height of the annotation: the p-value itself, capped at 0.3.
    if p < 0.3:
        annotation_y = p
    else:
        annotation_y = 0.3

    # Nudge the label when it would sit close to the cutoff annotation. With
    # alpha=None there is no cutoff annotation (and no quantile to compute).
    annotation_offset = 0
    if alpha is not None:
        cutoff = tdist.ppf(alpha / 2 if tail == "two" else alpha)
        cutoff_y = tdist.pdf(cutoff)
        if np.abs(cutoff_y - annotation_y) < 0.05:
            annotation_offset = 25 if annotation_y > cutoff_y else -5

    ax.annotate('t-test $t$={:.3f}, p={:.3f}'.format(t, p), xy=(t, annotation_y),
                xytext=(5, annotation_offset), textcoords='offset points',
                bbox=dict(boxstyle="round", facecolor="white", edgecolor="firebrick", alpha=0.8))
def plot_mean_distribution(mu, alpha=None, means=None, sample_size=50):
    """Plot the distribution of sample means around the population mean ``mu``.

    Args:
        mu: Population mean; used as the location of the t distribution.
        alpha: Significance level. When not ``None`` the two-tailed rejection
            zones are shaded via ``add_rejection_zones``.
        means: Optional collection of observed sample means (list or array).
            When given, a 100-bin histogram of them is drawn and the t curve
            is rescaled to counts; when ``None`` only a theoretical curve is
            drawn.
        sample_size: Size of each sample; the curve uses ``sample_size - 1``
            degrees of freedom.
    """
    plt.figure(figsize=(8, 4))
    # Raw strings: "\m" is not a valid escape sequence in a normal literal.
    plt.title(r'Distribution of sample means for a population with $\mu$= {:}'.format(mu))
    plt.xlabel('Mean of samples')
    plt.ylabel('Count')

    df = sample_size - 1
    loc = mu
    frame1 = plt.gca()
    mu_box = dict(boxstyle="round", facecolor="white", edgecolor="black", alpha=0.8)

    # Identity test: ``means == None`` is element-wise on NumPy arrays and
    # raises when evaluated as a condition.
    if means is None:
        # Fixed plotting range and scale, hard-coded in the original code.
        # NOTE(review): provenance of 5.2-5.9 and 0.0796 is not visible here.
        x = np.linspace(5.2, 5.9, 100)
        scale = 0.0796
        scale_factor = 1.0
        curve = stats.t.pdf(x, df, loc=mu, scale=scale)

        frame1.plot(x, curve, color='black', linestyle='-', lw=1, alpha=1)
        frame1.set_ylim([0, 6])
        frame1.axes.get_yaxis().set_visible(False)

        # Population mean, as a black line and label.
        plt.axvline(x=mu, color='black', linestyle='-.', linewidth=2)
        frame1.annotate(r"$\mu$={:.3f}".format(mu), xy=(mu, 5.5), xytext=(-20, 0),
                        textcoords='offset points', bbox=mu_box)
    else:
        # Histogram of the observed means: counts, bin edges, patches.
        nn, bb, _patches = plt.hist(means, color="lightgrey", bins=100, density=False)

        # One x position per bin (its right edge). The original wrote
        # len(bb - 1), which is len(bb): one point more than there are bins,
        # so the normalisation below did not match the histogram.
        x = np.linspace(bb[1], bb[-1], len(bb) - 1)

        scale = np.std(means, ddof=1)
        curve = stats.t.pdf(x, df, loc=mu, scale=scale)

        # Rescale the density so that its values sum to the number of means,
        # i.e. it becomes comparable to the bin counts.
        scale_factor = np.size(means) / np.sum(curve)
        curve = curve * scale_factor

        plt.plot(x, curve, color='black', linestyle='-', lw=1, alpha=1)
        frame1.axes.get_yaxis().set_visible(False)
        frame1.set_ylim(bottom=0)

        # Population mean, as a black line and label.
        plt.axvline(x=mu, color='black', linestyle='-.', linewidth=2)
        frame1.annotate(r"$\mu$={:.3f}".format(mu), xy=(mu, np.max(nn)), xytext=(-20, 0),
                        textcoords='offset points', bbox=mu_box)

    if alpha is not None:
        add_rejection_zones(frame1, loc, scale, df, alpha, scale_factor)
def add_rejection_zones(ax, loc, scale, df, alpha, scale_factor):
    """Shade both tails of a t distribution on ``ax`` as rejection zones.

    Args:
        ax: Object providing ``fill_between`` (a matplotlib axes).
        loc: Location of the t distribution.
        scale: Scale of the t distribution.
        df: Degrees of freedom.
        alpha: Significance level; each tail is bounded by ``alpha / 2``.
        scale_factor: Multiplier applied to the density before shading.
    """
    frozen = stats.t(df=df, loc=loc, scale=scale)

    # (start quantile, stop quantile) for the lower and the upper tail.
    tail_bounds = (
        (0.0001, alpha / 2),
        (1 - alpha / 2, 0.9999),
    )
    for q_start, q_stop in tail_bounds:
        xs = np.linspace(frozen.ppf(q_start), frozen.ppf(q_stop), 100)
        heights = frozen.pdf(xs) * scale_factor
        ax.fill_between(xs, heights, 0, alpha=0.3, color='red', zorder=10)

    #ax.axvline(x=t, color='firebrick', linestyle='dashed', linewidth=1)
    #ax.annotate('cut-off $t$={:.3f}'.format(mytdist.ppf(alpha/2)),
    #            xy=(mytdist.ppf(alpha/2), 0),
    #            xytext=(mytdist.ppf(alpha/2), mytdist.pdf(alpha/2)), textcoords='offset points',
    #            bbox=dict(boxstyle="round", facecolor = "white", edgecolor = "firebrick", alpha = 0.8))
def plot_n_and_t(sample_mean, sample_std, mu, from_n, to_n, step_n, plotp=False, df=49, alpha=0.05):
    """Plot |t| (and optionally the p-value) as a function of sample size.

    For every sample size ``n`` in ``np.arange(from_n, to_n, step_n)`` the
    one-sample t statistic is computed from the fixed ``sample_mean`` and
    ``sample_std``, i.e. as if the same mean and standard deviation had been
    observed with ``n`` observations.

    Args:
        sample_mean: Observed sample mean.
        sample_std: Observed sample standard deviation.
        mu: Population mean under the null hypothesis.
        from_n: First sample size (inclusive).
        to_n: Last sample size (exclusive).
        step_n: Step between sample sizes; also the minor x tick spacing.
        plotp: When true, the two-sided p-values are plotted on a second y axis.
        df: Degrees of freedom used for the two reference cutoff lines.
        alpha: Accepted for backward compatibility; the reference lines are
            drawn at the fixed levels 0.05 and 0.01.
    """
    nvalues = np.arange(from_n, to_n, step_n)
    tvalues = (sample_mean - mu) / (sample_std / np.sqrt(nvalues))
    # Two-sided p-value: twice the upper-tail probability beyond |t| with
    # n - 1 degrees of freedom. (The density ``pdf`` is not a p-value.)
    pvalues = 2 * stats.t.sf(np.abs(tvalues), nvalues - 1)

    _fig, ax = plt.subplots(figsize=(8, 4))

    # Set the axes.
    plt.title('$|t|$ as a function of sample size (from {:} to {:})'.format(from_n, to_n))
    ax.set_xlabel('Sample size', color="black")
    ax.set_ylabel('|t|', color="firebrick")
    ax.tick_params(axis='y', labelcolor="firebrick")

    # Minor x ticks at every evaluated sample size, with a dotted grid.
    ax.xaxis.set_minor_locator(MultipleLocator(step_n))
    ax.grid(which='minor', axis='x', linestyle='dotted')
    ax.grid(which='major', axis='x', linestyle='-')
    # Passing None toggles the major y grid; positional because the keyword
    # was renamed from ``b`` to ``visible`` in Matplotlib.
    ax.grid(None, which='major', axis='y')

    # |t| for the different sample sizes.
    ax.plot(nvalues, np.abs(tvalues), color="firebrick")

    # Reference cutoffs, computed from the t distribution so that each line
    # and its label agree.
    reference = stats.t(df=df, loc=0, scale=1)
    box = dict(boxstyle="round", facecolor="white", edgecolor="firebrick", alpha=0.8)
    for level in (0.05, 0.01):
        cutoff = reference.ppf(1 - level / 2)
        # The sketch style is read when the line is created; rc_context
        # restores the previous setting afterwards.
        with plt.rc_context({'path.sketch': (1, 100, 2)}):
            ax.axhline(y=cutoff, color='firebrick', linestyle='-.', linewidth=1)
        ax.annotate(r'cutoff $t_{{\alpha={}}}\approx {:.2f}$'.format(level, cutoff),
                    xy=(from_n, cutoff), xytext=(10, 0), textcoords='offset points', bbox=box)

    if plotp:
        ax2 = ax.twinx()
        # p-values for the different sample sizes.
        ax2.plot(nvalues, pvalues, color="blue")
        ax2.set_xlabel("Sample Size")
        ax2.set_ylabel('p-value', color="blue")
        ax2.tick_params(axis='y', labelcolor="blue")
        ax2.grid(which='major', axis='x', linestyle='-')

        blue_box = dict(boxstyle="round", facecolor="white", edgecolor="blue", alpha=0.8)
        for level in (0.05, 0.01):
            ax2.axhline(y=level, color='blue', linestyle='-.', linewidth=1)
            ax2.annotate(r'$\alpha = {}$'.format(level), xy=(to_n, level), xytext=(-40, 0),
                         textcoords='offset points', bbox=blue_box)
def one_sample_one_tailed(sample_data, mu, alpha=0.05, alternative='greater'):
    """Run a one-tailed one-sample t-test and print the outcome.

    Args:
        sample_data: Observations to test.
        mu: Population mean under the null hypothesis.
        alpha: Significance level.
        alternative: ``'greater'`` (sample mean > mu) or ``'less'`` (sample
            mean < mu). Any other value prints t and the halved two-sided
            p-value without a verdict.

    Returns:
        None. The t statistic, the one-tailed p-value and, when applicable,
        the rejection message are printed.
    """
    t, two_sided_p = stats.ttest_1samp(sample_data, mu)

    # ttest_1samp returns the two-sided p-value. The one-tailed p-value is
    # half of it when t points in the direction of the alternative, and the
    # complement of that half otherwise. (Doubling it, as before, made the
    # test four times too conservative.)
    half = two_sided_p / 2
    if alternative == 'greater':
        p = half if t > 0 else 1 - half
    elif alternative == 'less':
        p = half if t < 0 else 1 - half
    else:
        p = half

    print('t:', t)
    print('p:', p)

    if alternative == 'greater' and (p < alpha) and t > 0:
        print('Reject Null Hypothesis for greater-than test')
    if alternative == 'less' and (p < alpha) and t < 0:
        print('Reject Null Hypothesis for less-than test')
###### TO DELETE
def build_ttest_visualization(ttest_result, alpha):
    """Show a t-test result and its effect size side by side.

    Args:
        ttest_result: Table read with ``.loc["T-test", <column>]`` for the
            columns ``"dof"``, ``"T"``, ``"p-val"`` and ``"cohen-d"``
            (presumably a pandas DataFrame; verify against the caller).
        alpha: Significance level for the two-tailed rejection zone.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    def _declutter(axes):
        # Hide the y axis, the grid and every frame line except the bottom.
        axes.get_yaxis().set_visible(False)
        axes.set_ylim(bottom=0)
        axes.grid(False)
        for side in ('top', 'right', 'left'):
            axes.spines[side].set_visible(False)

    # Pull the numbers out of the result table.
    row = "T-test"
    dof = round(ttest_result.loc[row, "dof"])
    t_stat = ttest_result.loc[row, "T"]
    p_value = ttest_result.loc[row, "p-val"]
    cohen_d = ttest_result.loc[row, "cohen-d"]

    plt.figure(figsize=(14, 4))

    ### 1. Left panel: the t-test itself
    left = plt.subplot(121)
    left.set_title("Result of the t-test")

    null_dist = stats.t(df=dof, loc=0, scale=1)
    far_left = null_dist.ppf(0.0001)
    far_right = null_dist.ppf(0.9999)
    grid_t = np.linspace(far_left, far_right, 100)
    left.plot(grid_t, null_dist.pdf(grid_t), color='black', linewidth=1)

    _declutter(left)

    # Two-tailed rejection zone.
    lower_zone = np.linspace(far_left, null_dist.ppf(alpha / 2), 100)
    upper_zone = np.linspace(null_dist.ppf(1 - alpha / 2), far_right, 100)
    left.fill_between(lower_zone, null_dist.pdf(lower_zone), 0, alpha=0.3, color='red',
                      label=r'threshold $\alpha={}$'.format(alpha))
    left.fill_between(upper_zone, null_dist.pdf(upper_zone), 0, alpha=0.3, color='red')

    # Observed t statistic.
    left.axvline(x=t_stat, color='firebrick', linestyle='dashed', linewidth=1)
    left.annotate('t-test $t$={:.3f}'.format(t_stat), xy=(t_stat, 0), xytext=(-10, 130),
                  textcoords='offset points',
                  bbox=dict(boxstyle="round", facecolor="white", edgecolor="firebrick", alpha=0.8))

    # Hatched tail beyond t, on the side t falls on.
    if t_stat >= 0:
        tail_t = np.linspace(t_stat, far_right, 100)
    else:
        tail_t = np.linspace(far_left, t_stat, 100)
    left.fill_between(tail_t, null_dist.pdf(tail_t), 0, facecolor="none", edgecolor="firebrick",
                      hatch="///", linewidth=0.0, label=r'p-value $p$={:.3f}'.format(p_value))

    left.legend(loc='upper right')

    ### 2. Right panel: the effect size
    right = plt.subplot(122)
    right.set_title("Effect size")

    # First group: standard normal centred on 0.
    first = stats.norm(loc=0, scale=1)
    first_x = np.linspace(first.ppf(0.0001), first.ppf(0.9999), 100)
    first_y = first.pdf(first_x)
    right.plot(first_x, first_y, color='black', alpha=0.3, linewidth=1)
    right.fill_between(first_x, first_y, 0, color='blue', alpha=0.3, label='Year 1 (theoretical)')
    right.axvline(x=0, color='blue', alpha=0.5, linestyle='dashed', linewidth=1)

    # Second group: unit normal shifted by d, to the left when t > 0.
    if t_stat > 0:
        shift = -cohen_d
    else:
        shift = cohen_d
    second = stats.norm(loc=shift, scale=1)
    second_x = np.linspace(second.ppf(0.0001), second.ppf(0.9999), 100)
    second_y = second.pdf(second_x)
    right.plot(second_x, second_y, color='black', alpha=0.3, linewidth=1)
    right.fill_between(second_x, second_y, 0, color='green', alpha=0.3, label='Year 2 (theoretical)')
    right.axvline(x=shift, color='green', alpha=0.5, linestyle='dashed', linewidth=1)

    # Red segment between the two centres, just above the first curve's peak.
    bar_height = np.max(first_y) + .02
    right.plot([0, shift], [bar_height, bar_height], color='red', linewidth=1, marker=".")
    right.annotate("effect size $d$={:.3f}".format(cohen_d), xy=(shift, bar_height), xytext=(15, -5),
                   textcoords='offset points',
                   bbox=dict(boxstyle="round", facecolor="white", edgecolor="red", alpha=0.8))

    _declutter(right)
    right.legend(loc='upper left')

    # Display the graph.
    plt.subplots_adjust(wspace=.1)
    plt.show()
# EOF
Event Timeline
Log In to Comment