import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline

# STYLE.md asks for a fixed seed; this appendix's PUBLISHED numbers were produced with
# numpy's modern Generator seeded at 42, so every simulation cell below opens with
# rng = np.random.default_rng(42) to reproduce the webbook tables exactly.
np.random.seed(418)  # governs any incidental (figure-only) randomness

plt.rcParams.update({
    'figure.figsize': (7.2, 4.2), 'figure.dpi': 110,
    'axes.grid': True, 'grid.alpha': 0.25,
    'axes.spines.top': False, 'axes.spines.right': False,
    'font.size': 11,
})
BLUE, RED, GREEN, ORANGE, PURPLE = '#4C72B0', '#C44E52', '#55A868', '#DD8452', '#8172B3'

# One real dataset anchors the standard-error / CI / test discussion (E.2): the classic
# Michelson-Morley speed-of-light measurements, loaded straight from the course bucket.
MORLEY_URL = ('https://pqyjaywwccbnqpwgeiuv.supabase.co/storage/v1/object/public/'
              'STAT%20418%20Images/Data/Chapter5/morley.csv')

print('environment ready · numpy', np.__version__, '· scipy', stats.__name__)

environment ready · numpy 1.26.4 · scipy scipy.stats


# Biased (MLE, 1/n) vs unbiased (S^2, 1/(n-1)) variance estimator -- MSE decomposition.
rng = np.random.default_rng(42)
mu, sigma = 5.0, 2.0
sigma2 = sigma**2
n_reps = 50000

ns, mse_b, mse_u, b2 = [], [], [], []
print(f"{'n':>5}  {'MSE(MLE 1/n)':>13}  {'MSE(S^2 1/(n-1))':>16}  {'Bias^2(MLE)':>12}  {'Winner':>8}")
print('-' * 64)
for n in [5, 10, 25, 50, 100]:
    biased = np.zeros(n_reps)
    unbiased = np.zeros(n_reps)
    for i in range(n_reps):
        s = rng.normal(mu, sigma, n)
        biased[i] = np.var(s, ddof=0)    # sigma^2_MLE
        unbiased[i] = np.var(s, ddof=1)  # S^2
    mb = np.mean((biased - sigma2)**2)
    mu_ = np.mean((unbiased - sigma2)**2)
    bias_sq = (np.mean(biased) - sigma2)**2
    winner = 'MLE' if mb < mu_ else 'S^2'
    ns.append(n); mse_b.append(mb); mse_u.append(mu_); b2.append(bias_sq)
    print(f"{n:>5}  {mb:>13.4f}  {mu_:>16.4f}  {bias_sq:>12.4f}  {winner:>8}")
print()
print('The BIASED MLE wins MSE at every n: its variance reduction outweighs its bias^2.')
print('Both MSEs -> 2*sigma^4/n = {:.4f} at n=100 as the bias term vanishes.'.format(2*sigma2**2/100))

    n   MSE(MLE 1/n)  MSE(S^2 1/(n-1))   Bias^2(MLE)    Winner
----------------------------------------------------------------
    5         5.8806            8.2137        0.6239       MLE
   10         3.0612            3.5863        0.1563       MLE
   25         1.2490            1.3263        0.0267       MLE
   50         0.6337            0.6524        0.0071       MLE
  100         0.3193            0.3241        0.0017       MLE

The BIASED MLE wins MSE at every n: its variance reduction outweighs its bias^2.
Both MSEs -> 2*sigma^4/n = 0.3200 at n=100 as the bias term vanishes.


# Figure: MSE(MLE) vs MSE(S^2) across n, with the bias^2 slice of the MLE's error.
fig, ax = plt.subplots()
ax.plot(ns, mse_u, 'o-', color=RED, lw=2.2, label=r'MSE($S^2$, unbiased)')
ax.plot(ns, mse_b, 's-', color=GREEN, lw=2.2, label=r'MSE($\hat\sigma^2_{MLE}$, biased)')
ax.plot(ns, b2, '^--', color=PURPLE, lw=1.8, label=r'Bias$^2$ of $\hat\sigma^2_{MLE}$')
ax.set_yscale('log'); ax.set_xscale('log')
ax.set_xlabel('sample size $n$'); ax.set_ylabel('error (log scale)')
ax.set_title('Bias-variance trade-off: the biased MLE has lower MSE at every $n$')
ax.legend(); plt.tight_layout(); plt.show()


# Method of Moments vs MLE for Gamma(alpha=3, beta=2), n=200.
rng = np.random.default_rng(42)
alpha_true, beta_true, n = 3.0, 2.0, 200
sample = rng.gamma(alpha_true, beta_true, n)

m1 = np.mean(sample)
m2 = np.mean(sample**2)
beta_mom = m2 / m1 - m1
alpha_mom = m1 / beta_mom
alpha_mle, _, beta_mle = stats.gamma.fit(sample, floc=0)  # MLE, location fixed at 0

print(f'Gamma(alpha={alpha_true}, beta={beta_true}) sample, n={n}')
print(f'  sample moments : m1 = {m1:.4f}, m2 = {m2:.4f}')
print(f'  Method of Moments : alpha = {alpha_mom:.4f}, beta = {beta_mom:.4f}')
print(f'  Maximum Likelihood: alpha = {alpha_mle:.4f}, beta = {beta_mle:.4f}')
print(f'  True values       : alpha = {alpha_true:.4f}, beta = {beta_true:.4f}')
print('  Both land near the truth; MoM is closed-form, the MLE needs numerical optimization.')

Gamma(alpha=3.0, beta=2.0) sample, n=200
  sample moments : m1 = 6.2028, m2 = 50.5877
  Method of Moments : alpha = 3.1761, beta = 1.9529
  Maximum Likelihood: alpha = 3.2931, beta = 1.8836
  True values       : alpha = 3.0000, beta = 2.0000
  Both land near the truth; MoM is closed-form, the MLE needs numerical optimization.


# Figure: the Gamma sample with the MoM and MLE fitted densities.
fig, ax = plt.subplots()
xx = np.linspace(0, sample.max(), 400)
ax.hist(sample, bins=30, density=True, color=BLUE, alpha=0.45, edgecolor='white',
        label=f'sample (n={n})')
ax.plot(xx, stats.gamma.pdf(xx, alpha_mom, scale=beta_mom), color=ORANGE, lw=2.4,
        label=fr'MoM  $\hat\alpha$={alpha_mom:.2f}, $\hat\beta$={beta_mom:.2f}')
ax.plot(xx, stats.gamma.pdf(xx, alpha_mle, scale=beta_mle), '--', color=RED, lw=2.4,
        label=fr'MLE  $\hat\alpha$={alpha_mle:.2f}, $\hat\beta$={beta_mle:.2f}')
ax.plot(xx, stats.gamma.pdf(xx, alpha_true, scale=beta_true), ':', color='k', lw=1.6,
        label='true Gamma(3, 2)')
ax.set_xlabel('x'); ax.set_ylabel('density')
ax.set_title('Method of Moments vs MLE for the Gamma'); ax.legend(fontsize=9)
plt.tight_layout(); plt.show()


# Sampling distributions of x-bar and S^2 for Normal(mu=5, sigma=2).
rng = np.random.default_rng(42)
mu, sigma = 5.0, 2.0
n_reps = 10000

for n in [15, 100]:
    xbar = np.zeros(n_reps); s2 = np.zeros(n_reps)
    for i in range(n_reps):
        s = rng.normal(mu, sigma, n)
        xbar[i] = np.mean(s); s2[i] = np.var(s, ddof=1)
    chi2_vals = (n - 1) * s2 / sigma**2
    corr = np.corrcoef(xbar, s2)[0, 1]
    print(f'n = {n} ({n_reps:,} replicates):')
    print(f'  x-bar : E = {np.mean(xbar):.4f} (theory {mu:.1f}),  Var = {np.var(xbar):.4f} (theory {sigma**2/n:.4f})')
    print(f'  S^2   : E = {np.mean(s2):.4f} (theory {sigma**2:.1f}),  SD = {np.std(s2):.4f}')
    print(f'  (n-1)S^2/sigma^2 : mean = {np.mean(chi2_vals):.4f} (theory {n-1}),  var = {np.var(chi2_vals):.4f} (theory {2*(n-1):.1f})')
    print(f'  Corr(x-bar, S^2) = {corr:.4f} (theory 0 -- independence)')
    print()

n = 15 (10,000 replicates):
  x-bar : E = 4.9933 (theory 5.0),  Var = 0.2674 (theory 0.2667)
  S^2   : E = 4.0346 (theory 4.0),  SD = 1.5425
  (n-1)S^2/sigma^2 : mean = 14.1211 (theory 14),  var = 29.1477 (theory 28.0)
  Corr(x-bar, S^2) = 0.0202 (theory 0 -- independence)

n = 100 (10,000 replicates):
  x-bar : E = 5.0021 (theory 5.0),  Var = 0.0391 (theory 0.0400)
  S^2   : E = 3.9965 (theory 4.0),  SD = 0.5682
  (n-1)S^2/sigma^2 : mean = 98.9146 (theory 99),  var = 197.7431 (theory 198.0)
  Corr(x-bar, S^2) = 0.0235 (theory 0 -- independence)


# Figure: at n=15, x-bar matches N(mu, sigma^2/n) and (n-1)S^2/sigma^2 matches chi^2_{n-1}.
rng = np.random.default_rng(42)
n = 15; R = 8000
xbar = np.zeros(R); chi = np.zeros(R)
for i in range(R):
    s = rng.normal(mu, sigma, n)
    xbar[i] = np.mean(s)
    chi[i] = (n - 1) * np.var(s, ddof=1) / sigma**2

fig, axes = plt.subplots(1, 2, figsize=(11, 4))
ax = axes[0]
ax.hist(xbar, bins=40, density=True, color=BLUE, alpha=0.5, edgecolor='white')
gx = np.linspace(xbar.min(), xbar.max(), 300)
ax.plot(gx, stats.norm.pdf(gx, mu, sigma/np.sqrt(n)), color=RED, lw=2.4,
        label=r'$N(\mu,\ \sigma^2/n)$')
ax.set_title(r'Sampling distribution of $\bar X$  ($n=15$)')
ax.set_xlabel(r'$\bar X$'); ax.set_ylabel('density'); ax.legend()

ax = axes[1]
ax.hist(chi, bins=40, density=True, color=GREEN, alpha=0.5, edgecolor='white')
cx = np.linspace(0, chi.max(), 300)
ax.plot(cx, stats.chi2.pdf(cx, n - 1), color=RED, lw=2.4, label=r'$\chi^2_{n-1}$')
ax.set_title(r'$(n-1)S^2/\sigma^2$  ($n=15$)')
ax.set_xlabel('value'); ax.set_ylabel('density'); ax.legend()
plt.tight_layout(); plt.show()


# Z (known sigma) vs T (estimated sigma) pivots for Normal(5, 2^2), n=15.
rng = np.random.default_rng(42)
mu, sigma, n = 5.0, 2.0, 15
n_reps = 10000
z_vals = np.zeros(n_reps); t_vals = np.zeros(n_reps); se_vals = np.zeros(n_reps)
for i in range(n_reps):
    s = rng.normal(mu, sigma, n)
    xb = np.mean(s); sd = np.std(s, ddof=1)
    z_vals[i] = (xb - mu) / (sigma / np.sqrt(n))
    t_vals[i] = (xb - mu) / (sd / np.sqrt(n))
    se_vals[i] = sd / np.sqrt(n)

print('Pivots for Normal(mu=5, sigma=2), n=15:')
print(f"{'':>18}  {'Z (known s)':>12}  {'T (est. s)':>12}")
print('-' * 46)
print(f"{'Variance':>18}  {np.var(z_vals):>12.4f}  {np.var(t_vals):>12.4f}")
print(f"{'  theory':>18}  {1.0:>12.4f}  {(n-1)/(n-3):>12.4f}")
print(f"{'Excess kurtosis':>18}  {stats.kurtosis(z_vals):>12.4f}  {stats.kurtosis(t_vals):>12.4f}")
print(f"{'  theory':>18}  {0.0:>12.4f}  {6/(n-5):>12.4f}")
print(f"{'2.5th pct':>18}  {np.percentile(z_vals,2.5):>12.4f}  {np.percentile(t_vals,2.5):>12.4f}")
print(f"{'  theory':>18}  {stats.norm.ppf(0.025):>12.4f}  {stats.t.ppf(0.025, n-1):>12.4f}")
print()
print(f'Estimated SE: E[S/sqrt(n)] = {np.mean(se_vals):.4f}  (theory sigma/sqrt(n) = {sigma/np.sqrt(n):.4f})')

Pivots for Normal(mu=5, sigma=2), n=15:
                     Z (known s)    T (est. s)
----------------------------------------------
          Variance        1.0028        1.1608
            theory        1.0000        1.1667
   Excess kurtosis       -0.1494        0.3359
            theory        0.0000        0.6000
         2.5th pct       -1.9363       -2.1474
            theory       -1.9600       -2.1448

Estimated SE: E[S/sqrt(n)] = 0.5093  (theory sigma/sqrt(n) = 0.5164)


# Figure: T has visibly heavier tails than the standard Normal.
fig, ax = plt.subplots()
xx = np.linspace(-5, 5, 400)
ax.hist(t_vals, bins=60, range=(-5, 5), density=True, color=ORANGE, alpha=0.5,
        edgecolor='white', label=r'simulated $T$ statistic')
ax.plot(xx, stats.norm.pdf(xx), color=BLUE, lw=2.2, label=r'$N(0,1)$')
ax.plot(xx, stats.t.pdf(xx, n - 1), '--', color=RED, lw=2.2, label=r'$t_{14}$')
ax.axvline(stats.norm.ppf(0.025), color=BLUE, lw=1, ls=':')
ax.axvline(stats.t.ppf(0.025, n - 1), color=RED, lw=1, ls=':')
ax.set_xlabel('pivot value'); ax.set_ylabel('density')
ax.set_title(r'Estimating $\sigma$ fattens the tails: $T\sim t_{14}$ vs $N(0,1)$')
ax.legend(); plt.tight_layout(); plt.show()


# CLT: moments of the standardized mean Z_n should approach N(0,1): mean 0, var 1, skew 0, kurt 0.
rng = np.random.default_rng(42)
n_reps = 10000
specs = [
    ('Exp(1)',      lambda k: rng.exponential(1.0, k),   1.0, 1.0),
    ('Chi^2(3)',    lambda k: rng.chisquare(3, k),       3.0, np.sqrt(6)),
    ('Bern(0.3)',   lambda k: rng.binomial(1, 0.3, k),   0.3, np.sqrt(0.3*0.7)),
]
print('Standardized mean Z_n -> N(0,1)   (targets: Mean 0, Var 1, Skew 0, Kurt 0)')
print(f"{'parent':>12}  {'n':>5}  {'Mean':>7}  {'Var':>7}  {'Skew':>7}  {'Kurt':>7}")
print('-' * 52)
for name, sampler, m, sd in specs:
    for n in [5, 15, 50, 200]:
        z = np.zeros(n_reps)
        for i in range(n_reps):
            z[i] = (np.mean(sampler(n)) - m) / (sd / np.sqrt(n))
        print(f"{name:>12}  {n:>5}  {np.mean(z):>7.3f}  {np.var(z):>7.3f}  "
              f"{stats.skew(z):>7.3f}  {stats.kurtosis(z):>7.3f}")
    print()
print('Mean & variance lock in immediately; skewness is the slow coordinate and shrinks ~1/sqrt(n).')

Standardized mean Z_n -> N(0,1)   (targets: Mean 0, Var 1, Skew 0, Kurt 0)
      parent      n     Mean      Var     Skew     Kurt
----------------------------------------------------
      Exp(1)      5    0.001    1.021    0.934    1.284
      Exp(1)     15   -0.005    1.007    0.502    0.315
      Exp(1)     50   -0.009    0.977    0.282    0.154
      Exp(1)    200   -0.002    1.004    0.175    0.055

    Chi^2(3)      5   -0.003    1.012    0.760    0.823
    Chi^2(3)     15    0.023    1.005    0.405    0.253
    Chi^2(3)     50    0.011    1.007    0.232    0.096
    Chi^2(3)    200   -0.002    1.003    0.120    0.135

   Bern(0.3)      5   -0.001    1.002    0.404   -0.197
   Bern(0.3)     15    0.002    0.978    0.270    0.012
   Bern(0.3)     50   -0.003    0.993    0.069   -0.085
   Bern(0.3)    200   -0.003    0.987    0.064   -0.046

Mean & variance lock in immediately; skewness is the slow coordinate and shrinks ~1/sqrt(n).


# Figure: histograms of Z_n for the three parents as n grows (red curve = N(0,1)).
rng = np.random.default_rng(42)
fig, axes = plt.subplots(3, 3, figsize=(11, 8.4))
cols = [5, 30, 200]
xx = np.linspace(-4, 4, 300)
for r, (name, sampler, m, sd) in enumerate(specs):
    for c, n in enumerate(cols):
        z = np.zeros(5000)
        for i in range(5000):
            z[i] = (np.mean(sampler(n)) - m) / (sd / np.sqrt(n))
        ax = axes[r, c]
        ax.hist(z, bins=40, range=(-4, 4), density=True, color=BLUE, alpha=0.5, edgecolor='white')
        ax.plot(xx, stats.norm.pdf(xx), color=RED, lw=2)
        ax.set_yticks([])
        if r == 0:
            ax.set_title(f'n = {n}')
        if c == 0:
            ax.set_ylabel(name, fontsize=11)
fig.suptitle('CLT convergence: skewed parents are pulled toward $N(0,1)$ as $n$ grows', y=1.0)
plt.tight_layout(); plt.show()


# Real data: Michelson-Morley speed-of-light measurements (km/s - 299000).
morley = pd.read_csv(MORLEY_URL)
x = morley['Speed'].astype(float).to_numpy()
n = len(x)
xbar = x.mean()
s = x.std(ddof=1)
se = s / np.sqrt(n)
t_crit = stats.t.ppf(0.975, n - 1)
ci = (xbar - t_crit * se, xbar + t_crit * se)

print(f'Michelson 1879: n = {n} measurements of (speed of light - 299000) km/s')
print(f'  x-bar = {xbar:.2f}   S = {s:.3f}   SE = S/sqrt(n) = {se:.3f}')
print(f'  95% t-interval: [{ci[0]:.2f}, {ci[1]:.2f}]   (t_crit = {t_crit:.4f}, df = {n-1})')
print(f'  => Michelson estimated c ~ {299000 + xbar:.1f} km/s')

# Duality preview: one-sample t-test against the modern accepted value.
mu0 = 792.458   # 299792.458 - 299000
t_stat = (xbar - mu0) / se
p_val = 2 * stats.t.sf(abs(t_stat), n - 1)
print()
print(f'  H0: mu = {mu0} (modern value).  t = {t_stat:.3f}, two-sided p = {p_val:.2e}')
print(f'  {mu0} lies OUTSIDE the 95% interval -> reject H0: a real ~{xbar-mu0:.0f} km/s upward bias.')
print('  (CI and test agree by construction -- the duality formalized in E.4.)')

Michelson 1879: n = 100 measurements of (speed of light - 299000) km/s
  x-bar = 852.40   S = 79.011   SE = S/sqrt(n) = 7.901
  95% t-interval: [836.72, 868.08]   (t_crit = 1.9842, df = 99)
  => Michelson estimated c ~ 299852.4 km/s

  H0: mu = 792.458 (modern value).  t = 7.587, two-sided p = 1.82e-11
  792.458 lies OUTSIDE the 95% interval -> reject H0: a real ~60 km/s upward bias.
  (CI and test agree by construction -- the duality formalized in E.4.)


# Coverage of the 95% t-interval: simulate many intervals for a KNOWN mu and count hits.
rng = np.random.default_rng(42)
mu, sigma, n = 5.0, 2.0, 25
alpha = 0.05
t_crit = stats.t.ppf(1 - alpha/2, df=n - 1)

# 200 intervals for the headline coverage number...
captures = 0
for i in range(200):
    s = rng.normal(mu, sigma, n)
    se = np.std(s, ddof=1) / np.sqrt(n)
    lo, hi = np.mean(s) - t_crit*se, np.mean(s) + t_crit*se
    captures += (lo <= mu <= hi)
print(f'95% t-intervals (n={n}, 200 intervals): captured mu in {captures}/200 = {captures/200:.1%}')
print(f'  expected ~190 captures; t critical value = {t_crit:.4f}')

# ...and store the first 100 (with capture flags) for the caterpillar plot.
rng = np.random.default_rng(7)
los, his, mids, hit = [], [], [], []
for i in range(100):
    s = rng.normal(mu, sigma, n)
    se = np.std(s, ddof=1) / np.sqrt(n)
    m = np.mean(s)
    lo, hi = m - t_crit*se, m + t_crit*se
    los.append(lo); his.append(hi); mids.append(m); hit.append(lo <= mu <= hi)
print(f'  caterpillar sample: {sum(hit)}/100 intervals cover mu = {mu}')

95% t-intervals (n=25, 200 intervals): captured mu in 187/200 = 93.5%
  expected ~190 captures; t critical value = 2.0639
  caterpillar sample: 94/100 intervals cover mu = 5.0


# Figure: 100 confidence intervals; green = covers mu, red = misses.
fig, ax = plt.subplots(figsize=(7.6, 5.0))
for i in range(100):
    c = GREEN if hit[i] else RED
    ax.plot([los[i], his[i]], [i, i], color=c, lw=1.4)
    ax.plot(mids[i], i, 'o', color=c, ms=2.5)
ax.axvline(mu, color=BLUE, lw=1.6, ls='--', label=fr'true $\mu={mu}$')
ax.set_xlabel('interval'); ax.set_ylabel('replicate')
ax.set_title(f'100 independent 95% $t$-intervals: {sum(hit)} cover $\\mu$, {100-sum(hit)} miss')
ax.legend(loc='upper right'); plt.tight_layout(); plt.show()


# Wald vs Wilson coverage for a small proportion p = 0.05.
rng = np.random.default_rng(42)
p_true = 0.05
n_reps = 50000
z = 1.96
ns_p, cov_wald, cov_wilson = [], [], []
print('Coverage of nominal-95% CI for Binomial proportion (p = 0.05)')
print(f"{'n':>5}  {'n*p':>5}  {'Wald':>8}  {'Wilson':>8}")
print('-' * 33)
for n in [10, 20, 50, 200]:
    wald = wilson = 0
    for i in range(n_reps):
        x = rng.binomial(n, p_true); ph = x / n
        se = np.sqrt(ph*(1-ph)/n)
        if ph - z*se <= p_true <= ph + z*se:
            wald += 1
        den = 1 + z**2/n
        center = (ph + z**2/(2*n)) / den
        margin = z*np.sqrt(ph*(1-ph)/n + z**2/(4*n**2)) / den
        if center - margin <= p_true <= center + margin:
            wilson += 1
    ns_p.append(n); cov_wald.append(wald/n_reps); cov_wilson.append(wilson/n_reps)
    print(f"{n:>5}  {n*p_true:>5.1f}  {wald/n_reps:>8.3f}  {wilson/n_reps:>8.3f}")
print()
print('Wald is catastrophic at n=10 (40.3% vs nominal 95%) and still under-covers at n=200.')
print('Wilson holds above 90% throughout -- it inverts the better-behaved score test.')

Coverage of nominal-95% CI for Binomial proportion (p = 0.05)
    n    n*p      Wald    Wilson
---------------------------------
   10    0.5     0.403     0.914
   20    1.0     0.640     0.924
   50    2.5     0.922     0.963
  200   10.0     0.925     0.966

Wald is catastrophic at n=10 (40.3% vs nominal 95%) and still under-covers at n=200.
Wilson holds above 90% throughout -- it inverts the better-behaved score test.


# Figure: actual coverage vs n for the two proportion intervals.
fig, ax = plt.subplots()
ax.plot(ns_p, [100*c for c in cov_wald], 'o-', color=RED, lw=2.2, label='Wald')
ax.plot(ns_p, [100*c for c in cov_wilson], 's-', color=GREEN, lw=2.2, label='Wilson (score)')
ax.axhline(95, color='k', ls='--', lw=1.2, label='nominal 95%')
ax.set_xscale('log')
for n, c in zip(ns_p, cov_wald):
    ax.annotate(f'{100*c:.1f}%', (n, 100*c), textcoords='offset points', xytext=(0, -14),
                ha='center', fontsize=8, color=RED)
ax.set_xlabel('sample size $n$  (log scale)'); ax.set_ylabel('actual coverage (%)')
ax.set_title('Coverage of nominal-95% CIs for $p=0.05$: Wald collapses, Wilson holds')
ax.set_ylim(35, 100); ax.legend(loc='lower right'); plt.tight_layout(); plt.show()


# P-values under H0 (mu=0) and H1 (mu=0.5): two-sided one-sample t-test, n=25.
rng = np.random.default_rng(42)
n = 25; n_reps = 10000

pvals_h0 = np.zeros(n_reps)
for i in range(n_reps):
    s = rng.normal(0.0, 1.0, n)
    t = np.mean(s) / (np.std(s, ddof=1) / np.sqrt(n))
    pvals_h0[i] = 2 * stats.t.sf(abs(t), df=n - 1)

pvals_h1 = np.zeros(n_reps)
for i in range(n_reps):
    s = rng.normal(0.5, 1.0, n)
    t = np.mean(s) / (np.std(s, ddof=1) / np.sqrt(n))
    pvals_h1[i] = 2 * stats.t.sf(abs(t), df=n - 1)

print('Two-sided one-sample t-test, n=25')
print('Under H0 (mu=0):')
print(f'  mean p-value      = {np.mean(pvals_h0):.4f}  (theory 0.5000)')
print(f'  reject at a=0.05  = {np.mean(pvals_h0 < 0.05):.4f}  (theory 0.0500 = Type-I rate)')
print(f'  reject at a=0.01  = {np.mean(pvals_h0 < 0.01):.4f}  (theory 0.0100)')
print(f'  fraction in [0.4,0.6) = {np.mean((pvals_h0>=0.4)&(pvals_h0<0.6)):.4f}  (theory 0.2000 -- uniform)')
print('Under H1 (mu=0.5, delta/sigma=0.5):')
print(f'  mean p-value      = {np.mean(pvals_h1):.4f}')
print(f'  power at a=0.05   = {np.mean(pvals_h1 < 0.05):.4f}')
print(f'  power at a=0.01   = {np.mean(pvals_h1 < 0.01):.4f}')
print('  => n=25 has only ~67% power vs a medium effect: it misses one in three real effects.')

Two-sided one-sample t-test, n=25
Under H0 (mu=0):
  mean p-value      = 0.5001  (theory 0.5000)
  reject at a=0.05  = 0.0518  (theory 0.0500 = Type-I rate)
  reject at a=0.01  = 0.0103  (theory 0.0100)
  fraction in [0.4,0.6) = 0.1985  (theory 0.2000 -- uniform)
Under H1 (mu=0.5, delta/sigma=0.5):
  mean p-value      = 0.0808
  power at a=0.05   = 0.6654
  power at a=0.01   = 0.3992
  => n=25 has only ~67% power vs a medium effect: it misses one in three real effects.


# Figure: p-value histograms -- flat under H0, piled near 0 under H1.
fig, axes = plt.subplots(1, 2, figsize=(11, 4))
ax = axes[0]
ax.hist(pvals_h0, bins=20, range=(0, 1), density=True, color=BLUE, alpha=0.65, edgecolor='white')
ax.axhline(1.0, color=RED, lw=2, ls='--', label='Uniform(0,1)')
ax.axvline(0.05, color='k', lw=1, ls=':')
ax.set_title(r'Under $H_0$: p-values are Uniform'); ax.set_xlabel('p-value'); ax.set_ylabel('density')
ax.legend()
ax = axes[1]
ax.hist(pvals_h1, bins=20, range=(0, 1), density=True, color=ORANGE, alpha=0.7, edgecolor='white')
ax.axvline(0.05, color='k', lw=1, ls=':', label=r'$\alpha=0.05$')
ax.set_title(r'Under $H_1$ ($\delta/\sigma=0.5$): mass shifts to 0 (= power)')
ax.set_xlabel('p-value'); ax.set_ylabel('density'); ax.legend()
plt.tight_layout(); plt.show()


# Power grid: two-sided one-sample t-test, alpha = 0.05.
rng = np.random.default_rng(42)
n_reps = 10000; alpha = 0.05
effects = [0.0, 0.2, 0.5, 0.8, 1.0]
sizes = [10, 25, 50, 100]
power = {n: [] for n in sizes}

print('Power of the two-sided one-sample t-test (alpha = 0.05)')
header = 'delta/sigma  ' + '  '.join(f'n={n:>4}' for n in sizes)
print(header); print('-' * len(header))
for d in effects:
    row = []
    for n in sizes:
        rej = 0
        for i in range(n_reps):
            s = rng.normal(d, 1.0, n)
            t = np.mean(s) / (np.std(s, ddof=1) / np.sqrt(n))
            rej += (2 * stats.t.sf(abs(t), df=n - 1) < alpha)
        pw = rej / n_reps
        power[n].append(pw); row.append(pw)
    label = f'{d:>9.1f}  '
    print(label + '  '.join(f'{v:>6.3f}' for v in row))
print()
print('Row delta=0 ~ 0.05 confirms the level; a "small" effect needs large n, a "large" effect little.')

Power of the two-sided one-sample t-test (alpha = 0.05)
delta/sigma  n=  10  n=  25  n=  50  n= 100
-------------------------------------------
      0.0   0.051   0.050   0.051   0.048
      0.2   0.090   0.161   0.291   0.509
      0.5   0.291   0.662   0.935   0.998
      0.8   0.617   0.970   1.000   1.000
      1.0   0.807   0.998   1.000   1.000

Row delta=0 ~ 0.05 confirms the level; a "small" effect needs large n, a "large" effect little.


# Figure: power curves vs effect size for each n, with Cohen marks and the 0.80 threshold.
rng = np.random.default_rng(123)
grid = np.linspace(0, 1.0, 21)
fig, ax = plt.subplots()
colors = {10: RED, 25: ORANGE, 50: GREEN, 100: BLUE}
for n in sizes:
    curve = []
    for d in grid:
        rej = 0
        for i in range(4000):
            s = rng.normal(d, 1.0, n)
            t = np.mean(s) / (np.std(s, ddof=1) / np.sqrt(n))
            rej += (2 * stats.t.sf(abs(t), df=n - 1) < 0.05)
        curve.append(rej / 4000)
    ax.plot(grid, curve, '-', color=colors[n], lw=2.2, label=f'n = {n}')
ax.axhline(0.80, color='k', ls='--', lw=1, label='80% power')
for b, name in [(0.2, 'small'), (0.5, 'medium'), (0.8, 'large')]:
    ax.axvline(b, color='gray', ls=':', lw=0.9)
    ax.text(b, 0.02, name, rotation=90, va='bottom', ha='right', fontsize=8, color='gray')
ax.set_xlabel(r'effect size $\delta/\sigma$'); ax.set_ylabel('power')
ax.set_title('Power curves: larger $n$ detects smaller effects'); ax.legend(loc='lower right')
plt.tight_layout(); plt.show()


# Bonferroni vs Benjamini-Hochberg vs unadjusted across 200 simultaneous t-tests.
rng = np.random.default_rng(42)
n_tests, n_true_null, delta, n_per, alpha = 200, 180, 0.5, 30, 0.05
is_null = np.zeros(n_tests, dtype=bool); is_null[:n_true_null] = True

p_values = np.zeros(n_tests)
for i in range(n_tests):
    s = rng.normal(0.0 if is_null[i] else delta, 1.0, n_per)
    t = np.mean(s) / (np.std(s, ddof=1) / np.sqrt(n_per))
    p_values[i] = 2 * stats.t.sf(abs(t), df=n_per - 1)

raw = p_values < alpha
bonf = p_values < alpha / n_tests
order = np.argsort(p_values); sp = p_values[order]
thresh = np.arange(1, n_tests + 1) * alpha / n_tests
k_max = 0
for k in range(n_tests):
    if sp[k] <= thresh[k]:
        k_max = k + 1
bh = np.zeros(n_tests, dtype=bool); bh[order[:k_max]] = True

def report(name, rej):
    fp = int(np.sum(rej & is_null)); tp = int(np.sum(rej & ~is_null)); tot = int(np.sum(rej))
    fdr = fp / tot if tot else 0.0; pw = tp / np.sum(~is_null); fwer = int(fp > 0)
    print(f"{name:>12}  {tot:>8}  {fp:>4}  {tp:>4}  {fdr:>7.3f}  {pw:>7.3f}  {fwer:>5}")

print(f'200 tests: {n_true_null} true nulls, {n_tests-n_true_null} effects (delta={delta}, n={n_per})')
print(f"{'Method':>12}  {'Rejected':>8}  {'FP':>4}  {'TP':>4}  {'FDR':>7}  {'Power':>7}  {'FWER':>5}")
print('-' * 58)
report('Unadjusted', raw)
report('Bonferroni', bonf)
report('BH (FDR)', bh)
print()
print('Unadjusted: 70% power but 44% of "discoveries" are false. Bonferroni kills false')
print('positives but finds only 5/20. BH recovers 8/20 with no false discoveries here.')

200 tests: 180 true nulls, 20 effects (delta=0.5, n=30)
      Method  Rejected    FP    TP      FDR    Power   FWER
----------------------------------------------------------
  Unadjusted        25    11    14    0.440    0.700      1
  Bonferroni         5     0     5    0.000    0.250      0
    BH (FDR)         8     0     8    0.000    0.400      0

Unadjusted: 70% power but 44% of "discoveries" are false. Bonferroni kills false
positives but finds only 5/20. BH recovers 8/20 with no false discoveries here.


# Fisher information via the variance of the score, single-observation samples.
rng = np.random.default_rng(42)
N = 50000
print('Fisher information I_1(theta) = Var(score)   (50,000 single-obs samples)')
print(f"{'distribution':>20}  {'theory':>12}  {'sim Var(score)':>15}  {'ratio':>7}")
print('-' * 62)

lam = 3.0; x = rng.poisson(lam, N); sc = x/lam - 1
print(f"{'Poisson(3)':>20}  {1/lam:>12.6f}  {np.var(sc):>15.6f}  {np.var(sc)/(1/lam):>7.4f}")

mu, s2 = 5.0, 4.0; x = rng.normal(mu, np.sqrt(s2), N); sc = (x - mu)/s2
print(f"{'Normal(5, 4)':>20}  {1/s2:>12.6f}  {np.var(sc):>15.6f}  {np.var(sc)/(1/s2):>7.4f}")

le = 2.0; x = rng.exponential(1.0/le, N); sc = 1.0/le - x
print(f"{'Exp(2)':>20}  {1/le**2:>12.6f}  {np.var(sc):>15.6f}  {np.var(sc)/(1/le**2):>7.4f}")

p = 0.3; x = rng.binomial(1, p, N); sc = x/p - (1-x)/(1-p)
print(f"{'Bernoulli(0.3)':>20}  {1/(p*(1-p)):>12.6f}  {np.var(sc):>15.6f}  {np.var(sc)/(1/(p*(1-p))):>7.4f}")
print()
print('All ratios within ~1% of 1.0 -- the score-variance definition checks out.')

Fisher information I_1(theta) = Var(score)   (50,000 single-obs samples)
        distribution        theory   sim Var(score)    ratio
--------------------------------------------------------------
          Poisson(3)      0.333333         0.334244   1.0027
        Normal(5, 4)      0.250000         0.247060   0.9882
              Exp(2)      0.250000         0.249684   0.9987
      Bernoulli(0.3)      4.761905         4.756090   0.9988

All ratios within ~1% of 1.0 -- the score-variance definition checks out.


# CRLB: Var(x-bar) vs 1/(n I_1) for Poisson(lambda=3).
rng = np.random.default_rng(42)
lam = 3.0; I1 = 1.0/lam; n_reps = 50000
ns_c, var_sim, crlb = [], [], []
print('CRLB check, Poisson(3):  I_1 = 1/lambda = %.6f' % I1)
print(f"{'n':>6}  {'Var(x-bar) sim':>14}  {'1/(n I_1)':>12}  {'ratio':>7}")
print('-' * 46)
for n in [10, 25, 50, 100, 500]:
    xb = np.array([np.mean(rng.poisson(lam, n)) for _ in range(n_reps)])
    v = np.var(xb); b = 1.0/(n*I1)
    ns_c.append(n); var_sim.append(v); crlb.append(b)
    print(f"{n:>6}  {v:>14.6f}  {b:>12.6f}  {v/b:>7.4f}")
print('Var(x-bar) tracks the bound to <1.5% -- the MLE is efficient for this exponential family.')

CRLB check, Poisson(3):  I_1 = 1/lambda = 0.333333
     n  Var(x-bar) sim     1/(n I_1)    ratio
----------------------------------------------
    10        0.296462      0.300000   0.9882
    25        0.118517      0.120000   0.9876
    50        0.059987      0.060000   0.9998
   100        0.029772      0.030000   0.9924
   500        0.006016      0.006000   1.0026
Var(x-bar) tracks the bound to <1.5% -- the MLE is efficient for this exponential family.


# Figure: simulated Var(x-bar) sits on the Cramer-Rao bound 1/(n I_1) (log-log).
fig, ax = plt.subplots()
ax.plot(ns_c, crlb, '-', color=RED, lw=2.2, label=r'CRLB $1/(nI_1)$')
ax.plot(ns_c, var_sim, 'o', color=BLUE, ms=8, label=r'simulated $\mathrm{Var}(\bar X)$')
ax.set_xscale('log'); ax.set_yscale('log')
ax.set_xlabel('sample size $n$'); ax.set_ylabel(r'$\mathrm{Var}(\bar X)$')
ax.set_title('The Poisson MLE achieves the Cramér-Rao bound'); ax.legend()
plt.tight_layout(); plt.show()


# (a) MLE asymptotic normality for Poisson(3); (b) delta method for g(lambda) = sqrt(lambda).
rng = np.random.default_rng(42)
lam = 3.0; I1 = 1.0/lam; n_reps = 50000

print('(a) Standardized MLE Z_n = sqrt(n I_1)(lambda_hat - lambda) -> N(0,1)')
print(f"{'n':>6}  {'Mean':>7}  {'Var':>7}  {'Skew':>7}  {'Kurt':>7}")
print('-' * 40)
for n in [10, 50, 200, 1000]:
    z = np.array([np.sqrt(n*I1)*(np.mean(rng.poisson(lam, n)) - lam) for _ in range(n_reps)])
    print(f"{n:>6}  {np.mean(z):>7.4f}  {np.var(z):>7.4f}  {stats.skew(z):>7.4f}  {stats.kurtosis(z):>7.4f}")

print()
print('(b) Delta method: Var(sqrt(lambda_hat)) ~ [g\'(lambda)]^2 Var(lambda_hat) = 1/(4n)')
for n in [25, 100, 500]:
    g = np.array([np.sqrt(np.mean(rng.poisson(lam, n))) for _ in range(n_reps)])
    sim = np.var(g); delta = 1.0/(4*n)
    print(f"  n={n:>4}: simulated = {sim:.6f}, delta = {delta:.6f}, ratio = {sim/delta:.4f}")
print('  The 1/(4n) variance is independent of lambda -- sqrt stabilizes the Poisson variance.')

(a) Standardized MLE Z_n = sqrt(n I_1)(lambda_hat - lambda) -> N(0,1)
     n     Mean      Var     Skew     Kurt
----------------------------------------
    10  -0.0005   0.9882   0.1750   0.0266
    50  -0.0015   0.9950   0.0720  -0.0004
   200  -0.0002   0.9986   0.0345  -0.0141
  1000  -0.0010   1.0038   0.0117   0.0175

(b) Delta method: Var(sqrt(lambda_hat)) ~ [g'(lambda)]^2 Var(lambda_hat) = 1/(4n)
  n=  25: simulated = 0.010015, delta = 0.010000, ratio = 1.0015
  n= 100: simulated = 0.002519, delta = 0.002500, ratio = 1.0076
  n= 500: simulated = 0.000494, delta = 0.000500, ratio = 0.9872
  The 1/(4n) variance is independent of lambda -- sqrt stabilizes the Poisson variance.


# Solution 1.  One rng stream, seeded 42: Normal(0,1) first, then Exp(1) -- matches the webbook.
rng = np.random.default_rng(42)
n_reps = 10000; alpha = 0.05
print(f"{'parent':>12}  {'n':>4}  {'coverage':>9}")
print('-' * 29)
cap = 0
for i in range(n_reps):
    s = rng.normal(0.0, 1.0, 15); se = np.std(s, ddof=1)/np.sqrt(15)
    tc = stats.t.ppf(0.975, 14)
    cap += (np.mean(s) - tc*se <= 0.0 <= np.mean(s) + tc*se)
print(f"{'Normal(0,1)':>12}  {15:>4}  {cap/n_reps:>9.4f}")
for n in [5, 15, 50]:
    cap = 0
    for i in range(n_reps):
        s = rng.exponential(1.0, n); se = np.std(s, ddof=1)/np.sqrt(n)
        tc = stats.t.ppf(0.975, n - 1)
        cap += (np.mean(s) - tc*se <= 1.0 <= np.mean(s) + tc*se)
    print(f"{'Exp(1)':>12}  {n:>4}  {cap/n_reps:>9.4f}")
print('Normal: ~0.95 (exact). Exp(1): ~0.88 at n=5, climbing toward 0.95 as n grows.')
print('Skew breaks both CLT-normality of x-bar AND the x-bar / S independence the t-pivot needs.')

      parent     n   coverage
-----------------------------
 Normal(0,1)    15     0.9510
      Exp(1)     5     0.8822
      Exp(1)    15     0.9071
      Exp(1)    50     0.9386
Normal: ~0.95 (exact). Exp(1): ~0.88 at n=5, climbing toward 0.95 as n grows.
Skew breaks both CLT-normality of x-bar AND the x-bar / S independence the t-pivot needs.


# Solution 2.
rng = np.random.default_rng(42)
n = 25; n_reps = 10000; alpha = 0.05
tcrit = stats.t.ppf(1 - alpha/2, df=n - 1)
print(f"{'mu1':>5}  {'simulated':>10}  {'analytical':>11}")
print('-' * 30)
for mu1 in np.arange(0, 1.05, 0.1):
    rej = 0
    for i in range(n_reps):
        s = rng.normal(mu1, 1.0, n)
        t = np.mean(s) / (np.std(s, ddof=1) / np.sqrt(n))
        rej += (2 * stats.t.sf(abs(t), df=n - 1) < alpha)
    ncp = mu1 * np.sqrt(n)
    anal = stats.nct.sf(tcrit, df=n - 1, nc=ncp) + stats.nct.cdf(-tcrit, df=n - 1, nc=ncp)
    print(f"{mu1:>5.1f}  {rej/n_reps:>10.4f}  {anal:>11.4f}")
print('Simulated and noncentral-t power agree to ~0.01; 80% power is reached near mu1 = 0.6.')

  mu1   simulated   analytical
------------------------------
  0.0      0.0518       0.0500
  0.1      0.0765       0.0768
  0.2      0.1522       0.1605
  0.3      0.3078       0.3020
  0.4      0.4883       0.4840
  0.5      0.6756       0.6697
  0.6      0.8237       0.8207
  0.7      0.9160       0.9188
  0.8      0.9688       0.9696
  0.9      0.9903       0.9907
  1.0      0.9981       0.9977
Simulated and noncentral-t power agree to ~0.01; 80% power is reached near mu1 = 0.6.


# Solution 3.
rng = np.random.default_rng(42)
n = 25; n_reps = 10000
p0 = np.zeros(n_reps)
for i in range(n_reps):
    s = rng.normal(0.0, 1.0, n)
    t = np.mean(s) / (np.std(s, ddof=1) / np.sqrt(n))
    p0[i] = 2 * stats.t.sf(abs(t), df=n - 1)
print('P-value deciles under H0 (each should be ~0.10):')
for k in range(10):
    lo, hi = k/10, (k+1)/10
    print(f'  [{lo:.1f}, {hi:.1f})  {np.mean((p0>=lo)&(p0<hi)):.4f}')
print()
print('Under H1:   delta   mean p   P(p<0.05)')
for d in [0.2, 0.5, 0.8]:
    p1 = np.zeros(n_reps)
    for i in range(n_reps):
        s = rng.normal(d, 1.0, n)
        t = np.mean(s) / (np.std(s, ddof=1) / np.sqrt(n))
        p1[i] = 2 * stats.t.sf(abs(t), df=n - 1)
    print(f'           {d:>5.1f}   {np.mean(p1):>6.4f}   {np.mean(p1<0.05):>8.4f}')
print('Deciles ~0.10 confirm uniformity; mean p falls and power rises as the effect grows.')

P-value deciles under H0 (each should be ~0.10):
  [0.0, 0.1)  0.1021
  [0.1, 0.2)  0.0950
  [0.2, 0.3)  0.1021
  [0.3, 0.4)  0.1010
  [0.4, 0.5)  0.1002
  [0.5, 0.6)  0.0983
  [0.6, 0.7)  0.1025
  [0.7, 0.8)  0.1015
  [0.8, 0.9)  0.0992
  [0.9, 1.0)  0.0981

Under H1:   delta   mean p   P(p<0.05)
             0.2   0.3714     0.1622
             0.5   0.0810     0.6702
             0.8   0.0068     0.9713
Deciles ~0.10 confirm uniformity; mean p falls and power rises as the effect grows.

§	Topic	Core simulation
E.1	Point estimation: bias / variance / MSE	biased vs. unbiased $\hat\sigma^2$; Method of Moments
E.2	Sampling distributions & the CLT	$\bar X$, $S^2$, $\chi^2$, the $t$ pivot; CLT convergence grid; real data (Morley)
E.3	Confidence intervals & coverage	100-interval caterpillar; Wald vs. Wilson breakdown
E.4	Hypothesis testing: p-values & power	Uniform-under-$H_0$; power grid; multiple testing
E.5	Optional / self-study: information, efficiency & asymptotics	Fisher info, CRLB, MLE normality, delta method
—	Exercises + Summary	coverage / power / p-value drills → Chapters 3–5

Concept	Key result	Confirmed by
Bias-variance	$\text{MSE}=\text{Bias}^2+\text{Var}$	biased $\hat\sigma^2$ wins MSE at every $n$
Sampling dist.	$\bar X\sim N(\mu,\sigma^2/n)$, $(n-1)S^2/\sigma^2\sim\chi^2_{n-1}$, $\bar X\perp S^2$	matched means/vars, $\text{corr}\approx0$
$t$ pivot	heavier tails, $\text{Var}=\tfrac{n-1}{n-3}$	$1.16$ at $n=15$
CLT	$\sqrt n(\bar X-\mu)/\sigma\to N(0,1)$	skewness $\sim1/\sqrt n$ for 3 parents
Coverage	$P(\theta\in\text{CI})=1-\alpha$	$93.5\%$ over 200 $t$-intervals
Wald failure	collapses at small $np$	$40.3\%$ coverage at $n=10$
p-value	Uniform$(0,1)$ under $H_0$	flat deciles, reject rate $=\alpha$
Power	$1-\beta$ grows in $\delta,\,n$	grid + noncentral-$t$ agree
Fisher info / CRLB	$\text{Var}(\hat\theta)\ge 1/(nI_1)$	Poisson $\bar X$ on the bound

Appendix E — Statistical Inference Review¶

Learning outcomes¶

Section map¶

E.1 — Point Estimation: bias, variance, and MSE¶

Method of Moments¶

E.2 — Sampling distributions and the Central Limit Theorem¶

The $t$ pivot: the price of estimating $\sigma$¶

The Central Limit Theorem in action¶

Real-data anchor: Michelson-Morley speed of light¶

E.3 — Confidence intervals and the coverage guarantee¶

When coverage breaks: the Wald proportion interval¶

E.4 — Hypothesis testing: p-values and power¶

Power across effect size and sample size¶

Multiple testing: paying for many looks¶

E.5 — Optional / self-study: information, efficiency, and asymptotics¶

Exercises¶

Summary & connections forward¶