import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline

np.random.seed(418)                     # reproducibility (STYLE.md contract)
plt.rcParams.update({
    'figure.figsize': (7.2, 4.2), 'figure.dpi': 110,
    'axes.grid': True, 'grid.alpha': 0.25,
    'axes.spines.top': False, 'axes.spines.right': False,
    'font.size': 11,
})
BLUE, RED, GREEN, ORANGE, PURPLE = '#4C72B0', '#C44E52', '#55A868', '#DD8452', '#8172B3'

# Course data bucket. Chapter 2 has no data of its own, so the two real-data
# touchpoints borrow datasets that live under other chapters' folders.
SUPA = ('https://pqyjaywwccbnqpwgeiuv.supabase.co/storage/v1/object/public/'
        'STAT%20418%20Images/Data/')
def data_url(chapter, name):
    return f'{SUPA}Chapter{chapter}/{name}'

print('environment ready · numpy', np.__version__, '· pandas', pd.__version__)

environment ready · numpy 1.26.4 · pandas 2.3.3


# Monte Carlo estimate of pi (seed 42 reproduces the webbook's 3.143080).
rng = np.random.default_rng(42)
n = 100_000
x = rng.uniform(-1, 1, n)
y = rng.uniform(-1, 1, n)
inside = (x**2 + y**2) <= 1.0

p_hat = inside.mean()
pi_hat = 4 * p_hat
se_pi = 4 * np.sqrt(p_hat * (1 - p_hat) / n)        # Bernoulli SE, scaled by 4
ci = (pi_hat - 1.96 * se_pi, pi_hat + 1.96 * se_pi)

print(f'pi estimate : {pi_hat:.6f}   (n = {n:,})')
print(f'true pi     : {np.pi:.6f}')
print(f'error       : {abs(pi_hat - np.pi):.6f}')
print(f'std error   : {se_pi:.6f}')
print(f'95% CI      : ({ci[0]:.6f}, {ci[1]:.6f})   -> contains pi: {ci[0] < np.pi < ci[1]}')

pi estimate : 3.143080   (n = 100,000)
true pi     : 3.141593
error       : 0.001487
std error   : 0.005190
95% CI      : (3.132908, 3.153252)   -> contains pi: True


# Figure: (left) the dartboard, (right) the noisy O(1/sqrt n) convergence.
fig, (axL, axR) = plt.subplots(1, 2, figsize=(11.5, 4.4))

sub = slice(0, 4000)                                  # plot a subsample for clarity
axL.scatter(x[sub][inside[sub]], y[sub][inside[sub]], s=4, color=BLUE, alpha=0.5, label='inside')
axL.scatter(x[sub][~inside[sub]], y[sub][~inside[sub]], s=4, color=ORANGE, alpha=0.5, label='outside')
th = np.linspace(0, 2*np.pi, 200)
axL.plot(np.cos(th), np.sin(th), color='k', lw=1.5)
axL.set_aspect('equal'); axL.set_title(f'{4000} darts: blue/total approx pi/4')
axL.set_xlabel('x'); axL.set_ylabel('y'); axL.legend(loc='upper right', fontsize=8)

run_p = np.cumsum(inside) / np.arange(1, n + 1)       # running estimate of p
run_pi = 4 * run_p
ns = np.arange(1, n + 1)
band = 4 * 1.96 * np.sqrt(run_p * (1 - run_p) / ns)
axR.plot(ns, run_pi, color=BLUE, lw=1.2, label='running estimate')
axR.fill_between(ns, run_pi - band, run_pi + band, color=BLUE, alpha=0.15, label='95% band')
axR.axhline(np.pi, color=RED, ls='--', lw=1.4, label=f'true pi = {np.pi:.4f}')
axR.set_xscale('log'); axR.set_xlim(10, n); axR.set_ylim(2.9, 3.4)
axR.set_xlabel('samples n (log scale)'); axR.set_ylabel('estimate of pi')
axR.set_title('convergence: band shrinks like 1/sqrt(n)'); axR.legend(loc='upper right', fontsize=8)
plt.tight_layout(); plt.show()


# Example 1 -- Gaussian integral via importance reweighting (seed 42 -> 1.7708).
rng = np.random.default_rng(42)
X = rng.standard_normal(100_000)
h = np.sqrt(2*np.pi) * np.exp(-X**2 / 2)              # exp(-x^2)/phi(x)
gauss, gauss_se = h.mean(), h.std(ddof=1)/np.sqrt(len(X))
print(f'1. Gaussian integral: {gauss:.6f} +/- {gauss_se:.6f}   true sqrt(pi) = {np.sqrt(np.pi):.6f}')

# Example 2 -- rare event P(Z > 4): naive MC is unreliable.
rng = np.random.default_rng(42)
Z = rng.standard_normal(100_000)
p_naive = (Z > 4).mean()
p_true = 1 - stats.norm.cdf(4)
n_exceed = int((Z > 4).sum())
print(f'2. Rare event P(Z>4): naive = {p_naive:.3e} ({n_exceed} exceedances in 100k)   '
      f'true = {p_true:.3e}  -> off by {p_naive/p_true:.1f}x (see 2.6)')

# Example 3 -- Phi(1.96), no closed form, as an indicator average.
rng = np.random.default_rng(42)
Z = rng.standard_normal(1_000_000)
phi_hat = (Z <= 1.96).mean()
print(f'3. Normal CDF Phi(1.96): {phi_hat:.6f}   true = {stats.norm.cdf(1.96):.6f}')

# Example 4 -- a d=100 product integral; true value -> sqrt(e) as d grows.
rng = np.random.default_rng(42)
d = 100
Xd = rng.random((100_000, d))
vals = np.prod(1 + Xd / d, axis=1)
truth = (1 + 0.5/d)**d
print(f'4. High-dim (d={d}): {vals.mean():.6f} +/- {vals.std(ddof=1)/np.sqrt(len(vals)):.6f}   '
      f'true = {truth:.6f}   (sqrt(e) = {np.sqrt(np.e):.6f})')

1. Gaussian integral: 1.770751 +/- 0.002214   true sqrt(pi) = 1.772454
2. Rare event P(Z>4): naive = 9.000e-05 (9 exceedances in 100k)   true = 3.167e-05  -> off by 2.8x (see 2.6)
3. Normal CDF Phi(1.96): 0.974928   true = 0.975002
4. High-dim (d=100): 1.646530 +/- 0.000149   true = 1.646668   (sqrt(e) = 1.648721)


# Fraction of the hypercube [-1,1]^d that lies inside the inscribed unit ball.
from scipy.special import gamma
print(f"{'d':>4}  {'cube vol':>12}  {'ball vol':>12}  {'ratio (% in ball)':>18}")
print('-' * 52)
for d in [1, 2, 3, 5, 10, 20, 50, 100]:
    cube = 2.0**d
    ball = np.pi**(d/2) / gamma(d/2 + 1)
    print(f'{d:>4}  {cube:>12.3e}  {ball:>12.3e}  {100*ball/cube:>17.3e}%')
print('-> by d=10 only ~0.25% of the cube is in the ball; by d=100 it is ~1e-68%.')
print('   Uniform darts almost never hit where a peaked integrand lives.')

   d      cube vol      ball vol   ratio (% in ball)
----------------------------------------------------
   1     2.000e+00     2.000e+00          1.000e+02%
   2     4.000e+00     3.142e+00          7.854e+01%
   3     8.000e+00     4.189e+00          5.236e+01%
   5     3.200e+01     5.264e+00          1.645e+01%
  10     1.024e+03     2.550e+00          2.490e-01%
  20     1.049e+06     2.581e-02          2.461e-06%
  50     1.126e+15     1.730e-13          1.537e-26%
 100     1.268e+30     2.368e-40          1.868e-68%
-> by d=10 only ~0.25% of the cube is in the ball; by d=100 it is ~1e-68%.
   Uniform darts almost never hit where a peaked integrand lives.


# Real dataset: Michelson speed-of-light measurements (km/s - 299000).
morley = pd.read_csv(data_url(4, 'morley.csv'))
speed = morley['Speed'].to_numpy(dtype=float)
n = speed.size
xbar, s = speed.mean(), speed.std(ddof=1)
se_clt = s / np.sqrt(n)
true_excess = 299792.458 - 299000                      # modern accepted value, same units
print(f'Michelson: n = {n} runs,  mean = {xbar:.1f} km/s (i.e. {299000+xbar:.1f}),  sd = {s:.1f}')
print(f'           modern truth in these units = {true_excess:.1f}  ->  Michelson was high by '
      f'{xbar - true_excess:.1f} km/s')

# Bootstrap = Monte Carlo over the data: B resamples, mean of each.
rng = np.random.default_rng(418)
B = 10_000
idx = rng.integers(0, n, size=(B, n))                  # B x n resample indices
boot_means = speed[idx].mean(axis=1)
se_boot = boot_means.std(ddof=1)
ci_pct = np.percentile(boot_means, [2.5, 97.5])
print(f'\nBootstrap (B = {B:,} resamples):')
print(f'  bootstrap SE of the mean = {se_boot:.3f}   vs   classical s/sqrt(n) = {se_clt:.3f}   '
      f'(agree within {100*abs(se_boot-se_clt)/se_clt:.1f}%)')
print(f'  percentile 95% CI for the mean = ({ci_pct[0]:.1f}, {ci_pct[1]:.1f})')
print(f'  -> the modern value {true_excess:.1f} is OUTSIDE this CI: a real, famous bias.')

Michelson: n = 100 runs,  mean = 852.4 km/s (i.e. 299852.4),  sd = 79.0
           modern truth in these units = 792.5  ->  Michelson was high by 59.9 km/s

Bootstrap (B = 10,000 resamples):
  bootstrap SE of the mean = 8.023   vs   classical s/sqrt(n) = 7.901   (agree within 1.5%)
  percentile 95% CI for the mean = (836.5, 867.7)
  -> the modern value 792.5 is OUTSIDE this CI: a real, famous bias.


# Figure: the Monte Carlo sampling distribution of the mean, with the CLT overlay.
fig, ax = plt.subplots()
ax.hist(boot_means, bins=45, density=True, color=BLUE, alpha=0.55, edgecolor='white',
        label=f'{B:,} bootstrap means')
grid = np.linspace(boot_means.min(), boot_means.max(), 300)
ax.plot(grid, stats.norm.pdf(grid, xbar, se_clt), color=RED, lw=2.2,
        label=r'CLT $\mathcal{N}(\bar x,\, s^2/n)$')
ax.axvline(xbar, color='k', lw=1.4, ls=':', label=f'mean = {xbar:.1f}')
ax.axvline(true_excess, color=GREEN, lw=2.0, ls='--', label=f'modern truth = {true_excess:.1f}')
ax.set_xlabel('resampled mean speed (km/s - 299000)'); ax.set_ylabel('density')
ax.set_title('Bootstrap sampling distribution of the mean = Monte Carlo over data')
ax.legend(fontsize=8); plt.tight_layout(); plt.show()


# A minimal LCG, plus a numerical proof of RANDU's hidden plane structure.
def lcg(a, c, m, seed, n):
    x = np.empty(n, dtype=np.int64)
    x[0] = seed
    for i in range(1, n):
        x[i] = (a * x[i-1] + c) % m
    return x

# RANDU: a = 65539, c = 0, m = 2**31, odd seed.
M = 2**31
X = lcg(65539, 0, M, seed=1, n=100_000)
U = X / M

# The algebra: X[n+2] - 6 X[n+1] + 9 X[n] == 0  (mod 2**31) for EVERY triple.
residual = (X[2:] - 6*X[1:-1] + 9*X[:-2]) % M
# residual is either 0 or 2**31 (== 0 mod m); fold both to 0:
folded = np.minimum(residual, M - residual)
print(f'RANDU triples obeying X[n+2] = 6 X[n+1] - 9 X[n] (mod 2^31): '
      f'{(folded == 0).mean()*100:.1f}% of {len(folded):,}')
print(f'max |residual| (mod m) across all triples = {folded.max()}   -> exact linear relation')

# Sanity on the uniform marginal (the 1-D test it PASSES, hiding the 3-D failure):
print(f'mean(U) = {U.mean():.4f} (target 0.5),  var(U) = {U.var():.4f} (target {1/12:.4f})')

RANDU triples obeying X[n+2] = 6 X[n+1] - 9 X[n] (mod 2^31): 100.0% of 99,998
max |residual| (mod m) across all triples = 0   -> exact linear relation
mean(U) = 0.5019 (target 0.5),  var(U) = 0.0833 (target 0.0833)


# Figure: a small-modulus LCG shows the parallel-line lattice in 2-D directly.
Xs = lcg(a=1229, c=1, m=2048, seed=1, n=2048) / 2048      # tiny m to expose lines
fig, (a1, a2) = plt.subplots(1, 2, figsize=(11.5, 4.4))
a1.scatter(Xs[:-1], Xs[1:], s=6, color=ORANGE, alpha=0.6)
a1.set_title('weak LCG (m=2048): pairs fall on few parallel lines')
a1.set_xlabel('U[n]'); a1.set_ylabel('U[n+1]'); a1.set_aspect('equal')

good = np.random.default_rng(418).random(2048)
a2.scatter(good[:-1], good[1:], s=6, color=GREEN, alpha=0.6)
a2.set_title("modern PCG64 (default_rng): no visible structure")
a2.set_xlabel('U[n]'); a2.set_ylabel('U[n+1]'); a2.set_aspect('equal')
plt.tight_layout(); plt.show()


# Modern best practice: default_rng (PCG64) + independent parallel streams.
master = np.random.default_rng(418)
print('reproducible draws (seed 418):', np.round(master.random(4), 4))

# Independent, non-overlapping streams for parallel jobs via SeedSequence.spawn:
children = np.random.SeedSequence(418).spawn(3)
streams = [np.random.default_rng(c) for c in children]
for k, g in enumerate(streams):
    print(f'  worker {k} first draws: {np.round(g.random(3), 4)}')
print('Each worker gets a statistically independent stream -- never reseed in a loop.')

reproducible draws (seed 418): [0.3192 0.9897 0.2241 0.2963]
  worker 0 first draws: [0.1288 0.3528 0.6102]
  worker 1 first draws: [0.239  0.7851 0.3758]
  worker 2 first draws: [0.6725 0.8036 0.876 ]
Each worker gets a statistically independent stream -- never reseed in a loop.


# Exponential via inverse-CDF (seed 42 reproduces the webbook's 0.5031/0.2486/0.3483).
rng = np.random.default_rng(42)
rate = 2.0
U = rng.random(10_000)
U = np.maximum(U, np.finfo(float).tiny)               # guard log(0)
Xe = -np.log(U) / rate

ks = stats.kstest(Xe, 'expon', args=(0, 1/rate))      # loc=0, scale=1/rate
print(f'Exponential(rate={rate}) via X = -ln(U)/rate, n = {len(Xe):,}')
print(f'  sample mean   = {Xe.mean():.4f}   theory 1/rate     = {1/rate:.4f}')
print(f'  sample var    = {Xe.var(ddof=1):.4f}   theory 1/rate^2   = {1/rate**2:.4f}')
print(f'  sample median = {np.median(Xe):.4f}   theory ln2/rate   = {np.log(2)/rate:.4f}')
print(f'  KS test vs Exponential: statistic = {ks.statistic:.4f}, p-value = {ks.pvalue:.3f}  '
      f'(p large -> good fit)')

Exponential(rate=2.0) via X = -ln(U)/rate, n = 10,000
  sample mean   = 0.5031   theory 1/rate     = 0.5000
  sample var    = 0.2486   theory 1/rate^2   = 0.2500
  sample median = 0.3483   theory ln2/rate   = 0.3466
  KS test vs Exponential: statistic = 0.0074, p-value = 0.634  (p large -> good fit)


# Figure: inverse-CDF samples vs the true density, and the geometric construction.
fig, (a1, a2) = plt.subplots(1, 2, figsize=(11.5, 4.4))
a1.hist(Xe, bins=50, density=True, color=BLUE, alpha=0.55, edgecolor='white', label='inverse-CDF samples')
xx = np.linspace(0, Xe.max(), 300)
a1.plot(xx, rate*np.exp(-rate*xx), color=RED, lw=2.2, label=f'true Exp({rate}) pdf')
a1.set_xlabel('x'); a1.set_ylabel('density'); a1.set_title('inverse-CDF matches the target'); a1.legend()

# Geometric picture: pick u on the y-axis of F, read x = F^{-1}(u) on the x-axis.
xg = np.linspace(0, 3, 300); Fg = 1 - np.exp(-rate*xg)
a2.plot(xg, Fg, color=BLUE, lw=2.2, label=r'CDF $F(x)=1-e^{-\lambda x}$')
for u in [0.25, 0.5, 0.8]:
    xinv = -np.log(1-u)/rate
    a2.plot([0, xinv], [u, u], color=ORANGE, lw=1.2)
    a2.plot([xinv, xinv], [0, u], color=ORANGE, lw=1.2)
    a2.plot(xinv, u, 'o', color=RED, ms=6)
a2.set_xlabel('x'); a2.set_ylabel('u = F(x)'); a2.set_title('U on y-axis -> X on x-axis'); a2.legend()
plt.tight_layout(); plt.show()


# Target categorical distribution.
values = np.array([0, 1, 2, 3, 4])
probs = np.array([0.10, 0.15, 0.20, 0.25, 0.30])
N = 200_000
rng = np.random.default_rng(418)

# (A) Inverse-CDF via binary search on the cumulative probabilities.
cum = np.cumsum(probs)
draws_inv = np.searchsorted(cum, rng.random(N))

# (B) Walker's alias method: O(K) build, O(1) per draw.
def build_alias(p):
    K = len(p); q = p * K
    prob = np.zeros(K); alias = np.zeros(K, dtype=int)
    small = [i for i in range(K) if q[i] < 1.0]
    large = [i for i in range(K) if q[i] >= 1.0]
    while small and large:
        s = small.pop(); l = large.pop()
        prob[s] = q[s]; alias[s] = l
        q[l] -= (1.0 - q[s])
        (small if q[l] < 1.0 else large).append(l)
    for i in large + small:
        prob[i] = 1.0
    return prob, alias

aprob, aalias = build_alias(probs.copy())
col = rng.integers(0, len(probs), N)
draws_alias = np.where(rng.random(N) < aprob[col], col, aalias[col])

emp_inv = np.bincount(draws_inv, minlength=5) / N
emp_ali = np.bincount(draws_alias, minlength=5) / N
print('category        :', '  '.join(f'{v:6d}' for v in values))
print('target prob     :', '  '.join(f'{p:6.3f}' for p in probs))
print('inverse-CDF freq:', '  '.join(f'{p:6.3f}' for p in emp_inv))
print('alias-method freq:', ' '.join(f'{p:6.3f}' for p in emp_ali))
print(f'max |empirical - target|: inverse = {np.abs(emp_inv-probs).max():.4f}, '
      f'alias = {np.abs(emp_ali-probs).max():.4f}  (both exact up to MC noise)')

category        :      0       1       2       3       4
target prob     :  0.100   0.150   0.200   0.250   0.300
inverse-CDF freq:  0.099   0.151   0.200   0.251   0.299
alias-method freq:  0.100  0.152  0.199  0.249  0.300
max |empirical - target|: inverse = 0.0010, alias = 0.0017  (both exact up to MC noise)


# Box-Muller: two uniforms -> two independent standard normals.
rng = np.random.default_rng(418)
m = 100_000
U1 = np.maximum(rng.random(m), np.finfo(float).tiny)
U2 = rng.random(m)
R = np.sqrt(-2*np.log(U1))
Z1 = R * np.cos(2*np.pi*U2)
Z2 = R * np.sin(2*np.pi*U2)
Z = np.concatenate([Z1, Z2])

ks = stats.kstest(Z, 'norm')
print(f'Box-Muller standard normals (n = {Z.size:,}):')
print(f'  mean = {Z.mean():+.4f} (target 0),  var = {Z.var():.4f} (target 1)')
print(f'  corr(Z1, Z2) = {np.corrcoef(Z1, Z2)[0,1]:+.4f} (target 0 -> the pair is independent)')
print(f'  KS vs N(0,1): statistic = {ks.statistic:.4f}, p-value = {ks.pvalue:.3f}')

Box-Muller standard normals (n = 200,000):
  mean = +0.0003 (target 0),  var = 0.9997 (target 1)
  corr(Z1, Z2) = -0.0035 (target 0 -> the pair is independent)
  KS vs N(0,1): statistic = 0.0014, p-value = 0.818


# Cholesky: turn independent normals into a target-correlated multivariate normal.
mu = np.array([1.0, -2.0])
Sigma = np.array([[2.0, 0.8],
                  [0.8, 1.0]])
L = np.linalg.cholesky(Sigma)
rng = np.random.default_rng(418)
Zmat = rng.standard_normal((50_000, 2))
Xmv = mu + Zmat @ L.T

emp_mean = Xmv.mean(axis=0)
emp_cov = np.cov(Xmv, rowvar=False)
print('multivariate normal via X = mu + L Z:')
print(f'  empirical mean = [{emp_mean[0]:+.3f}, {emp_mean[1]:+.3f}]   target = [{mu[0]:+.1f}, {mu[1]:+.1f}]')
print('  empirical cov =\n', np.round(emp_cov, 3))
print('  target cov    =\n', Sigma)

fig, ax = plt.subplots(figsize=(5.6, 5.0))
ax.scatter(Xmv[:3000, 0], Xmv[:3000, 1], s=5, alpha=0.25, color=PURPLE)
# overlay the 95% covariance ellipse from the eigen-decomposition of Sigma
vals, vecs = np.linalg.eigh(Sigma)
ang = np.degrees(np.arctan2(vecs[1, 1], vecs[0, 1]))
from matplotlib.patches import Ellipse
w, h = 2*np.sqrt(vals[::-1]*stats.chi2.ppf(0.95, 2))
ax.add_patch(Ellipse(mu, w, h, angle=ang, fill=False, color=RED, lw=2.2, label='95% ellipse'))
ax.plot(*mu, 'X', color='k', ms=10)
ax.set_xlabel('X1'); ax.set_ylabel('X2'); ax.set_aspect('equal')
ax.set_title('Cholesky-correlated normals'); ax.legend()
plt.tight_layout(); plt.show()

multivariate normal via X = mu + L Z:
  empirical mean = [+1.002, -2.002]   target = [+1.0, -2.0]
  empirical cov =
 [[1.997 0.795]
 [0.795 0.997]]
  target cov    =
 [[2.  0.8]
 [0.8 1. ]]


# Rejection sampling from Beta(2.5, 6) with a Uniform(0,1) proposal.
a, b = 2.5, 6.0
mode = (a - 1) / (a + b - 2)
M = stats.beta.pdf(mode, a, b)                         # smallest valid envelope constant
print(f'Beta({a}, {b}): mode x* = {mode:.4f},  M* = f(x*) = {M:.4f},  '
      f'predicted accept rate 1/M = {1/M:.4f}')

rng = np.random.default_rng(418)
n_prop = 300_000
Xp = rng.random(n_prop)                                # proposal draws ~ Uniform(0,1)
Uacc = rng.random(n_prop)
accept = Uacc <= stats.beta.pdf(Xp, a, b) / M          # g(x)=1, so M*g(x)=M
samples = Xp[accept]
print(f'  empirical accept rate = {accept.mean():.4f}   ({samples.size:,} accepted of {n_prop:,})')
print(f'  sample mean = {samples.mean():.4f}   theory a/(a+b) = {a/(a+b):.4f}')
ks = stats.kstest(samples, 'beta', args=(a, b))
print(f'  KS vs Beta({a},{b}): statistic = {ks.statistic:.4f}, p-value = {ks.pvalue:.3f}')

Beta(2.5, 6.0): mode x* = 0.2308,  M* = f(x*) = 2.6268,  predicted accept rate 1/M = 0.3807
  empirical accept rate = 0.3807   (114,205 accepted of 300,000)
  sample mean = 0.2939   theory a/(a+b) = 0.2941
  KS vs Beta(2.5,6.0): statistic = 0.0023, p-value = 0.579


# Figure: the envelope, plus accepted (blue) vs rejected (orange) proposal points.
xx = np.linspace(0, 1, 400)
fig, ax = plt.subplots()
ax.plot(xx, stats.beta.pdf(xx, a, b), color=RED, lw=2.4, label=f'target Beta({a},{b})')
ax.axhline(M, color='k', ls='--', lw=1.4, label=f'envelope M*g(x) = {M:.2f}')
sub = slice(0, 4000)
yvis = Uacc[sub] * M                                    # height of each dart under the envelope
ax.scatter(Xp[sub][accept[sub]], yvis[accept[sub]], s=5, color=BLUE, alpha=0.35, label='accepted')
ax.scatter(Xp[sub][~accept[sub]], yvis[~accept[sub]], s=5, color=ORANGE, alpha=0.25, label='rejected')
ax.set_xlabel('x'); ax.set_ylabel('density / dart height')
ax.set_title(f'Rejection sampling: accept fraction = area ratio = 1/M = {1/M:.3f}')
ax.legend(loc='upper right', fontsize=8); plt.tight_layout(); plt.show()


# Real dataset: Curry's cumulative free throws -> a Beta-Binomial posterior.
ft = pd.read_csv(data_url(3, 'nba_freethrows_2023_24.csv'))
curry = ft[ft['player'] == 'Stephen Curry']
k = int(curry['cum_ft'].iloc[-1])                      # makes
m = int(curry['cum_fta'].iloc[-1])                     # attempts
mle = k / m
print(f'Curry 2023-24: {k} makes / {m} attempts  ->  MLE theta_hat = {mle:.4f}')

# Unnormalized posterior kernel (log scale for stability); envelope peak at the MLE.
def log_kernel(t):
    return k*np.log(t) + (m-k)*np.log1p(-t)
log_M = log_kernel(mle)                                # log of the peak height

rng = np.random.default_rng(418)
n_prop = 600_000
Tp = rng.random(n_prop)
with np.errstate(divide='ignore'):
    log_ratio = log_kernel(Tp) - log_M                 # log[ f(t) / (M*g(t)) ], g=1
accept = np.log(rng.random(n_prop)) <= log_ratio
post = Tp[accept]

exact_mean = (1 + k) / (2 + m)                         # Beta(1+k, 1+m-k) mean
exact_sd = np.sqrt((1+k)*(1+m-k) / ((2+m)**2 * (3+m)))
print(f'  accept rate = {accept.mean():.4f}  ({post.size:,} draws)  '
      f'-> a Uniform proposal is wasteful for a peaked posterior (a Normal/Beta proposal is better)')
print(f'  rejection-sample posterior mean = {post.mean():.4f} +/- {post.std(ddof=1):.4f}')
print(f'  exact Beta({1+k}, {1+m-k}) mean = {exact_mean:.4f},  sd = {exact_sd:.4f}   (match)')
print(f'  rejection 95% credible interval = ({np.percentile(post,2.5):.4f}, {np.percentile(post,97.5):.4f})')

Curry 2023-24: 299 makes / 324 attempts  ->  MLE theta_hat = 0.9228
  accept rate = 0.0374  (22,427 draws)  -> a Uniform proposal is wasteful for a peaked posterior (a Normal/Beta proposal is better)
  rejection-sample posterior mean = 0.9203 +/- 0.0149
  exact Beta(300, 26) mean = 0.9202,  sd = 0.0150   (match)
  rejection 95% credible interval = (0.8892, 0.9470)


# Importance sampling for the rare event P(Z>4): exponential tilting to N(4,1).
# Seed 42 + the webbook's draw order reproduce VRF ~ 7000x.
rng = np.random.default_rng(42)
n = 10_000
thr = 4.0
p_true = 1 - stats.norm.cdf(thr)

Z_naive = rng.standard_normal(n)                       # naive MC (drawn first, as in webbook)
p_naive = (Z_naive > thr).mean()
se_naive = np.sqrt(p_true*(1-p_true)/n)

X_is = rng.normal(loc=thr, size=n)                     # proposal g = N(4,1)
log_w = -thr*X_is + thr**2/2                           # log(f/g) = log[phi(x)/phi(x-4)]
w = np.exp(log_w)
ind = (X_is > thr).astype(float)
wh = ind * w
p_is = wh.mean()
se_is = np.sqrt(wh.var()/n)
norm_w = w / w.sum()
ess = 1.0 / np.sum(norm_w**2)
vrf = (p_true*(1-p_true)) / wh.var()

print(f'True P(Z>4) = {p_true:.3e}')
print(f'Naive MC (n={n:,}): est = {p_naive:.3e}, SE = {se_naive:.3e}   '
      f'({int((Z_naive>thr).sum())} exceedance(s))')
print(f'Importance sampling: est = {p_is:.3e}, SE = {se_is:.2e}, ESS = {ess:.1f}')
print(f'  -> variance reduction factor ~ {vrf:.0f}x  '
      f'(IS reaches with 10k samples what naive MC needs ~{vrf:.0f}x more to match)')

True P(Z>4) = 3.167e-05
Naive MC (n=10,000): est = 1.000e-04, SE = 5.628e-05   (1 exceedance(s))
Importance sampling: est = 3.142e-05, SE = 6.71e-07, ESS = 7.5
  -> variance reduction factor ~ 7024x  (IS reaches with 10k samples what naive MC needs ~7024x more to match)


# Figure: why tilting works -- move the proposal onto the rare region.
xx = np.linspace(-4, 8, 400)
fig, ax = plt.subplots()
ax.plot(xx, stats.norm.pdf(xx), color=BLUE, lw=2.2, label=r'target $f=\phi(x)$')
ax.plot(xx, stats.norm.pdf(xx, loc=thr), color=GREEN, lw=2.2, label=r'proposal $g=\phi(x-4)$')
ax.axvline(thr, color='k', ls=':', lw=1.4)
tail = xx[xx >= thr]
ax.fill_between(tail, 0, stats.norm.pdf(tail), color=RED, alpha=0.5, label=r'rare region $x>4$')
ax.set_xlabel('x'); ax.set_ylabel('density')
ax.set_title('Importance sampling: tilt the proposal onto the tail'); ax.legend()
plt.tight_layout(); plt.show()


# Control variates and antithetic variates for I = E[e^U], U ~ Uniform(0,1).
# True value e - 1; both methods exploit the strong monotone structure of e^u.
I_true = np.e - 1
rng = np.random.default_rng(418)
n = 100_000

# --- Control variate: use C = U (known mean 1/2), beta* = Cov(H,C)/Var(C). ---
U = rng.random(n)
H = np.exp(U)
C = U
beta = np.cov(H, C, ddof=1)[0, 1] / np.var(C, ddof=1)
H_cv = H - beta * (C - 0.5)
rho_cv = np.corrcoef(H, C)[0, 1]
vrf_cv = 1.0 / (1.0 - rho_cv**2)
print(f'I = E[e^U], true = {I_true:.5f}')
print(f'Control variate (C=U): beta* = {beta:.4f}, corr(H,C) = {rho_cv:.4f}')
print(f'  naive  est = {H.mean():.5f}, SE = {H.std(ddof=1)/np.sqrt(n):.5f}')
print(f'  CV     est = {H_cv.mean():.5f}, SE = {H_cv.std(ddof=1)/np.sqrt(n):.5f}   '
      f'(predicted VRF 1/(1-rho^2) = {vrf_cv:.0f}x)')

# --- Antithetic variate: pair U with 1-U (monotone h -> negative correlation). ---
rng = np.random.default_rng(418)
npairs = 50_000
Ua = rng.random(npairs)
H1, H2 = np.exp(Ua), np.exp(1 - Ua)
anti = (H1 + H2) / 2
rho_a = np.corrcoef(H1, H2)[0, 1]
vrf_a = 1.0 / (1.0 + rho_a)
Hind = np.exp(rng.random(2*npairs))                    # equal-cost independent baseline
print(f'\nAntithetic (U, 1-U): corr = {rho_a:.4f} (negative -> helps)')
print(f'  antithetic est = {anti.mean():.5f}, SE = {anti.std(ddof=1)/np.sqrt(npairs):.6f}')
print(f'  independent est = {Hind.mean():.5f}, SE = {Hind.std(ddof=1)/np.sqrt(2*npairs):.6f}   '
      f'(predicted per-eval VRF 1/(1+rho) = {vrf_a:.0f}x)')

# --- Counterexample: a symmetric h makes antithetic variates BACKFIRE. ---
g1, g2 = (Ua - 0.5)**2, ((1 - Ua) - 0.5)**2            # h(u)=(u-0.5)^2 is symmetric
print(f'\nCounterexample h(u)=(u-0.5)^2: corr(h(U),h(1-U)) = {np.corrcoef(g1, g2)[0,1]:+.3f} '
      f'(= +1 -> antithetic DOUBLES variance; only use it for monotone h).')

I = E[e^U], true = 1.71828
Control variate (C=U): beta* = 1.6897, corr(H,C) = 0.9918
  naive  est = 1.71672, SE = 0.00155
  CV     est = 1.71795, SE = 0.00020   (predicted VRF 1/(1-rho^2) = 62x)

Antithetic (U, 1-U): corr = -0.9678 (negative -> helps)
  antithetic est = 1.71773, SE = 0.000278
  independent est = 1.71883, SE = 0.001558   (predicted per-eval VRF 1/(1+rho) = 31x)

Counterexample h(u)=(u-0.5)^2: corr(h(U),h(1-U)) = +1.000 (= +1 -> antithetic DOUBLES variance; only use it for monotone h).


# Stratified sampling for the same I = E[e^U]: split [0,1] into K equal strata.
rng = np.random.default_rng(418)
n_total = 100_000
K = 10
per = n_total // K
p_k = 1.0 / K                                          # equal stratum probabilities

# Naive baseline.
U = rng.random(n_total)
naive_mean = np.exp(U).mean()
naive_var = np.exp(U).var(ddof=1) / n_total

# Proportional allocation: 'per' uniforms inside each [k/K, (k+1)/K].
strat_means = np.empty(K); strat_vars = np.empty(K)
for kk in range(K):
    Uk = (kk + rng.random(per)) / K                    # Uniform within stratum kk
    hk = np.exp(Uk)
    strat_means[kk] = hk.mean()
    strat_vars[kk] = hk.var(ddof=1)
strat_est = np.sum(p_k * strat_means)
strat_var = np.sum(p_k**2 * strat_vars / per)
print(f'I = E[e^U], true = {np.e-1:.5f}')
print(f'  naive      est = {naive_mean:.5f}, variance = {naive_var:.3e}')
print(f'  stratified est = {strat_est:.5f}, variance = {strat_var:.3e}   '
      f'(K={K} strata)')
print(f'  variance reduction factor ~ {naive_var/strat_var:.0f}x  '
      f'(stratification removes the between-stratum variance of the steep e^u)')

I = E[e^U], true = 1.71828
  naive      est = 1.71672, variance = 2.407e-06
  stratified est = 1.71832, variance = 2.653e-08   (K=10 strata)
  variance reduction factor ~ 91x  (stratification removes the between-stratum variance of the steep e^u)


# Solution 1.
p = np.pi / 4
sigma = np.sqrt(16 * p * (1 - p))                      # SD of h = 4 * indicator
eps = 0.001
n_req = (1.96 * sigma / eps)**2
print(f'sigma of 4*indicator = {sigma:.4f}')
print(f'required n for 95% CI half-width <= {eps}: {n_req:,.0f}  (~10 million darts)')
print('The quadratic 1/eps^2 cost is the price of Monte Carlo: each extra digit = 100x work.')

sigma of 4*indicator = 1.6422
required n for 95% CI half-width <= 0.001: 10,359,897  (~10 million darts)
The quadratic 1/eps^2 cost is the price of Monte Carlo: each extra digit = 100x work.


# Solution 2.
rng = np.random.default_rng(418)
alpha, xm = 2.5, 1.0
U = np.maximum(rng.random(100_000), np.finfo(float).tiny)
Xp = xm * U**(-1/alpha)
theory_mean = alpha * xm / (alpha - 1)
print(f'Pareto(alpha={alpha}, x_m={xm}) via x_m * U^(-1/alpha):')
print(f'  sample mean = {Xp.mean():.4f}   theory a*x_m/(a-1) = {theory_mean:.4f}')
print(f'  sample min  = {Xp.min():.4f}   (>= x_m = {xm}, as required)')
print('  Heavy right tail: the sample mean is noisier than for light-tailed targets.')

Pareto(alpha=2.5, x_m=1.0) via x_m * U^(-1/alpha):
  sample mean = 1.6657   theory a*x_m/(a-1) = 1.6667
  sample min  = 1.0000   (>= x_m = 1.0, as required)
  Heavy right tail: the sample mean is noisier than for light-tailed targets.


# Solution 3.
a, b = 2.0, 5.0
mode = (a - 1) / (a + b - 2)
M = stats.beta.pdf(mode, a, b)
rng = np.random.default_rng(418)
Xp = rng.random(200_000)
accept = rng.random(200_000) <= stats.beta.pdf(Xp, a, b) / M
print(f'Beta({a},{b}): mode = {mode:.4f}, M = {M:.4f}, predicted accept = 1/M = {1/M:.4f}')
print(f'  empirical accept rate = {accept.mean():.4f}')
print(f'  sample mean = {Xp[accept].mean():.4f}   theory a/(a+b) = {a/(a+b):.4f}')

Beta(2.0,5.0): mode = 0.2000, M = 2.4576, predicted accept = 1/M = 0.4069
  empirical accept rate = 0.4088
  sample mean = 0.2860   theory a/(a+b) = 0.2857


# Solution 4.
rng = np.random.default_rng(418)
B = 10_000
idx = rng.integers(0, speed.size, size=(B, speed.size))
boot_med = np.median(speed[idx], axis=1)
ci = np.percentile(boot_med, [2.5, 97.5])
print(f'Michelson median speed (km/s - 299000): point = {np.median(speed):.1f}')
print(f'  bootstrap 95% CI for the median = ({ci[0]:.1f}, {ci[1]:.1f})')
print(f'  bootstrap SE of the median = {boot_med.std(ddof=1):.2f}')
print('  No closed-form SE for a median exists -- the bootstrap (Monte Carlo) just resamples.')

Michelson median speed (km/s - 299000): point = 850.0
  bootstrap 95% CI for the median = (840.0, 870.0)
  bootstrap SE of the median = 8.29
  No closed-form SE for a median exists -- the bootstrap (Monte Carlo) just resamples.

§	Topic	Status	Real-data / worked example
2.1	Monte Carlo fundamentals	core	$\pi$, Gaussian integral, rare event, curse of dimensionality + Michelson speed-of-light bootstrap preview
2.2	Uniform random variates	optional	LCG period & the RANDU 15-plane disaster
2.3	Inverse-CDF method	core	Exponential (+KS test), discrete sampling + alias method
2.4	Transformation methods	optional	Box–Muller, multivariate normal via Cholesky
2.5	Rejection sampling	core	Beta(2.5, 6) envelope + free-throw Bayesian posterior
2.6	Variance reduction	core	importance sampling, control / antithetic / stratified
2.7	Summary & connections	core	formula table $\to$ Chapter 4 bootstrap

Concept	Formula	This notebook's example
Monte Carlo estimator	$\hat I_n = \frac1n\sum_i h(X_i)$	$\pi \approx 3.1431$
Standard error / 95% CI	$s/\sqrt n$; $\hat I_n \pm 1.96\,\mathrm{SE}$	Michelson mean
$O(n^{-1/2})$ rate	dimension-independent	$d=100$ product integral
Bootstrap (Ch 4 preview)	resample data, recompute	speed-of-light SE & median CI
Inverse CDF	$X = F^{-1}(U)$	Exponential (+KS), alias method
Box–Muller	$Z=\sqrt{-2\ln U_1}\cos(2\pi U_2)$	standard normals
Rejection accept prob.	$\tilde f(X)/[M g(X)]$, rate $1/M$	Beta(2.5,6) $\to 38\%$; FT posterior
Importance weight / ESS	$w=f/g$; $\mathrm{ESS}=1/\sum\bar w_i^2$	$P(Z>4)$, VRF $\approx 7000$
Control variate VRF	$1/(1-\rho^2)$	$\mathbb E[e^U]$
Antithetic VRF	$1/(1+\rho)$, needs $\rho<0$	$\mathbb E[e^U]$ (+ counterexample)

Chapter 2 — Monte Carlo Simulation¶

Learning outcomes¶

Section map¶

Section 2.1 — Monte Carlo Fundamentals¶

First example: estimating $\pi$ by throwing darts¶

Four worked integrals: a tour of what Monte Carlo can do¶

Why high dimensions need Monte Carlo: the curse of dimensionality¶

Real data: the bootstrap is Monte Carlo (Michelson's speed of light)¶

Section 2.2 — Uniform Random Variates¶

Section 2.3 — The Inverse-CDF Method¶

Discrete sampling: linear scan vs the $O(1)$ alias method¶

Section 2.4 — Transformation Methods¶

Section 2.5 — Rejection Sampling¶

Real data: a free-throw posterior, sampled by rejection¶

Section 2.6 — Variance Reduction Methods¶

Exercises¶

Section 2.7 — Summary & connections forward¶