import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats, optimize
from statsmodels.stats.proportion import proportion_confint
%matplotlib inline

np.random.seed(418)            # reproducibility (STYLE.md contract)
plt.rcParams.update({
    'figure.figsize': (7.2, 4.2), 'figure.dpi': 110,
    'axes.grid': True, 'grid.alpha': 0.25,
    'axes.spines.top': False, 'axes.spines.right': False,
    'font.size': 11,
})
BLUE, RED, GREEN, ORANGE = '#4C72B0', '#C44E52', '#55A868', '#DD8452'

# Course data bucket: every dataset for Chapter 3 lives here (no repo needed).
BASE = ('https://pqyjaywwccbnqpwgeiuv.supabase.co/storage/v1/object/public/'
        'STAT%20418%20Images/Data/Chapter3/')
def url(name):
    return BASE + name

print('environment ready · numpy', np.__version__, '· pandas', pd.__version__)

environment ready · numpy 1.26.4 · pandas 2.3.3


# Real data: 2023-24 game-by-game free-throw log.
ft = pd.read_csv(url('nba_freethrows_2023_24.csv'))
curry = ft[ft['player'] == 'Stephen Curry'].reset_index(drop=True)
makes, attempts = int(curry['cum_ft'].iloc[-1]), int(curry['cum_fta'].iloc[-1])

# ---- Bernoulli as an exponential family --------------------------------
# Each free throw is x in {0,1}.  Natural param eta = logit(p);  T(x) = x;
# h(x) = 1;  A(eta) = log(1 + e^eta).  Sufficient stat for the whole sample
# is sum(x) = makes.
x = np.r_[np.ones(makes), np.zeros(attempts - makes)]   # the Bernoulli sample
phat = x.mean()
eta_hat = np.log(phat / (1 - phat))                      # natural parameter
T_sum = x.sum()                                          # sufficient statistic
A = np.log1p(np.exp(eta_hat))                            # log-partition
A_prime = np.exp(eta_hat) / (1 + np.exp(eta_hat))        # A'(eta) = mean
A_dprime = A_prime * (1 - A_prime)                       # A''(eta) = variance

print(f'Curry 2023-24 free throws: {makes}/{attempts}  ->  p_hat = {phat:.4f}')
print(f'  natural parameter   eta_hat = logit(p_hat) = {eta_hat:+.4f}')
print(f'  sufficient statistic  sum(x) = {T_sum:.0f}   (the data reduce to this one number)')
print(f"  log-partition  A(eta)   = {A:.4f}")
print(f"  A'(eta) = {A_prime:.4f}  vs  sample mean = {phat:.4f}   (match -> mean)")
print(f"  A''(eta) = {A_dprime:.4f}  vs  sample var = {x.var():.4f}   (match -> variance)")

Curry 2023-24 free throws: 299/324  ->  p_hat = 0.9228
  natural parameter   eta_hat = logit(p_hat) = +2.4816
  sufficient statistic  sum(x) = 299   (the data reduce to this one number)
  log-partition  A(eta)   = 2.5619
  A'(eta) = 0.9228  vs  sample mean = 0.9228   (match -> mean)
  A''(eta) = 0.0712  vs  sample var = 0.0712   (match -> variance)


# ---- Poisson as an exponential family ----------------------------------
# Per-game free-throw ATTEMPTS as counts.  eta = log(lambda);  T(x) = x;
# h(x) = 1/x!;  A(eta) = e^eta = lambda.  Then A'(eta) = A''(eta) = lambda,
# i.e. the Poisson mean == variance (a testable assumption).
counts = curry['fta'].to_numpy()
lam = counts.mean()
eta_p = np.log(lam)
print(f'Per-game FT attempts: n = {len(counts)} games,  lambda_hat = mean = {lam:.3f}')
print(f"  eta_hat = log(lambda) = {eta_p:+.4f},  sufficient statistic sum(x) = {counts.sum()}")
print(f"  A'(eta) = e^eta = {np.exp(eta_p):.3f}  (predicted mean = variance)")
print(f'  sample variance = {counts.var(ddof=1):.3f}  ->  dispersion ratio = '
      f'{counts.var(ddof=1)/lam:.2f}')
print('  (ratio > 1 = mild OVERdispersion: real attempt counts are a bit more')
print('   variable than Poisson predicts -- exactly the check the family makes easy.)')

Per-game FT attempts: n = 74 games,  lambda_hat = mean = 4.378
  eta_hat = log(lambda) = +1.4767,  sufficient statistic sum(x) = 324
  A'(eta) = e^eta = 4.378  (predicted mean = variance)
  sample variance = 9.499  ->  dispersion ratio = 2.17
  (ratio > 1 = mild OVERdispersion: real attempt counts are a bit more
   variable than Poisson predicts -- exactly the check the family makes easy.)


# Figure: the log-partition A(eta)=log(1+e^eta) for the Bernoulli family.
# Its SLOPE at eta_hat equals p_hat -- 'first derivative of A is the mean'.
etas = np.linspace(-3, 3, 400)
Acurve = np.log1p(np.exp(etas))
fig, ax = plt.subplots()
ax.plot(etas, Acurve, color=BLUE, lw=2.2, label=r'$A(\eta)=\log(1+e^{\eta})$')
tangent = A + A_prime * (etas - eta_hat)
ax.plot(etas, tangent, '--', color=RED, lw=1.6,
        label=fr"tangent slope $A'(\eta)=\hat p={phat:.3f}$")
ax.plot([eta_hat], [A], 'o', color=RED, ms=8)
ax.annotate(fr'$\hat\eta={eta_hat:.2f}$', (eta_hat, A), textcoords='offset points',
            xytext=(10, -18), color=RED)
ax.set_xlabel(r'natural parameter $\eta$'); ax.set_ylabel(r'$A(\eta)$')
ax.set_title('Bernoulli log-partition: its slope is the mean')
ax.legend(loc='upper left'); plt.tight_layout(); plt.show()


# Curry's opening game: a naturally occurring boundary proportion p_hat = 1.
g1 = curry.iloc[0]
x, n = int(g1['ft']), int(g1['fta'])
phat1 = x / n
print(f"Curry opening game {g1['date']}: ft = {x} / fta = {n}  ->  p_hat = {phat1:.2f}")

# (1) WALD -- normal approximation; degenerates because SE = 0 at the boundary.
se = np.sqrt(phat1 * (1 - phat1) / n)
z = stats.norm.ppf(0.975)
wald = (max(0.0, phat1 - z * se), min(1.0, phat1 + z * se))
print(f'  Wald     95% CI: [{wald[0]:.3f}, {wald[1]:.3f}]   '
      f'(SE = {se:.3f} -> ZERO width; a useless "exactly 1")')

# (2) WILSON (score) interval -- inverts the score test; stays inside [0,1].
wil = proportion_confint(x, n, alpha=0.05, method='wilson')
print(f'  Wilson   95% CI: [{wil[0]:.3f}, {wil[1]:.3f}]   (score interval, honest width)')

# (3) PROFILE / LRT interval -- invert the likelihood-ratio test.
# At x = n the closed form lower bound is exp(-chi2_{1,.95} / (2x)).
c = stats.chi2.ppf(0.95, 1)
grid = np.linspace(1e-6, 1.0, 200001)
ll = x * np.log(grid)                      # log-lik at x=n reduces to x*log(p)
inside = grid[2 * (0.0 - ll) <= c]         # ll_max = 0 at p_hat = 1
prof = (inside.min(), inside.max())
print(f'  Profile  95% CI: [{prof[0]:.3f}, {prof[1]:.3f}]   '
      f'(closed form lower = e^(-chi2/2x) = {np.exp(-c/(2*x)):.3f})')

Curry opening game 2023-10-24: ft = 7 / fta = 7  ->  p_hat = 1.00
  Wald     95% CI: [1.000, 1.000]   (SE = 0.000 -> ZERO width; a useless "exactly 1")
  Wilson   95% CI: [0.646, 1.000]   (score interval, honest width)
  Profile  95% CI: [0.760, 1.000]   (closed form lower = e^(-chi2/2x) = 0.760)


# Figure: the binomial log-likelihood for 7/7, with the three intervals.
p = np.linspace(0.3, 0.9999, 500)
loglik = x * np.log(p)            # at x = n the log-lik reduces to x*log(p)
fig, ax = plt.subplots()
ax.plot(p, loglik, color=BLUE, lw=2.2)
ax.axvline(phat1, color='k', lw=1, ls=':', label=fr'MLE $\hat p={phat1:.0f}$')
ax.axhline(-c/2, color=GREEN, lw=1.2, ls='--', label=r'LRT cutoff $-\chi^2_{1,.95}/2$')
for (lo, hi), col, name, yo in [(wald, RED, 'Wald', 0.0),
                                (wil, ORANGE, 'Wilson', -1.1),
                                (prof, GREEN, 'Profile', -2.2)]:
    ax.plot([lo, hi], [yo - 2, yo - 2], color=col, lw=4, solid_capstyle='butt')
    ax.plot([lo], [yo - 2], '|', color=col, ms=14, mew=2)
    ax.text(lo - 0.01, yo - 2, name, ha='right', va='center', color=col, fontsize=9)
ax.set_xlabel(r'free-throw probability $p$'); ax.set_ylabel('log-likelihood')
ax.set_title('Binomial log-likelihood for 7/7 — the Wald interval collapses')
ax.set_ylim(-6, 0.5); ax.legend(loc='lower left'); plt.tight_layout(); plt.show()


# Gradient-evaluation benchmark (scripts/ch3/07).  Reproducible, seed = 0.
rng = np.random.default_rng(0)
Nb, Pb = 400, 15
Xb = np.column_stack([np.ones(Nb), rng.normal(size=(Nb, Pb - 1))])
beta_true = 0.5 * rng.normal(size=Pb)
yb = (rng.uniform(size=Nb) < 1.0 / (1.0 + np.exp(-(Xb @ beta_true)))).astype(float)

def nll(b):
    z = Xb @ b
    return float(np.sum(np.log1p(np.exp(z)) - yb * z))
def grad(b):
    return Xb.T @ (1.0 / (1.0 + np.exp(-(Xb @ b))) - yb)

fd = optimize.minimize(nll, np.zeros(Pb), method='BFGS')              # finite differences
an = optimize.minimize(nll, np.zeros(Pb), method='BFGS', jac=grad)    # exact gradient
print(f'Logistic MLE (n={Nb}, p={Pb}), BFGS, seed=0')
print(f'  finite-difference gradient (jac=None): nfev = {fd.nfev}')
print(f'  exact gradient (analytic / autodiff)  : nfev = {an.nfev}')
print(f'  -> {fd.nfev / an.nfev:.0f}x fewer objective evaluations '
      f'(same optimum: max|beta diff| = {np.max(np.abs(fd.x - an.x)):.1e})')

Logistic MLE (n=400, p=15), BFGS, seed=0
  finite-difference gradient (jac=None): nfev = 688
  exact gradient (analytic / autodiff)  : nfev = 43
  -> 16x fewer objective evaluations (same optimum: max|beta diff| = 3.3e-08)


# Pfizer-BioNTech BNT162b2 primary efficacy endpoint (Polack et al. 2020).
cases_v, n_v = 8, 18198
cases_p, n_p = 162, 18325
p_v, p_p = cases_v / n_v, cases_p / n_p
rr = p_v / p_p
ve = 1 - rr
print(f'  p_vaccine = {p_v:.6f}   p_placebo = {p_p:.6f}')
print(f'  VE = 1 - p_v/p_p = {ve:.4f}  ({100*ve:.1f}%)')

# --- delta method, written out explicitly ---
# g(p_v, p_p) = 1 - p_v/p_p.  Gradient:
dg_dpv = -1 / p_p                 # partial wrt p_v
dg_dpp = p_v / p_p**2             # partial wrt p_p
var_pv = p_v * (1 - p_v) / n_v    # binomial variances of the two rate estimates
var_pp = p_p * (1 - p_p) / n_p
var_ve = dg_dpv**2 * var_pv + dg_dpp**2 * var_pp   # delta method (independent arms)
se_ve = np.sqrt(var_ve)
se_closed = rr * np.sqrt((1 - p_v)/(n_v*p_v) + (1 - p_p)/(n_p*p_p))  # algebra check
print(f'  d(VE)/dp_v = {dg_dpv:.2f},  d(VE)/dp_p = {dg_dpp:.2f}')
print(f'  delta-method SE(VE) = {se_ve:.4f}   (closed-form check = {se_closed:.4f})')
z = stats.norm.ppf(0.975)
wald_ve = (ve - z*se_ve, ve + z*se_ve)
print(f'  Wald 95% CI: ({wald_ve[0]:.3f}, {wald_ve[1]:.3f})   <- symmetric, normal-based')

  p_vaccine = 0.000440   p_placebo = 0.008840
  VE = 1 - p_v/p_p = 0.9503  (95.0%)
  d(VE)/dp_v = -113.12,  d(VE)/dp_p = 5.63
  delta-method SE(VE) = 0.0180   (closed-form check = 0.0180)
  Wald 95% CI: (0.915, 0.986)   <- symmetric, normal-based


# The published Pfizer interval is BAYESIAN, not the Wald delta interval.
# Pre-specified analysis: theta = p_v/(p_v+p_p), prior Beta(0.700102, 1),
# posterior Beta(0.700102 + cases_v, 1 + cases_p);  VE = (1 - 2 theta)/(1 - theta).
post = stats.beta(0.700102 + cases_v, 1.0 + cases_p)
ve_of_theta = lambda th: (1 - 2*th) / (1 - th)        # decreasing in theta
cred = (ve_of_theta(post.ppf(0.975)), ve_of_theta(post.ppf(0.025)))
print(f'  Bayesian beta-binomial 95% CREDIBLE interval: '
      f'({cred[0]:.3f}, {cred[1]:.3f})  = the published (90.3%, 97.6%)')
print('  The HEADLINE interval is Bayesian; with only 8 vaccine-arm events the')
print('  symmetric Wald lower bound (~0.915) is optimistic vs the published 0.903.')

  Bayesian beta-binomial 95% CREDIBLE interval: (0.904, 0.976)  = the published (90.3%, 97.6%)
  The HEADLINE interval is Bayesian; with only 8 vaccine-arm events the
  symmetric Wald lower bound (~0.915) is optimistic vs the published 0.903.


# Figure: VE point estimate with the Wald (delta) vs Bayesian credible interval.
fig, ax = plt.subplots(figsize=(7.2, 2.6))
ax.plot([wald_ve[0], wald_ve[1]], [1, 1], color=BLUE, lw=6, solid_capstyle='butt',
        label='Wald (delta-method)')
ax.plot([cred[0], cred[1]], [0.6, 0.6], color=ORANGE, lw=6, solid_capstyle='butt',
        label='Bayesian credible (published)')
ax.plot([ve], [0.8], 'D', color=RED, ms=10, label=fr'VE point = {100*ve:.1f}%')
for v in (wald_ve[0], wald_ve[1], cred[0], cred[1]):
    ax.annotate(f'{100*v:.1f}%', (v, 1 if v in wald_ve else 0.6),
                textcoords='offset points', xytext=(0, 8 if v in wald_ve else -16),
                ha='center', fontsize=9)
ax.set_yticks([]); ax.set_ylim(0.3, 1.4); ax.set_xlim(0.88, 1.0)
ax.set_xlabel('vaccine efficacy'); ax.set_title('Pfizer VE: two 95% intervals disagree')
ax.legend(loc='lower left', fontsize=9); plt.tight_layout(); plt.show()


adv = pd.read_csv(url('Advertising.csv'))
full = smf.ols('Sales ~ TV + Radio + Newspaper', data=adv).fit()
print(f'Full OLS  Sales ~ TV + Radio + Newspaper   (n = {int(full.nobs)})')
tab = pd.DataFrame({'coef': full.params, 'std_err': full.bse,
                    't': full.tvalues, 'p_value': full.pvalues})
print(tab.round(4).to_string())
print(f'\n  R^2 = {full.rsquared:.4f}   adj R^2 = {full.rsquared_adj:.4f}'
      f'   F = {full.fvalue:.1f} on (df {int(full.df_model)}, {int(full.df_resid)})')
print(f'  >> TV t-statistic in the FULL model = {full.tvalues["TV"]:.2f}  '
      f'(the partial effect, NOT the simple-regression value)')

Full OLS  Sales ~ TV + Radio + Newspaper   (n = 200)
             coef  std_err        t  p_value
Intercept  2.9389   0.3119   9.4223   0.0000
TV         0.0458   0.0014  32.8086   0.0000
Radio      0.1885   0.0086  21.8935   0.0000
Newspaper -0.0010   0.0059  -0.1767   0.8599

  R^2 = 0.8972   adj R^2 = 0.8956   F = 570.3 on (df 3, 196)
  >> TV t-statistic in the FULL model = 32.81  (the partial effect, NOT the simple-regression value)


# Individual t-tests and the nested F-tests.
print('Individual t-tests (H0: beta_j = 0):')
for v in ('TV', 'Radio', 'Newspaper'):
    verdict = 'fail to reject' if full.pvalues[v] > 0.05 else 'REJECT'
    print(f'  {v:9s}: t = {full.tvalues[v]:+6.2f}   p = {full.pvalues[v]:.4f}   -> {verdict}')

simple_tv = smf.ols('Sales ~ TV', data=adv).fit()
joint = full.f_test('Radio = 0, Newspaper = 0')
drop_news = full.f_test('Newspaper = 0')
print(f'\nNested F-tests:')
print(f'  reduced Sales~TV         R^2 = {simple_tv.rsquared:.3f}')
print(f'  full    Sales~TV+Rad+New R^2 = {full.rsquared:.3f}')
print(f'  joint H0: beta_Radio = beta_News = 0  ->  F = {float(joint.fvalue):.1f}  '
      f'p = {float(joint.pvalue):.1e}')
print(f'  single H0: beta_News = 0  ->  F = {float(drop_news.fvalue):.4f} '
      f'= t^2 = {full.tvalues["Newspaper"]**2:.4f}   (F = t^2 identity)')

# Confounding hook: Newspaper is significant ALONE, vanishes in the full model.
simple_news = smf.ols('Sales ~ Newspaper', data=adv).fit()
print(f'\nConfounding hook (holding the others fixed):')
print(f'  Newspaper ALONE: t = {simple_news.tvalues["Newspaper"]:.2f} '
      f'(p = {simple_news.pvalues["Newspaper"]:.4f}, significant!)')
print(f'  Newspaper in FULL model: t = {full.tvalues["Newspaper"]:.2f} (vanishes)')
print(f'  corr(Radio, Newspaper) = {adv["Radio"].corr(adv["Newspaper"]):.3f} '
      f'-> Newspaper rode on Radio')

Individual t-tests (H0: beta_j = 0):
  TV       : t = +32.81   p = 0.0000   -> REJECT
  Radio    : t = +21.89   p = 0.0000   -> REJECT
  Newspaper: t =  -0.18   p = 0.8599   -> fail to reject

Nested F-tests:
  reduced Sales~TV         R^2 = 0.612
  full    Sales~TV+Rad+New R^2 = 0.897
  joint H0: beta_Radio = beta_News = 0  ->  F = 272.0  p = 2.8e-57
  single H0: beta_News = 0  ->  F = 0.0312 = t^2 = 0.0312   (F = t^2 identity)

Confounding hook (holding the others fixed):
  Newspaper ALONE: t = 3.30 (p = 0.0011, significant!)
  Newspaper in FULL model: t = -0.18 (vanishes)
  corr(Radio, Newspaper) = 0.354 -> Newspaper rode on Radio


# Figure: |t| for each coefficient; the dashed line is the |t|~1.97 (5%) threshold.
fig, ax = plt.subplots(figsize=(7.2, 3.4))
vars_ = ['TV', 'Radio', 'Newspaper']
tvals = [abs(full.tvalues[v]) for v in vars_]
cols = [GREEN if abs(full.tvalues[v]) > stats.t.ppf(0.975, full.df_resid) else RED for v in vars_]
bars = ax.bar(vars_, tvals, color=cols, edgecolor='white')
ax.axhline(stats.t.ppf(0.975, full.df_resid), color='k', ls='--', lw=1,
           label=r'$|t|$ = 1.97 (reject at 5%)')
for b, t in zip(bars, tvals):
    ax.text(b.get_x()+b.get_width()/2, t+0.6, f'{t:.1f}', ha='center', fontsize=10)
ax.set_ylabel(r'$|t|$ statistic'); ax.set_title('Advertising: TV & Radio dominate; Newspaper is noise')
ax.legend(); plt.tight_layout(); plt.show()


# Coverage study (scripts/ch3/06): OLS vs HC3 95% CI coverage of the true slope.
rng = np.random.default_rng(42)
B, N, BETA = 10_000, 100, np.array([2.0, 3.0])   # intercept, slope
tcrit = stats.t.ppf(0.975, N - 2)
cover_ols = cover_hc3 = 0
for _ in range(B):
    x = rng.uniform(0.0, 3.0, N)
    sd = (0.5 + x) ** 2                              # heteroskedastic: SD grows with x
    y = BETA[0] + BETA[1] * x + rng.normal(0.0, sd)
    X = np.column_stack([np.ones(N), x])
    XtX_inv = np.linalg.inv(X.T @ X)
    beta = XtX_inv @ X.T @ y
    e = y - X @ beta
    s2 = (e @ e) / (N - 2)
    se_ols = np.sqrt(s2 * XtX_inv[1, 1])
    h = np.einsum('ij,jk,ik->i', X, XtX_inv, X)      # leverages
    meat = X.T @ (X * (e**2 / (1 - h)**2)[:, None])
    se_hc3 = np.sqrt((XtX_inv @ meat @ XtX_inv)[1, 1])
    cover_ols += abs(beta[1] - BETA[1]) <= tcrit * se_ols
    cover_hc3 += abs(beta[1] - BETA[1]) <= tcrit * se_hc3
cov_ols, cov_hc3 = 100*cover_ols/B, 100*cover_hc3/B
print(f'DGP: y = 2 + 3x + e, sigma=(0.5+x)^2, x~U(0,3); n={N}, B={B:,}, seed=42')
print(f'  classical OLS SE : 95% CI coverage of beta_1 = {cov_ols:.1f}%')
print(f'  HC3 sandwich SE  : 95% CI coverage of beta_1 = {cov_hc3:.1f}%')
print(f'  -> OLS under-covers by {95.0 - cov_ols:.1f} pp; HC3 restores near-nominal coverage.')

DGP: y = 2 + 3x + e, sigma=(0.5+x)^2, x~U(0,3); n=100, B=10,000, seed=42
  classical OLS SE : 95% CI coverage of beta_1 = 90.0%
  HC3 sandwich SE  : 95% CI coverage of beta_1 = 94.7%
  -> OLS under-covers by 5.0 pp; HC3 restores near-nominal coverage.


# Figure: coverage bars vs the 95% nominal target.
fig, ax = plt.subplots(figsize=(6.2, 3.6))
bars = ax.bar(['classical OLS', 'HC3 robust'], [cov_ols, cov_hc3],
              color=[RED, GREEN], edgecolor='white', width=0.6)
ax.axhline(95, color='k', ls='--', lw=1.2, label='nominal 95%')
for b, v in zip(bars, [cov_ols, cov_hc3]):
    ax.text(b.get_x()+b.get_width()/2, v+0.15, f'{v:.1f}%', ha='center', fontsize=11)
ax.set_ylim(88, 96); ax.set_ylabel('actual 95%-CI coverage')
ax.set_title('Heteroskedasticity: HC3 restores coverage'); ax.legend()
plt.tight_layout(); plt.show()


tit = pd.read_csv(url('titanic.csv'))
for c in ('Survived', 'Pclass', 'Age', 'Fare'):
    tit[c] = pd.to_numeric(tit[c], errors='coerce')
n_miss = int(tit['Age'].isna().sum())
print(f'Titanic: n = {len(tit)} passengers, Age missing = {n_miss} ({100*n_miss/len(tit):.1f}%)')

# Why OLS fails on binary y: the linear probability model predicts outside [0,1].
lpm1 = smf.ols('Survived ~ Fare', data=tit).fit()
f1 = lpm1.fittedvalues
print(f'  LPM Survived~Fare        : max p_hat = {f1.max():.2f}, #(p_hat>1) = {(f1>1).sum()}')
tit_age = tit.dropna(subset=['Age']).copy()
lpm2 = smf.ols('Survived ~ Pclass + Age + Fare', data=tit_age).fit()
f2 = lpm2.fittedvalues
print(f'  LPM Survived~Pclass+Age+Fare: min p_hat = {f2.min():.2f}, #(p_hat<0) = {(f2<0).sum()}'
      f'  (n = {len(tit_age)})')

Titanic: n = 891 passengers, Age missing = 177 (19.9%)
  LPM Survived~Fare        : max p_hat = 1.59, #(p_hat>1) = 3
  LPM Survived~Pclass+Age+Fare: min p_hat = -0.15, #(p_hat<0) = 6  (n = 714)


# Logistic GLM (statsmodels uses FEMALE as the reference category).
logit = smf.logit('Survived ~ C(Sex) + Pclass + Age', data=tit_age).fit(disp=0)
b_male = logit.params['C(Sex)[T.male]']
OR = pd.Series({'female vs male': np.exp(-b_male),
                'per Pclass level': np.exp(logit.params['Pclass']),
                'per year of Age': np.exp(logit.params['Age'])})
print(f'Logistic  Survived ~ Sex + Pclass + Age   (n = {int(logit.nobs)}, converged = '
      f"{logit.mle_retvals['converged']})")
print(f'  beta_male = {b_male:+.3f}  beta_Pclass = {logit.params["Pclass"]:+.3f}  '
      f'beta_Age = {logit.params["Age"]:+.4f}')
print('  Adjusted odds ratios exp(beta):')
for k, v in OR.items():
    print(f'    OR {k:18s} = {v:.3f}')
print('  Caveat: Age is ~20% missing and NOT MCAR, so listwise deletion (714 of 891)')
print('  adds mild selection bias; the data are observational -> odds ratios are associational.')

Logistic  Survived ~ Sex + Pclass + Age   (n = 714, converged = True)
  beta_male = -2.522  beta_Pclass = -1.289  beta_Age = -0.0369
  Adjusted odds ratios exp(beta):
    OR female vs male     = 12.455
    OR per Pclass level   = 0.276
    OR per year of Age    = 0.964
  Caveat: Age is ~20% missing and NOT MCAR, so listwise deletion (714 of 891)
  adds mild selection bias; the data are observational -> odds ratios are associational.


# Figure: fitted survival probability vs age, by sex, at 2nd class.
ages = np.linspace(tit_age['Age'].min(), tit_age['Age'].max(), 100)
fig, ax = plt.subplots()
for sex, col in [('female', RED), ('male', BLUE)]:
    grid = pd.DataFrame({'Sex': sex, 'Pclass': 2, 'Age': ages})
    ax.plot(ages, logit.predict(grid), color=col, lw=2.4, label=f'{sex} (2nd class)')
ax.scatter(tit_age['Age'], tit_age['Survived'] + np.random.uniform(-0.03, 0.03, len(tit_age)),
           s=8, alpha=0.12, color='gray')
ax.set_xlabel('Age (years)'); ax.set_ylabel('P(survive)')
ax.set_title('Titanic logistic fit: women far likelier to survive'); ax.legend()
plt.tight_layout(); plt.show()


st = pd.read_csv(url('stroop_dataset.csv'))
st['RT'] = pd.to_numeric(st['RT'], errors='coerce')
st['subj'] = pd.to_numeric(st['subj'], errors='coerce')
st = st.dropna(subset=['RT', 'subj', 'condition'])
print(f"Stroop: {st['subj'].nunique()} subjects, {len(st)} trials")

# Diagnostic 1: constant coefficient of variation (CV = sd/mean) -- VIOLATED here.
print('  per-condition mean / sd / CV  (Gamma assumes CV constant):')
for cond, s in st.groupby('condition')['RT']:
    print(f'    {cond:12s} mean = {s.mean():6.1f} ms   sd = {s.std():6.1f}   '
          f'CV = {s.std()/s.mean():.3f}   n = {len(s)}')

# Gamma GLM with log link -> exp(beta) is a mean ratio (Congruent = reference).
gamma = sm.families.Gamma(link=sm.families.links.Log())
fit = smf.glm('RT ~ C(condition)', data=st, family=gamma).fit()
b = fit.params['C(condition)[T.Incongruent]']
print(f'\nGamma GLM  RT ~ condition:  beta_Incongruent = {b:.4f}  ->  '
      f'mean ratio exp(beta) = {np.exp(b):.3f}  (~{(np.exp(b)-1)*100:.0f}% slower)')

Stroop: 50 subjects, 3058 trials
  per-condition mean / sd / CV  (Gamma assumes CV constant):
    Congruent    mean =  563.1 ms   sd =  197.7   CV = 0.351   n = 1032
    Incongruent  mean =  618.8 ms   sd =  325.2   CV = 0.526   n = 2026

Gamma GLM  RT ~ condition:  beta_Incongruent = 0.0944  ->  mean ratio exp(beta) = 1.099  (~10% slower)


# Diagnostic 2: within-subject dependence. Because `condition` varies WITHIN each
# subject, cluster-robust SEs barely move the CONDITION effect -- the dependence
# inflates the INTERCEPT (grand-mean) SE instead.  Do NOT claim the condition SE
# is understated.
fit_cl = smf.glm('RT ~ C(condition)', data=st, family=gamma).fit(
    cov_type='cluster', cov_kwds={'groups': st['subj']})
ck = 'C(condition)[T.Incongruent]'
print('Within-subject dependence (repeated trials per subject):')
print(f'  SE(condition): naive = {fit.bse[ck]:.4f}, cluster = {fit_cl.bse[ck]:.4f}  '
      f'(ratio {fit_cl.bse[ck]/fit.bse[ck]:.2f} -- WITHIN-subject, barely changes)')
print(f'  SE(intercept): naive = {fit.bse["Intercept"]:.4f}, '
      f'cluster = {fit_cl.bse["Intercept"]:.4f}  '
      f'(ratio {fit_cl.bse["Intercept"]/fit.bse["Intercept"]:.2f} -- the grand-mean SE inflates)')

Within-subject dependence (repeated trials per subject):
  SE(condition): naive = 0.0181, cluster = 0.0178  (ratio 0.98 -- WITHIN-subject, barely changes)
  SE(intercept): naive = 0.0148, cluster = 0.0189  (ratio 1.28 -- the grand-mean SE inflates)


# Figure: RT distributions by condition (positive, right-skewed -> Gamma).
fig, ax = plt.subplots()
for cond, col in [('Congruent', GREEN), ('Incongruent', ORANGE)]:
    s = st.loc[st['condition'] == cond, 'RT']
    ax.hist(s, bins=40, range=(0, 2000), alpha=0.55, color=col, density=True,
            label=f'{cond}  (mean {s.mean():.0f} ms)')
    ax.axvline(s.mean(), color=col, lw=2, ls='--')
ax.set_xlabel('reaction time (ms)'); ax.set_ylabel('density')
ax.set_title('Stroop RTs: positive, right-skewed, incongruent slower'); ax.legend()
plt.tight_layout(); plt.show()


# Solution 1.
k, m = makes, attempts                     # full-season makes / attempts (from 3.1)
phat = k / m
I = m / (phat * (1 - phat))                # Fisher information for Binomial(m, p)
se = 1 / np.sqrt(I)                         # = sqrt(p(1-p)/m)
wald = (phat - 1.96*se, phat + 1.96*se)
wil = proportion_confint(k, m, alpha=0.05, method='wilson')
print(f'  full season {k}/{m}: p_hat = {phat:.4f}, SE = {se:.4f}')
print(f'  Wald   95% CI: [{wald[0]:.4f}, {wald[1]:.4f}]')
print(f'  Wilson 95% CI: [{wil[0]:.4f}, {wil[1]:.4f}]')
print('  They nearly coincide: large m and p_hat well inside (0,1) make the normal')
print('  approximation accurate -- the Wald pathology was specific to the n=7 boundary.')

  full season 299/324: p_hat = 0.9228, SE = 0.0148
  Wald   95% CI: [0.8938, 0.9519]
  Wilson 95% CI: [0.8886, 0.9472]
  They nearly coincide: large m and p_hat well inside (0,1) make the normal
  approximation accurate -- the Wald pathology was specific to the n=7 boundary.


# Solution 2.
rd = p_p - p_v
se_rd = np.sqrt(var_pv + var_pp)           # gradient is (+1,-1) -> just add variances
ci_rd = (rd - 1.96*se_rd, rd + 1.96*se_rd)
print(f'  risk difference p_p - p_v = {rd:.5f}  ({rd*1000:.2f} fewer cases per 1000)')
print(f'  delta-method SE = {se_rd:.5f}   95% CI = ({ci_rd[0]:.5f}, {ci_rd[1]:.5f})')
print('  The CI excludes 0 by a wide margin -> a clear protective effect, expressed on')
print('  the absolute (risk) scale rather than the relative (efficacy) scale.')

  risk difference p_p - p_v = 0.00840  (8.40 fewer cases per 1000)
  delta-method SE = 0.00071   95% CI = (0.00701, 0.00979)
  The CI excludes 0 by a wide margin -> a clear protective effect, expressed on
  the absolute (risk) scale rather than the relative (efficacy) scale.


# Solution 3.
rob = smf.ols('Sales ~ TV + Radio + Newspaper', data=adv).fit(cov_type='HC3')
print(f'  TV coef = {full.params["TV"]:.4f}  (identical: HC3 changes SEs, not betas)')
print(f'  classical: SE(TV) = {full.bse["TV"]:.5f}, t = {full.tvalues["TV"]:.2f}')
print(f'  HC3      : SE(TV) = {rob.bse["TV"]:.5f}, t = {rob.tvalues["TV"]:.2f}')
print('  TV stays overwhelmingly significant either way; here the robust SE is close')
print('  to classical, but reporting HC3 is cheap insurance against heteroskedasticity.')

  TV coef = 0.0458  (identical: HC3 changes SEs, not betas)
  classical: SE(TV) = 0.00139, t = 32.81
  HC3      : SE(TV) = 0.00196, t = 23.38
  TV stays overwhelmingly significant either way; here the robust SE is close
  to classical, but reporting HC3 is cheap insurance against heteroskedasticity.


# Solution 4.
prof = pd.DataFrame({'Sex': ['female', 'male'], 'Pclass': [1, 1], 'Age': [30, 30]})
prof['P(survive)'] = logit.predict(prof).round(3)
print(prof.to_string(index=False))
print('  A 1st-class 30-year-old woman is predicted to survive with high probability;')
print('  an otherwise-identical man far less likely -- the OR ~12.5 made concrete.')

   Sex  Pclass  Age  P(survive)
female       1   30       0.935
  male       1   30       0.534
  A 1st-class 30-year-old woman is predicted to survive with high probability;
  an otherwise-identical man far less likely -- the OR ~12.5 made concrete.

§	Topic	Real-data worked example
3.1	Exponential families	Bernoulli & Poisson from NBA free throws
3.2	Maximum likelihood	Curry's 7/7 binomial CI (Wald vs Wilson vs profile) + gradient benchmark
3.3	Sampling variability & the delta method	Pfizer vaccine efficacy (VE ≈ 95%)
3.4	Linear models	ISLR Advertising OLS + robust-SE coverage study
3.5	Generalized linear models	Titanic logistic + Stroop Gamma GLM
3.6	Summary & connections	formula table → Chapter 4 (the bootstrap)

Quantity	Formula	This chapter's example
Exp.-family density	$h(x)\exp\{\eta^\top T(x) - A(\eta)\}$	Bernoulli / Poisson free throws
Mean from log-partition	$\mathbb{E}[T] = A'(\eta)$	$\hat p$ as slope of $A$
MLE (binomial)	$\hat p = x/n$	Curry 7/7
Fisher information	$I(p) = n/[p(1-p)]$	$\mathrm{SE}(\hat p)$
Confidence interval	Wald / Wilson / profile	boundary $\hat p = 1$
Delta method	$\mathrm{Var}(g) \approx \nabla g^\top \Sigma \nabla g$	VE $= 95\%$, SE $\approx 0.018$
OLS estimator	$\hat\beta = (X^\top X)^{-1}X^\top y$	Advertising, TV $t \approx 32.8$
Robust (HC3) SE	sandwich $(X^\top X)^{-1} X^\top \hat\Omega X (X^\top X)^{-1}$	90.0% → 94.7% coverage
Logistic OR	$\exp(\beta_j)$	female vs male $\approx 12.5$
Gamma mean ratio	$\exp(\beta_j)$, log link	incongruent $\approx 1.10$

Chapter 3 — Parametric Inference¶

Learning outcomes¶

Section map (all core — nothing optional in Chapter 3)¶

Section 3.1 — Exponential Families¶

Section 3.2 — Maximum Likelihood Estimation¶

When the MLE has no closed form: supply the gradient¶

Section 3.3 — Sampling Variability and the Delta Method¶

Section 3.4 — Linear Models (OLS)¶

Robust (HC3) standard errors under heteroskedasticity¶

Section 3.5 — Generalized Linear Models¶

Logistic regression — Titanic survival¶

Gamma regression — Stroop reaction times¶

Exercises¶

Section 3.6 — Summary & connections forward¶