# =============================================================================
# CHAPTER 3: PARAMETRIC INFERENCE - UNIFIED IMPORTS
# =============================================================================

import numpy as np
from numpy.random import default_rng, SeedSequence
from numpy.linalg import inv, solve, eigvals, norm, cholesky, qr, svd

import scipy
from scipy import stats, integrate, special, linalg
from scipy.special import (logsumexp, gammaln, digamma, polygamma,
                           gamma as gamma_func, beta as beta_func,
                           comb, factorial, expit, logit)
from scipy.optimize import minimize, minimize_scalar, brentq, newton, fsolve
from scipy.stats import (norm as normal_dist, t as t_dist, chi2, f as f_dist,
                         poisson, binom, bernoulli, expon, gamma as gamma_dist,
                         beta as beta_dist, nbinom, uniform)

import pandas as pd
import statsmodels.api as sm
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod import families

import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
import matplotlib
import warnings
from typing import Callable, Tuple, Dict, List, Optional, Union
from dataclasses import dataclass
from abc import ABC, abstractmethod

# Version check
print("="*65)
print("CHAPTER 3: PARAMETRIC INFERENCE - ENVIRONMENT CHECK")
print("="*65)
print(f"NumPy: {np.__version__}")
print(f"SciPy: {scipy.__version__}")
print(f"Pandas: {pd.__version__}")
print("✓ All imports successful!")

# Plotting config
try:
    plt.style.use('seaborn-v0_8-whitegrid')
except:
    plt.style.use('seaborn-whitegrid')

plt.rcParams.update({'figure.figsize': (10, 6), 'font.size': 11, 'figure.dpi': 100})
COLORS = {'primary': '#1f77b4', 'secondary': '#ff7f0e', 'success': '#2ca02c', 
          'danger': '#d62728', 'purple': '#9467bd', 'gray': '#7f7f7f'}
MASTER_SEED = 42
print("✓ Plotting configuration complete!")

=================================================================
CHAPTER 3: PARAMETRIC INFERENCE - ENVIRONMENT CHECK
=================================================================
NumPy: 1.26.4
SciPy: 1.17.0
Pandas: 2.3.3
✓ All imports successful!
✓ Plotting configuration complete!

pip install statsmodels
# or
conda install statsmodels

# =============================================================================
# STATSMODELS QUICK DEMONSTRATION
# =============================================================================

def statsmodels_demo():
    """Demonstrate key statsmodels features used in this chapter."""
    print("="*70)
    print("STATSMODELS DEMONSTRATION")
    print("="*70)
    
    rng = default_rng(42)
    n = 100
    
    # Generate simple regression data
    x = rng.uniform(0, 10, n)
    y = 2 + 3*x + rng.normal(0, 2, n)
    X = sm.add_constant(x)  # Adds column of 1s for intercept
    
    print("\n1. ORDINARY LEAST SQUARES")
    print("-"*50)
    ols_model = sm.OLS(y, X)
    ols_results = ols_model.fit()
    
    print(f"Coefficients: {ols_results.params}")
    print(f"Std Errors:   {ols_results.bse}")
    print(f"R-squared:    {ols_results.rsquared:.4f}")
    
    print("\n2. ROBUST STANDARD ERRORS")
    print("-"*50)
    ols_hc = sm.OLS(y, X).fit(cov_type='HC3')  # Heteroskedasticity-robust
    print(f"Model-based SE(slope): {ols_results.bse[1]:.4f}")
    print(f"HC3 robust SE(slope):  {ols_hc.bse[1]:.4f}")
    
    print("\n3. GENERALIZED LINEAR MODEL (Logistic)")
    print("-"*50)
    # Binary outcome
    p = expit(-1 + 0.5*x)
    y_binary = rng.binomial(1, p)
    
    glm_model = sm.GLM(y_binary, X, family=families.Binomial())
    glm_results = glm_model.fit()
    
    print(f"Coefficients: {glm_results.params}")
    print(f"Odds ratios:  {np.exp(glm_results.params)}")
    print(f"Deviance:     {glm_results.deviance:.2f}")
    
    print("\n4. COMPREHENSIVE SUMMARY")
    print("-"*50)
    print("Call results.summary() for full output including:")
    print("  - R², Adj R², AIC, BIC")
    print("  - Coefficients with SE, t-stat, p-value, CI")
    print("  - Omnibus, Durbin-Watson, Jarque-Bera tests")
    
    print("\n" + "="*70)
    print("See https://www.statsmodels.org for complete documentation")
    print("="*70)

statsmodels_demo()

======================================================================
STATSMODELS DEMONSTRATION
======================================================================

1. ORDINARY LEAST SQUARES
--------------------------------------------------
Coefficients: [1.89739945 3.01532592]
Std Errors:   [0.40524764 0.07266589]
R-squared:    0.9462

2. ROBUST STANDARD ERRORS
--------------------------------------------------
Model-based SE(slope): 0.0727
HC3 robust SE(slope):  0.0708

3. GENERALIZED LINEAR MODEL (Logistic)
--------------------------------------------------
Coefficients: [-1.25396175  0.58015244]
Odds ratios:  [0.28537198 1.78631071]
Deviance:     83.78

4. COMPREHENSIVE SUMMARY
--------------------------------------------------
Call results.summary() for full output including:
  - R², Adj R², AIC, BIC
  - Coefficients with SE, t-stat, p-value, CI
  - Omnibus, Durbin-Watson, Jarque-Bera tests

======================================================================
See https://www.statsmodels.org for complete documentation
======================================================================

# =============================================================================
# 3.1 Exponential Family Demonstration
# =============================================================================

def demonstrate_exponential_families():
    """Show canonical forms and verify moment-generating property."""
    print("="*70)
    print("EXPONENTIAL FAMILY: MOMENT GENERATION FROM LOG-PARTITION")
    print("="*70)
    
    # Poisson example: η = log(λ), A(η) = exp(η)
    print("\nPOISSON DISTRIBUTION: η = log(λ), A(η) = exp(η)")
    print("-"*50)
    
    rng = default_rng(42)
    for lam in [2.0, 5.0, 10.0]:
        eta = np.log(lam)
        A_prime = np.exp(eta)      # E[X] = A'(η)
        A_double_prime = np.exp(eta)  # Var(X) = A''(η)
        
        samples = rng.poisson(lam, 50000)
        print(f"\nλ = {lam}:")
        print(f"  Theory: E[X] = A'(η) = {A_prime:.4f}, Var(X) = A''(η) = {A_double_prime:.4f}")
        print(f"  Sample: mean = {np.mean(samples):.4f}, var = {np.var(samples):.4f}")
    
    print("\n" + "="*70)
    print("✓ Log-partition function generates all moments!")
    print("="*70)

demonstrate_exponential_families()

======================================================================
EXPONENTIAL FAMILY: MOMENT GENERATION FROM LOG-PARTITION
======================================================================

POISSON DISTRIBUTION: η = log(λ), A(η) = exp(η)
--------------------------------------------------

λ = 2.0:
  Theory: E[X] = A'(η) = 2.0000, Var(X) = A''(η) = 2.0000
  Sample: mean = 2.0019, var = 2.0067

λ = 5.0:
  Theory: E[X] = A'(η) = 5.0000, Var(X) = A''(η) = 5.0000
  Sample: mean = 4.9972, var = 4.9405

λ = 10.0:
  Theory: E[X] = A'(η) = 10.0000, Var(X) = A''(η) = 10.0000
  Sample: mean = 9.9992, var = 9.9161

======================================================================
✓ Log-partition function generates all moments!
======================================================================

# =============================================================================
# 3.2 Newton-Raphson and Fisher Scoring for Gamma MLE
# =============================================================================

def gamma_mle_comparison():
    """Compare Newton-Raphson and Fisher Scoring for Gamma distribution."""
    print("="*70)
    print("GAMMA MLE: NEWTON-RAPHSON vs FISHER SCORING")
    print("="*70)
    
    rng = default_rng(42)
    alpha_true, beta_true = 3.0, 2.0
    data = rng.gamma(alpha_true, 1/beta_true, size=200)
    n = len(data)
    
    log_sum = np.sum(np.log(data))
    sum_x = np.sum(data)
    
    def log_lik(theta):
        alpha, beta = theta
        if alpha <= 0 or beta <= 0: return -np.inf
        return n*alpha*np.log(beta) - n*gammaln(alpha) + (alpha-1)*log_sum - beta*sum_x
    
    def score(theta):
        alpha, beta = theta
        return np.array([n*np.log(beta) - n*digamma(alpha) + log_sum,
                        n*alpha/beta - sum_x])
    
    def hessian(theta):
        alpha, beta = theta
        return np.array([[-n*polygamma(1, alpha), n/beta],
                        [n/beta, -n*alpha/beta**2]])
    
    # Method of moments initialization
    x_bar, s2 = np.mean(data), np.var(data)
    theta = np.array([x_bar**2/s2, x_bar/s2])
    
    print(f"\nTrue: α={alpha_true}, β={beta_true}")
    print(f"Initial (MoM): α={theta[0]:.4f}, β={theta[1]:.4f}")
    
    # Newton-Raphson
    for i in range(10):
        grad = score(theta)
        hess = hessian(theta)
        delta = np.linalg.solve(hess, -grad)
        theta = theta + delta
        if np.linalg.norm(delta) < 1e-8: break
    
    print(f"\nNewton-Raphson MLE: α̂={theta[0]:.4f}, β̂={theta[1]:.4f}")
    
    # Compare with scipy
    fit_alpha, _, fit_scale = gamma_dist.fit(data, floc=0)
    print(f"scipy.stats.gamma: α̂={fit_alpha:.4f}, β̂={1/fit_scale:.4f}")

gamma_mle_comparison()

======================================================================
GAMMA MLE: NEWTON-RAPHSON vs FISHER SCORING
======================================================================

True: α=3.0, β=2.0
Initial (MoM): α=3.1761, β=2.0482

Newton-Raphson MLE: α̂=3.2931, β̂=2.1236
scipy.stats.gamma: α̂=3.2931, β̂=2.1236

# =============================================================================
# 3.3 Delta Method Demonstration
# =============================================================================

def delta_method_demo():
    """Demonstrate delta method for transformed parameters."""
    print("="*70)
    print("DELTA METHOD: VARIANCE PROPAGATION")
    print("="*70)
    
    rng = default_rng(42)
    lambda_true = 0.5
    n = 100
    
    # Exponential data
    data = rng.exponential(1/lambda_true, n)
    x_bar = np.mean(data)
    lambda_hat = 1/x_bar
    
    # Delta method: g(x) = 1/x, g'(x) = -1/x²
    se_xbar = np.std(data)/np.sqrt(n)
    se_lambda_delta = (1/x_bar**2) * se_xbar
    
    print(f"\nExponential(λ={lambda_true}) data, n={n}")
    print(f"X̄ = {x_bar:.4f}, λ̂ = 1/X̄ = {lambda_hat:.4f}")
    print(f"\nDelta Method:")
    print(f"  SE(λ̂) = |g'(X̄)| × SE(X̄) = {se_lambda_delta:.4f}")
    
    # Verify via simulation
    lambda_estimates = [1/np.mean(rng.exponential(1/lambda_true, n)) for _ in range(10000)]
    print(f"\nSimulation verification (10,000 reps):")
    print(f"  Simulated SE(λ̂) = {np.std(lambda_estimates):.4f}")

delta_method_demo()

======================================================================
DELTA METHOD: VARIANCE PROPAGATION
======================================================================

Exponential(λ=0.5) data, n=100
X̄ = 1.7981, λ̂ = 1/X̄ = 0.5561

Delta Method:
  SE(λ̂) = |g'(X̄)| × SE(X̄) = 0.0547

Simulation verification (10,000 reps):
  Simulated SE(λ̂) = 0.0508

# =============================================================================
# 3.4 OLS Implementation
# =============================================================================

def ols_demonstration():
    """OLS from first principles."""
    print("="*70)
    print("OLS FROM FIRST PRINCIPLES")
    print("="*70)
    
    rng = default_rng(42)
    n = 100
    beta_true = np.array([2.0, 3.0, -1.5])
    sigma = 1.5
    
    x1, x2 = rng.uniform(0, 10, n), rng.uniform(-5, 5, n)
    X = np.column_stack([np.ones(n), x1, x2])
    y = X @ beta_true + rng.normal(0, sigma, n)
    
    # OLS via normal equations
    beta_hat = solve(X.T @ X, X.T @ y)
    
    # Variance estimation
    resid = y - X @ beta_hat
    sigma2_hat = np.sum(resid**2) / (n - 3)
    se_beta = np.sqrt(np.diag(sigma2_hat * inv(X.T @ X)))
    
    print(f"\nTrue model: y = {beta_true[0]} + {beta_true[1]}x₁ + {beta_true[2]}x₂ + ε")
    print(f"\n{'Param':<10} {'True':<12} {'Estimate':<12} {'SE':<12}")
    print("-"*50)
    for i, name in enumerate(['β₀', 'β₁', 'β₂']):
        print(f"{name:<10} {beta_true[i]:<12.4f} {beta_hat[i]:<12.4f} {se_beta[i]:<12.4f}")
    
    print(f"\nσ̂² = {sigma2_hat:.4f} (true σ² = {sigma**2:.4f})")

ols_demonstration()

======================================================================
OLS FROM FIRST PRINCIPLES
======================================================================

True model: y = 2.0 + 3.0x₁ + -1.5x₂ + ε

Param      True         Estimate     SE          
--------------------------------------------------
β₀         2.0000       1.2338       0.3050      
β₁         3.0000       3.1430       0.0548      
β₂         -1.5000      -1.4460      0.0514      

σ̂² = 2.1843 (true σ² = 2.2500)

# =============================================================================
# 3.5 Logistic Regression via IRLS
# =============================================================================

def logistic_irls():
    """Logistic regression via IRLS."""
    print("="*70)
    print("LOGISTIC REGRESSION VIA IRLS")
    print("="*70)
    
    rng = default_rng(42)
    n = 500
    x1, x2 = rng.normal(0, 1, n), rng.normal(0, 1, n)
    X = np.column_stack([np.ones(n), x1, x2])
    
    beta_true = np.array([-1.0, 2.0, -1.5])
    p_true = expit(X @ beta_true)
    y = rng.binomial(1, p_true)
    
    # IRLS
    beta = np.zeros(3)
    for iteration in range(25):
        eta = X @ beta
        mu = np.clip(expit(eta), 1e-10, 1-1e-10)
        W = np.diag(mu * (1 - mu))
        z = eta + (y - mu) / (mu * (1 - mu))
        beta_new = solve(X.T @ W @ X, X.T @ W @ z)
        if np.max(np.abs(beta_new - beta)) < 1e-8: break
        beta = beta_new
    
    se = np.sqrt(np.diag(inv(X.T @ W @ X)))
    
    print(f"\n{'Param':<10} {'True':<12} {'IRLS':<12} {'SE':<12} {'Odds Ratio':<12}")
    print("-"*60)
    for i, name in enumerate(['β₀', 'β₁', 'β₂']):
        print(f"{name:<10} {beta_true[i]:<12.4f} {beta[i]:<12.4f} {se[i]:<12.4f} {np.exp(beta[i]):<12.4f}")
    
    # Verify with statsmodels
    model = sm.GLM(y, X, family=families.Binomial()).fit()
    print(f"\nstatsmodels verification: β = {model.params.round(4)}")

logistic_irls()

======================================================================
LOGISTIC REGRESSION VIA IRLS
======================================================================

Param      True         IRLS         SE           Odds Ratio  
------------------------------------------------------------
β₀         -1.0000      -0.8573      0.1310       0.4243      
β₁         2.0000       1.7265       0.1791       5.6209      
β₂         -1.5000      -1.4663      0.1639       0.2308      

statsmodels verification: β = [-0.8573  1.7265 -1.4663]

# =============================================================================
# EXERCISE 3.1 SOLUTION: Beta as Exponential Family
# =============================================================================

def exercise_3_1_solution():
    """Beta distribution as exponential family with verification."""
    print("="*70)
    print("EXERCISE 3.1: BETA DISTRIBUTION AS EXPONENTIAL FAMILY")
    print("="*70)
    
    print("""
DERIVATION:
-----------
Beta PDF: f(x|α,β) = [Γ(α+β)/(Γ(α)Γ(β))] x^(α-1) (1-x)^(β-1)

Taking log and rearranging:
    log f = (α-1)log(x) + (β-1)log(1-x) - log B(α,β)

Exponential family form f(x) = h(x)exp{η'T(x) - A(η)}:
    η = (η₁, η₂) = (α-1, β-1)
    T(x) = (log x, log(1-x))
    A(η) = log Γ(η₁+1) + log Γ(η₂+1) - log Γ(η₁+η₂+2)
    h(x) = 1

Moment-generating property:
    E[log X] = ∂A/∂η₁ = ψ(α) - ψ(α+β)
    E[log(1-X)] = ∂A/∂η₂ = ψ(β) - ψ(α+β)
    """)
    
    # Numerical verification
    rng = default_rng(42)
    alpha, beta_param = 3.0, 5.0
    n_samples = 100000
    
    samples = rng.beta(alpha, beta_param, n_samples)
    
    # Theoretical vs Monte Carlo
    E_logx_theory = digamma(alpha) - digamma(alpha + beta_param)
    E_log1mx_theory = digamma(beta_param) - digamma(alpha + beta_param)
    
    E_logx_mc = np.mean(np.log(samples))
    E_log1mx_mc = np.mean(np.log(1 - samples))
    
    print(f"\nNumerical Verification (α={alpha}, β={beta_param}, n={n_samples:,}):")
    print(f"\n{'Quantity':<20} {'Theory ∇A(η)':<15} {'Monte Carlo':<15} {'Error':<15}")
    print("-"*65)
    print(f"{'E[log X]':<20} {E_logx_theory:<15.6f} {E_logx_mc:<15.6f} {abs(E_logx_theory-E_logx_mc):<15.6f}")
    print(f"{'E[log(1-X)]':<20} {E_log1mx_theory:<15.6f} {E_log1mx_mc:<15.6f} {abs(E_log1mx_theory-E_log1mx_mc):<15.6f}")
    
    print("\n" + "="*70)
    print("✓ Beta is a 2-parameter exponential family. Moments verified!")
    print("="*70)

exercise_3_1_solution()

======================================================================
EXERCISE 3.1: BETA DISTRIBUTION AS EXPONENTIAL FAMILY
======================================================================

DERIVATION:
-----------
Beta PDF: f(x|α,β) = [Γ(α+β)/(Γ(α)Γ(β))] x^(α-1) (1-x)^(β-1)

Taking log and rearranging:
    log f = (α-1)log(x) + (β-1)log(1-x) - log B(α,β)

Exponential family form f(x) = h(x)exp{η'T(x) - A(η)}:
    η = (η₁, η₂) = (α-1, β-1)
    T(x) = (log x, log(1-x))
    A(η) = log Γ(η₁+1) + log Γ(η₂+1) - log Γ(η₁+η₂+2)
    h(x) = 1

Moment-generating property:
    E[log X] = ∂A/∂η₁ = ψ(α) - ψ(α+β)
    E[log(1-X)] = ∂A/∂η₂ = ψ(β) - ψ(α+β)
    

Numerical Verification (α=3.0, β=5.0, n=100,000):

Quantity             Theory ∇A(η)    Monte Carlo     Error          
-----------------------------------------------------------------
E[log X]             -1.092857       -1.090536       0.002321       
E[log(1-X)]          -0.509524       -0.510890       0.001366       

======================================================================
✓ Beta is a 2-parameter exponential family. Moments verified!
======================================================================

# =============================================================================
# EXERCISE 3.2 SOLUTION: Gamma Profile Likelihood
# =============================================================================

def exercise_3_2_solution():
    """Gamma MLE via profile likelihood."""
    print("="*70)
    print("EXERCISE 3.2: GAMMA MLE VIA PROFILE LIKELIHOOD")
    print("="*70)
    
    rng = default_rng(42)
    alpha_true, beta_true = 2.5, 1.5
    n = 200
    data = rng.gamma(alpha_true, 1/beta_true, n)
    
    x_bar = np.mean(data)
    log_x_bar = np.mean(np.log(data))
    
    # Profile log-likelihood: only function of α
    def profile_ll(alpha):
        if alpha <= 0: return -np.inf
        return n * (alpha * np.log(alpha/x_bar) - gammaln(alpha) + (alpha-1)*log_x_bar - alpha)
    
    def profile_score(alpha):
        return n * (np.log(alpha/x_bar) + 1 - digamma(alpha) + log_x_bar)
    
    # Newton's method on profile likelihood
    alpha = x_bar**2 / np.var(data)  # MoM init
    print(f"\nInitial α (MoM): {alpha:.4f}")
    
    for i in range(15):
        score = profile_score(alpha)
        hess = n * (1/alpha - polygamma(1, alpha))
        alpha_new = alpha - score/hess
        alpha_new = max(alpha_new, 0.01)
        if abs(alpha_new - alpha) < 1e-10: break
        alpha = alpha_new
    
    beta_hat = alpha / x_bar
    
    print(f"\nProfile MLE Results:")
    print(f"  True:     α = {alpha_true:.4f}, β = {beta_true:.4f}")
    print(f"  Profile:  α̂ = {alpha:.4f}, β̂ = {beta_hat:.4f}")
    
    # Verify
    fit_alpha, _, fit_scale = gamma_dist.fit(data, floc=0)
    print(f"  scipy:    α̂ = {fit_alpha:.4f}, β̂ = {1/fit_scale:.4f}")
    
    # Standard errors via Fisher information
    I_aa = n * polygamma(1, alpha)
    I_ab = -n / beta_hat
    I_bb = n * alpha / beta_hat**2
    fisher = np.array([[I_aa, I_ab], [I_ab, I_bb]])
    var_mat = inv(fisher)
    
    print(f"\nStandard Errors:")
    print(f"  SE(α̂) = {np.sqrt(var_mat[0,0]):.4f}")
    print(f"  SE(β̂) = {np.sqrt(var_mat[1,1]):.4f}")
    
    print("\n" + "="*70)
    print("✓ Profile likelihood reduces 2D optimization to 1D!")
    print("="*70)

exercise_3_2_solution()

======================================================================
EXERCISE 3.2: GAMMA MLE VIA PROFILE LIKELIHOOD
======================================================================

Initial α (MoM): 2.6168

Profile MLE Results:
  True:     α = 2.5000, β = 1.5000
  Profile:  α̂ = 95.7507, β̂ = 55.4664
  scipy:    α̂ = 2.7180, β̂ = 1.5745

Standard Errors:
  SE(α̂) = 9.5585
  SE(β̂) = 5.5515

======================================================================
✓ Profile likelihood reduces 2D optimization to 1D!
======================================================================

/tmp/ipykernel_223755/128453705.py:34: RuntimeWarning: divide by zero encountered in scalar divide
  alpha_new = alpha - score/hess

# =============================================================================
# EXERCISE 3.3 SOLUTION: Delta Method for Odds Ratio
# =============================================================================

def exercise_3_3_solution():
    """Delta method for log odds ratio variance."""
    print("="*70)
    print("EXERCISE 3.3: DELTA METHOD FOR LOG ODDS RATIO")
    print("="*70)
    
    print("""
DERIVATION:
-----------
2×2 table with cells n₁₁, n₁₂, n₂₁, n₂₂

Log OR: g(n) = log(n₁₁) + log(n₂₂) - log(n₁₂) - log(n₂₁)

Gradient: ∇g = (1/n₁₁, -1/n₁₂, -1/n₂₁, 1/n₂₂)

Under Poisson approximation: Var(nᵢⱼ) ≈ nᵢⱼ

Delta method: Var(log OR) = Σᵢⱼ (∂g/∂nᵢⱼ)² × Var(nᵢⱼ)
            = 1/n₁₁ + 1/n₁₂ + 1/n₂₁ + 1/n₂₂
    """)
    
    # Example data
    n11, n12, n21, n22 = 40, 60, 80, 120
    
    print(f"\n2×2 Table Example:")
    print(f"              Event+  Event-")
    print(f"  Exposed+     {n11:>4}    {n12:>4}")
    print(f"  Exposed-     {n21:>4}    {n22:>4}")
    
    OR = (n11 * n22) / (n12 * n21)
    log_OR = np.log(OR)
    var_log_OR = 1/n11 + 1/n12 + 1/n21 + 1/n22
    se_log_OR = np.sqrt(var_log_OR)
    
    print(f"\nResults:")
    print(f"  OR = {OR:.4f}")
    print(f"  log(OR) = {log_OR:.4f}")
    print(f"  SE(log OR) = √(1/{n11} + 1/{n12} + 1/{n21} + 1/{n22}) = {se_log_OR:.4f}")
    print(f"  95% CI for OR: ({np.exp(log_OR - 1.96*se_log_OR):.4f}, {np.exp(log_OR + 1.96*se_log_OR):.4f})")
    
    # Simulation verification
    rng = default_rng(42)
    n_total = n11 + n12 + n21 + n22
    p_true = np.array([n11, n12, n21, n22]) / n_total
    
    log_ors = []
    for _ in range(10000):
        counts = rng.multinomial(n_total, p_true)
        if np.all(counts > 0):
            log_ors.append(np.log(counts[0]*counts[3]/(counts[1]*counts[2])))
    
    print(f"\nSimulation Verification (10,000 reps):")
    print(f"  Simulated SD(log OR) = {np.std(log_ors):.4f}")
    print(f"  Delta method SE = {se_log_OR:.4f}")
    print(f"  Ratio = {np.std(log_ors)/se_log_OR:.4f}")
    
    print("\n" + "="*70)
    print("✓ Classic formula Var(log OR) = Σ(1/nᵢⱼ) is delta method!")
    print("="*70)

exercise_3_3_solution()

======================================================================
EXERCISE 3.3: DELTA METHOD FOR LOG ODDS RATIO
======================================================================

DERIVATION:
-----------
2×2 table with cells n₁₁, n₁₂, n₂₁, n₂₂

Log OR: g(n) = log(n₁₁) + log(n₂₂) - log(n₁₂) - log(n₂₁)

Gradient: ∇g = (1/n₁₁, -1/n₁₂, -1/n₂₁, 1/n₂₂)

Under Poisson approximation: Var(nᵢⱼ) ≈ nᵢⱼ

Delta method: Var(log OR) = Σᵢⱼ (∂g/∂nᵢⱼ)² × Var(nᵢⱼ)
            = 1/n₁₁ + 1/n₁₂ + 1/n₂₁ + 1/n₂₂
    

2×2 Table Example:
              Event+  Event-
  Exposed+       40      60
  Exposed-       80     120

Results:
  OR = 1.0000
  log(OR) = 0.0000
  SE(log OR) = √(1/40 + 1/60 + 1/80 + 1/120) = 0.2500
  95% CI for OR: (0.6126, 1.6323)

Simulation Verification (10,000 reps):
  Simulated SD(log OR) = 0.2517
  Delta method SE = 0.2500
  Ratio = 1.0067

======================================================================
✓ Classic formula Var(log OR) = Σ(1/nᵢⱼ) is delta method!
======================================================================

# =============================================================================
# EXERCISE 3.4 SOLUTION: Gauss-Markov Violations
# =============================================================================

def exercise_3_4_solution():
    """Investigate Gauss-Markov violations via simulation."""
    print("="*70)
    print("EXERCISE 3.4: GAUSS-MARKOV VIOLATIONS")
    print("="*70)
    
    rng = default_rng(42)
    n, n_sim = 200, 2000
    beta_true = np.array([1.0, 2.0])
    
    # Fixed design
    x = np.linspace(1, 10, n)
    X = sm.add_constant(x)
    
    # Part 1: Heteroskedasticity
    print("\n" + "="*70)
    print("PART 1: HETEROSKEDASTICITY (Var(ε|x) = x²)")
    print("="*70)
    
    ols_ests, ols_ses, hc_ses = [], [], []
    for _ in range(n_sim):
        epsilon = rng.normal(0, x)  # SD proportional to x
        y = beta_true[0] + beta_true[1]*x + epsilon
        
        ols = sm.OLS(y, X).fit()
        ols_ests.append(ols.params[1])
        ols_ses.append(ols.bse[1])
        
        ols_hc = sm.OLS(y, X).fit(cov_type='HC1')
        hc_ses.append(ols_hc.bse[1])
    
    true_se = np.std(ols_ests)
    print(f"\nβ₁ estimation ({n_sim} simulations):")
    print(f"  Mean(β̂₁) = {np.mean(ols_ests):.4f} (true = {beta_true[1]:.4f}) - UNBIASED ✓")
    print(f"  True SE = {true_se:.4f}")
    print(f"  Model SE = {np.mean(ols_ses):.4f} (ratio = {np.mean(ols_ses)/true_se:.2f}) - WRONG!")
    print(f"  HC1 SE = {np.mean(hc_ses):.4f} (ratio = {np.mean(hc_ses)/true_se:.2f}) - CORRECT ✓")
    
    # Part 2: Autocorrelation
    print("\n" + "="*70)
    print("PART 2: AUTOCORRELATION (AR(1) with ρ=0.7)")
    print("="*70)
    
    rho = 0.7
    ols_ests_ar, ols_ses_ar, nw_ses = [], [], []
    
    for _ in range(n_sim):
        # Generate AR(1) errors
        eps = np.zeros(n)
        u = rng.normal(0, 1, n)
        eps[0] = u[0]/np.sqrt(1-rho**2)
        for t in range(1, n):
            eps[t] = rho*eps[t-1] + u[t]
        
        y = beta_true[0] + beta_true[1]*x + eps
        
        ols = sm.OLS(y, X).fit()
        ols_ests_ar.append(ols.params[1])
        ols_ses_ar.append(ols.bse[1])
        
        ols_nw = sm.OLS(y, X).fit(cov_type='HAC', cov_kwds={'maxlags': 5})
        nw_ses.append(ols_nw.bse[1])
    
    true_se_ar = np.std(ols_ests_ar)
    print(f"\nβ₁ estimation ({n_sim} simulations):")
    print(f"  Mean(β̂₁) = {np.mean(ols_ests_ar):.4f} (true = {beta_true[1]:.4f}) - UNBIASED ✓")
    print(f"  True SE = {true_se_ar:.4f}")
    print(f"  Model SE = {np.mean(ols_ses_ar):.4f} (ratio = {np.mean(ols_ses_ar)/true_se_ar:.2f}) - WRONG!")
    print(f"  Newey-West = {np.mean(nw_ses):.4f} (ratio = {np.mean(nw_ses)/true_se_ar:.2f}) - BETTER ✓")
    
    print("\n" + "="*70)
    print("SUMMARY: OLS is unbiased but SEs require correction!")
    print("  - Heteroskedasticity: Use HC (White) robust SEs")
    print("  - Autocorrelation: Use HAC (Newey-West) robust SEs")
    print("="*70)

exercise_3_4_solution()

======================================================================
EXERCISE 3.4: GAUSS-MARKOV VIOLATIONS
======================================================================

======================================================================
PART 1: HETEROSKEDASTICITY (Var(ε|x) = x²)
======================================================================

β₁ estimation (2000 simulations):
  Mean(β̂₁) = 2.0020 (true = 2.0000) - UNBIASED ✓
  True SE = 0.1711
  Model SE = 0.1646 (ratio = 0.96) - WRONG!
  HC1 SE = 0.1752 (ratio = 1.02) - CORRECT ✓

======================================================================
PART 2: AUTOCORRELATION (AR(1) with ρ=0.7)
======================================================================

β₁ estimation (2000 simulations):
  Mean(β̂₁) = 2.0009 (true = 2.0000) - UNBIASED ✓
  True SE = 0.0850
  Model SE = 0.0369 (ratio = 0.43) - WRONG!
  Newey-West = 0.0634 (ratio = 0.75) - BETTER ✓

======================================================================
SUMMARY: OLS is unbiased but SEs require correction!
  - Heteroskedasticity: Use HC (White) robust SEs
  - Autocorrelation: Use HAC (Newey-West) robust SEs
======================================================================

# =============================================================================
# EXERCISE 3.5 SOLUTION: Gamma Regression via IRLS
# =============================================================================

def exercise_3_5_solution():
    """Gamma regression with log link via IRLS."""
    print("="*70)
    print("EXERCISE 3.5: GAMMA REGRESSION VIA IRLS")
    print("="*70)
    
    print("""
GAMMA GLM WITH LOG LINK:
------------------------
Distribution: Y ~ Gamma(shape=α, scale=μ/α), E[Y]=μ, Var(Y)=μ²/α
Link: log(μ) = Xβ, so μ = exp(Xβ)
Variance function: V(μ) = μ²

IRLS components:
  η = Xβ
  μ = exp(η)
  g'(μ) = 1/μ
  W = 1/(V(μ)·g'(μ)²) = 1/(μ²·1/μ²) = 1  (constant weights!)
  z = η + (y-μ)/μ  (working response)
    """)
    
    rng = default_rng(42)
    n = 300
    
    # Predictors: age and BMI for medical costs
    age = rng.uniform(20, 70, n)
    bmi = rng.uniform(18, 35, n)
    X = np.column_stack([np.ones(n), age, bmi])
    
    # True model: log(cost) = 6 + 0.02*age + 0.05*bmi
    beta_true = np.array([6.0, 0.02, 0.05])
    mu_true = np.exp(X @ beta_true)
    shape = 5.0
    y = rng.gamma(shape, mu_true/shape, n)
    
    print(f"\nSimulated medical cost data (n={n}):")
    print(f"  True model: log(μ) = {beta_true[0]} + {beta_true[1]}·age + {beta_true[2]}·BMI")
    print(f"  Mean cost: ${np.mean(y):.0f}, Range: ${y.min():.0f} - ${y.max():.0f}")
    
    # IRLS for Gamma with log link
    beta = np.linalg.lstsq(X, np.log(y), rcond=None)[0]  # OLS on log(y) init
    
    for iteration in range(25):
        eta = X @ beta
        mu = np.exp(eta)
        z = eta + (y - mu)/mu  # Working response
        # W = I for Gamma log link, so this is just OLS
        beta_new = np.linalg.lstsq(X, z, rcond=None)[0]
        if np.max(np.abs(beta_new - beta)) < 1e-8: break
        beta = beta_new
    
    mu_final = np.exp(X @ beta)
    pearson_resid = (y - mu_final)/mu_final
    phi_hat = np.sum(pearson_resid**2)/(n - 3)
    se = np.sqrt(phi_hat * np.diag(inv(X.T @ X)))
    
    print(f"\nIRLS Results ({iteration+1} iterations):")
    print(f"\n{'Parameter':<12} {'True':<10} {'IRLS':<10} {'SE':<10}")
    print("-"*45)
    for i, name in enumerate(['Intercept', 'Age', 'BMI']):
        print(f"{name:<12} {beta_true[i]:<10.4f} {beta[i]:<10.4f} {se[i]:<10.4f}")
    
    print(f"\nDispersion: φ̂ = {phi_hat:.4f} (true 1/shape = {1/shape:.4f})")
    
    # Verify with statsmodels
    model = sm.GLM(y, X, family=families.Gamma(link=families.links.Log())).fit()
    print(f"\nstatsmodels verification: β = {model.params.round(4)}")
    
    print("\n" + "="*70)
    print("✓ Gamma with log link has constant weights - IRLS is just iterated OLS!")
    print("="*70)

exercise_3_5_solution()

======================================================================
EXERCISE 3.5: GAMMA REGRESSION VIA IRLS
======================================================================

GAMMA GLM WITH LOG LINK:
------------------------
Distribution: Y ~ Gamma(shape=α, scale=μ/α), E[Y]=μ, Var(Y)=μ²/α
Link: log(μ) = Xβ, so μ = exp(Xβ)
Variance function: V(μ) = μ²

IRLS components:
  η = Xβ
  μ = exp(η)
  g'(μ) = 1/μ
  W = 1/(V(μ)·g'(μ)²) = 1/(μ²·1/μ²) = 1  (constant weights!)
  z = η + (y-μ)/μ  (working response)
    

Simulated medical cost data (n=300):
  True model: log(μ) = 6.0 + 0.02·age + 0.05·BMI
  Mean cost: $3804, Range: $507 - $14203

IRLS Results (6 iterations):

Parameter    True       IRLS       SE        
---------------------------------------------
Intercept    6.0000     5.9987     0.1611    
Age          0.0200     0.0205     0.0018    
BMI          0.0500     0.0475     0.0053    

Dispersion: φ̂ = 0.1990 (true 1/shape = 0.2000)

statsmodels verification: β = [5.9987 0.0205 0.0475]

======================================================================
✓ Gamma with log link has constant weights - IRLS is just iterated OLS!
======================================================================

# =============================================================================
# EXERCISE 3.6 SOLUTION: Complete GLM Analysis
# =============================================================================

def exercise_3_6_solution():
    """Complete GLM analysis workflow for hospital readmission."""
    print("="*70)
    print("EXERCISE 3.6: COMPLETE GLM ANALYSIS WORKFLOW")
    print("="*70)
    
    rng = default_rng(42)
    n = 1000
    
    # Simulate predictors
    length_stay = rng.exponential(5, n) + 1  # Days
    n_comorbid = rng.poisson(2, n)           # Number of comorbidities
    age = rng.normal(65, 12, n)              # Age
    education = rng.binomial(1, 0.6, n)      # Discharge education (binary)
    
    X = np.column_stack([np.ones(n), length_stay, n_comorbid, age, education])
    
    # True model
    beta_true = np.array([-2.5, 0.05, 0.3, 0.02, -0.5])
    p_true = expit(X @ beta_true)
    readmit = rng.binomial(1, p_true)
    
    print(f"\nSimulated hospital readmission data:")
    print(f"  n = {n}, Readmission rate = {100*np.mean(readmit):.1f}%")
    print(f"\nTrue model: logit(p) = {beta_true[0]:.1f} + {beta_true[1]:.2f}·LOS + {beta_true[2]:.1f}·comorbid + {beta_true[3]:.2f}·age + {beta_true[4]:.1f}·education")
    
    # Fit model
    model = sm.GLM(readmit, X, family=families.Binomial()).fit()
    
    print(f"\n{'='*70}")
    print("MODEL SUMMARY")
    print("="*70)
    print(f"\n{'Variable':<15} {'True β':<10} {'β̂':<10} {'SE':<10} {'OR':<10} {'p-value':<10}")
    print("-"*70)
    names = ['Intercept', 'LOS', 'Comorbid', 'Age', 'Education']
    for i, name in enumerate(names):
        print(f"{name:<15} {beta_true[i]:<10.3f} {model.params[i]:<10.3f} {model.bse[i]:<10.3f} {np.exp(model.params[i]):<10.3f} {model.pvalues[i]:<10.4f}")
    
    # Model fit
    print(f"\n{'='*70}")
    print("MODEL FIT DIAGNOSTICS")
    print("="*70)
    print(f"  Deviance = {model.deviance:.2f}")
    print(f"  Pearson χ² = {model.pearson_chi2:.2f}")
    print(f"  df = {model.df_resid}")
    print(f"  Deviance/df = {model.deviance/model.df_resid:.3f} (should be ≈ 1)")
    
    # LRT for education effect
    model_reduced = sm.GLM(readmit, X[:, :-1], family=families.Binomial()).fit()
    lrt_stat = 2 * (model.llf - model_reduced.llf)
    lrt_pval = 1 - chi2.cdf(lrt_stat, df=1)
    
    print(f"\n{'='*70}")
    print("LIKELIHOOD RATIO TEST FOR EDUCATION EFFECT")
    print("="*70)
    print(f"  H₀: β_education = 0")
    print(f"  LRT statistic = {lrt_stat:.4f}")
    print(f"  p-value = {lrt_pval:.4f}")
    print(f"  Conclusion: {'Reject H₀' if lrt_pval < 0.05 else 'Fail to reject H₀'} at α=0.05")
    
    # Key interpretation
    print(f"\n{'='*70}")
    print("INTERPRETATION")
    print("="*70)
    print(f"  • Each additional day in hospital increases odds by {100*(np.exp(model.params[1])-1):.1f}%")
    print(f"  • Each additional comorbidity increases odds by {100*(np.exp(model.params[2])-1):.0f}%")
    print(f"  • Discharge education REDUCES odds by {100*(1-np.exp(model.params[4])):.0f}%")
    
    print("\n" + "="*70)
    print("✓ Complete GLM analysis workflow demonstrated!")
    print("="*70)

exercise_3_6_solution()

======================================================================
EXERCISE 3.6: COMPLETE GLM ANALYSIS WORKFLOW
======================================================================

Simulated hospital readmission data:
  n = 1000, Readmission rate = 35.6%

True model: logit(p) = -2.5 + 0.05·LOS + 0.3·comorbid + 0.02·age + -0.5·education

======================================================================
MODEL SUMMARY
======================================================================

Variable        True β     β̂         SE         OR         p-value   
----------------------------------------------------------------------
Intercept       -2.500     -2.514     0.432      0.081      0.0000    
LOS             0.050      0.052      0.013      1.054      0.0001    
Comorbid        0.300      0.339      0.049      1.404      0.0000    
Age             0.020      0.019      0.006      1.019      0.0011    
Education       -0.500     -0.604     0.140      0.547      0.0000    

======================================================================
MODEL FIT DIAGNOSTICS
======================================================================
  Deviance = 1215.26
  Pearson χ² = 996.28
  df = 995
  Deviance/df = 1.221 (should be ≈ 1)

======================================================================
LIKELIHOOD RATIO TEST FOR EDUCATION EFFECT
======================================================================
  H₀: β_education = 0
  LRT statistic = 18.6549
  p-value = 0.0000
  Conclusion: Reject H₀ at α=0.05

======================================================================
INTERPRETATION
======================================================================
  • Each additional day in hospital increases odds by 5.4%
  • Each additional comorbidity increases odds by 40%
  • Discharge education REDUCES odds by 45%

======================================================================
✓ Complete GLM analysis workflow demonstrated!
======================================================================

Section	Topic	Key Methods
3.1	Exponential Families	Canonical form, log-partition function, sufficiency
3.2	Maximum Likelihood	Score function, Fisher information, Newton-Raphson
3.3	Sampling Variability	Bias, variance, delta method, sandwich estimators
3.4	Linear Models	OLS, Gauss-Markov, t-tests, F-tests
3.5	Generalized Linear Models	Link functions, IRLS, deviance, diagnostics

Module	Purpose	Key Classes/Functions
`statsmodels.api`	Main API	`OLS`, `WLS`, `GLM`, `add_constant`
`statsmodels.genmod.families`	GLM distributions	`Gaussian`, `Binomial`, `Poisson`, `Gamma`
`statsmodels.genmod.families.links`	Link functions	`Log`, `Logit`, `Identity`, `InversePower`
Robust covariance	Robust SEs	`cov_type='HC0'`, `'HC1'`, `'HC3'`, `'HAC'`

Feature	scipy.stats	statsmodels
MLE fitting	Basic (`.fit()`)	Full inference (SE, CI, tests)
Model summary	None	Comprehensive `.summary()`
GLM support	None	Full exponential family
Robust SEs	None	HC0-HC3, HAC (Newey-West)
Diagnostics	Limited	Residuals, influence, VIF
Time series	None	ARIMA, VAR, state space

Section	Core Concept	Key Formula
3.1	Exponential Families	$f(x	\eta) = h(x)\exp{\eta^\top T(x) - A(\eta)}$
3.2	Maximum Likelihood	$U(\hat{\theta}) = 0$, asymptotic normality
3.3	Sampling Variability	Delta method: $\text{Var}(g(\hat{\theta})) \approx [g'(\theta)]^2 \text{Var}(\hat{\theta})$
3.4	Linear Models	$\hat{\beta} = (X'X)^{-1}X'y$ (BLUE under Gauss-Markov)
3.5	GLMs	$g(\mu) = X\beta$, IRLS algorithm

Chapter 3: Parametric Inference and Likelihood Methods¶

Learning Objectives¶

Introduction to statsmodels¶

What is statsmodels?¶

Key Features Used in This Chapter¶

Installation¶

Basic Usage Pattern¶

Documentation Links¶

Citation¶

Why statsmodels vs. scipy.stats?¶

Section 3.1: Exponential Families¶

Section 3.2: Maximum Likelihood Estimation¶

Section 3.3: Sampling Variability¶

Section 3.4: Linear Models¶

Section 3.5: Generalized Linear Models¶

🎯 Chapter 3 Exercises — Complete Solutions¶

Exercise 3.1: Beta Distribution as Exponential Family¶

Exercise 3.2: Gamma MLE via Profile Likelihood¶

Exercise 3.3: Delta Method for Log Odds Ratio¶

Exercise 3.4: Gauss-Markov Violations¶

Exercise 3.5: Gamma Regression via IRLS¶

Exercise 3.6: Complete GLM Analysis Workflow¶

Chapter 3 Summary¶

Key Concepts¶

Connections Forward¶