import numpy as np
import matplotlib.pyplot as plt
import sympy as sp
from scipy import stats
from scipy.optimize import approx_fprime, minimize_scalar
from scipy.integrate import quad
from scipy.special import gammaln, digamma, polygamma
%matplotlib inline

np.random.seed(418)            # global reproducibility (STYLE.md contract)
plt.rcParams.update({
    'figure.figsize': (7.2, 4.2), 'figure.dpi': 110,
    'axes.grid': True, 'grid.alpha': 0.25,
    'axes.spines.top': False, 'axes.spines.right': False,
    'font.size': 11,
})
BLUE, RED, GREEN, ORANGE, PURPLE = '#4C72B0', '#C44E52', '#55A868', '#DD8452', '#8172B3'

# This appendix has NO external data: every result is a reproducible simulation or a
# symbolic identity. Each worked example re-seeds its own default_rng(42) -- the seed
# the appendix documents -- so the printed numbers match the RST to the last digit.
print('environment ready · numpy', np.__version__, '· sympy', sp.__version__)

environment ready · numpy 1.26.4 · sympy 1.13.2


# SymPy: derive the per-observation Poisson score from log f, then sum.
lam, xsym, nsym, xbar_s = sp.symbols('lambda x n xbar', positive=True)
logf = xsym*sp.log(lam) - lam - sp.loggamma(xsym + 1)   # log Poisson pmf
score_obs = sp.simplify(sp.diff(logf, lam))             # per-observation score
score_sample = sp.simplify(nsym*(xbar_s/lam - 1))       # sum over n iid obs
print('SymPy per-observation score  d/dlambda log f =', score_obs, '  = x/lambda - 1')
print('Sample score U(lambda) = sum_i (x_i/lambda - 1) = n*(xbar/lambda - 1) =', score_sample)
print('Solve U=0  ->  lambda_hat =', sp.solve(sp.Eq(score_sample, 0), lam)[0], '= xbar  (the MLE)')

SymPy per-observation score  d/dlambda log f = (-lambda + x)/lambda   = x/lambda - 1
Sample score U(lambda) = sum_i (x_i/lambda - 1) = n*(xbar/lambda - 1) = n*(-lambda + xbar)/lambda
Solve U=0  ->  lambda_hat = xbar = xbar  (the MLE)


# Numeric check: analytic score vs finite-difference derivative of the log-likelihood.
rng = np.random.default_rng(42)
n = 50
data = rng.poisson(3.5, n)
sum_x = int(np.sum(data)); xbar = np.mean(data)

def poisson_loglik(lam_arr):
    lam = lam_arr[0]
    if lam <= 0:
        return -1e20
    return float(np.sum(data * np.log(lam) - lam))      # drop constant log(x!) term
def score_analytical(lam):
    return sum_x / lam - n

print(f'Poisson score verification (n={n}, sum_x={sum_x}, xbar={xbar:.2f})')
print(f"{'lambda':>8}  {'analytic':>12}  {'finite-diff':>12}  {'|diff|':>10}")
print('-' * 48)
for lam_v in [2.0, 3.0, xbar, 4.0, 5.0]:
    s_an = score_analytical(lam_v)
    s_fd = approx_fprime(np.array([lam_v]), poisson_loglik, 1e-8)[0]
    print(f'{lam_v:>8.2f}  {s_an:>12.6f}  {s_fd:>12.6f}  {abs(s_an - s_fd):>10.2e}')
print(f'\nMLE lambda_hat = xbar = {xbar:.2f};  score at MLE U({xbar:.2f}) = {score_analytical(xbar):.6f}')

Poisson score verification (n=50, sum_x=171, xbar=3.42)
  lambda      analytic   finite-diff      |diff|
------------------------------------------------
    2.00     35.500000     35.500000    3.11e-07
    3.00      7.000000      6.999998    2.22e-06
    3.42      0.000000     -0.000004    3.55e-06
    4.00     -7.250000     -7.249999    6.00e-07
    5.00    -15.800000    -15.799997    2.86e-06

MLE lambda_hat = xbar = 3.42;  score at MLE U(3.42) = 0.000000


# Tabulate the log-likelihood, score, and curvature around the MLE.
print(f'Poisson log-likelihood analysis (n={n}, sum_x={sum_x}, xbar={xbar:.2f})')
print(f"{'lambda':>7}  {'loglik l':>11}  {'score l1':>11}  {'curv l2':>11}")
print('-' * 46)
for lam_v in [2.0, 2.5, 3.0, xbar, 3.5, 4.0, 4.5]:
    ll = sum_x * np.log(lam_v) - n * lam_v
    sc = sum_x / lam_v - n
    d2 = -sum_x / lam_v**2
    print(f'{lam_v:>7.2f}  {ll:>11.4f}  {sc:>11.4f}  {d2:>11.4f}')
J_obs = sum_x / xbar**2
print(f'\nAt MLE lambda_hat={xbar:.2f}:  score=0 (check {sum_x/xbar - n:.6f}),  '
      f"l''={-J_obs:.4f} < 0 (maximum)")
print(f'  observed information J = {J_obs:.4f}  ->  SE ~ 1/sqrt(J) = {1/np.sqrt(J_obs):.4f}')

Poisson log-likelihood analysis (n=50, sum_x=171, xbar=3.42)
 lambda     loglik l     score l1      curv l2
----------------------------------------------
   2.00      18.5282      35.5000     -42.7500
   2.50      31.6857      18.4000     -27.3600
   3.00      37.8627       7.0000     -19.0000
   3.42      39.2685       0.0000     -14.6199
   3.50      39.2225      -1.1429     -13.9592
   4.00      37.0563      -7.2500     -10.6875
   4.50      32.1972     -12.0000      -8.4444

At MLE lambda_hat=3.42:  score=0 (check 0.000000),  l''=-14.6199 < 0 (maximum)
  observed information J = 14.6199  ->  SE ~ 1/sqrt(J) = 0.2615


# Figure: the Poisson log-likelihood is concave (left); its score crosses zero at the
# MLE (right) -- 'set the derivative to zero' made visual.
grid = np.linspace(2.0, 5.0, 400)
ll = sum_x * np.log(grid) - n * grid
sc = sum_x / grid - n
fig, (a1, a2) = plt.subplots(1, 2, figsize=(10.5, 4.0))
a1.plot(grid, ll, color=BLUE, lw=2.2)
a1.axvline(xbar, color=RED, ls='--', lw=1.5, label=fr'MLE $\hat\lambda=\bar X={xbar:.2f}$')
a1.plot([xbar], [sum_x*np.log(xbar) - n*xbar], 'o', color=RED, ms=8)
a1.set_xlabel(r'$\lambda$'); a1.set_ylabel(r'$\ell(\lambda)$')
a1.set_title('Poisson log-likelihood (strictly concave)'); a1.legend()
a2.plot(grid, sc, color=GREEN, lw=2.2, label=r"score $U(\lambda)=\sum x_i/\lambda - n$")
a2.axhline(0, color='k', lw=1)
a2.axvline(xbar, color=RED, ls='--', lw=1.5)
a2.plot([xbar], [0], 'o', color=RED, ms=8, label=fr'$U(\hat\lambda)=0$')
a2.set_xlabel(r'$\lambda$'); a2.set_ylabel(r'$U(\lambda)$')
a2.set_title('Score crosses zero at the MLE'); a2.legend()
plt.tight_layout(); plt.show()


rng = np.random.default_rng(42)
alpha, beta = 3.0, 2.0
# Analytical: E[X^2] = Var(X) + (E[X])^2 = alpha/beta^2 + (alpha/beta)^2
analytical = alpha / beta**2 + (alpha / beta)**2
integrand = lambda x: x**2 * stats.gamma.pdf(x, a=alpha, scale=1/beta)
quad_result, _ = quad(integrand, 0, np.inf)
samples = rng.gamma(alpha, 1/beta, 100000)
mc_result = np.mean(samples**2)
print('Verification: E[X^2] for Gamma(alpha=3, beta=2)')
print(f"{'method':>22}  {'E[X^2]':>10}  {'error':>10}")
print('-' * 46)
print(f"{'analytical':>22}  {analytical:>10.6f}  {'--':>10}")
print(f"{'scipy.integrate.quad':>22}  {quad_result:>10.6f}  {abs(quad_result-analytical):>10.2e}")
print(f"{'Monte Carlo (n=1e5)':>22}  {mc_result:>10.6f}  {abs(mc_result-analytical):>10.2e}")

Verification: E[X^2] for Gamma(alpha=3, beta=2)
                method      E[X^2]       error
----------------------------------------------
            analytical    3.000000          --
  scipy.integrate.quad    3.000000    0.00e+00
   Monte Carlo (n=1e5)    3.010147    1.01e-02


# SymPy evaluates the (improper) integral symbolically -> alpha/beta.
a_s, b_s, x_s = sp.symbols('alpha beta x', positive=True)
pdf = b_s**a_s / sp.gamma(a_s) * x_s**(a_s - 1) * sp.exp(-b_s * x_s)
EX_sym = sp.simplify(sp.integrate(x_s * pdf, (x_s, 0, sp.oo)))
EX2_sym = sp.simplify(sp.integrate(x_s**2 * pdf, (x_s, 0, sp.oo)))
print('SymPy integration by parts (closed form):')
print('  E[X]   = int_0^inf x f(x) dx   =', EX_sym, '  (= alpha/beta)')
print('  E[X^2] = int_0^inf x^2 f(x) dx =', EX2_sym, '  (= alpha(alpha+1)/beta^2)')

rng = np.random.default_rng(42)
samples = rng.gamma(3.0, 1/2.0, 100000)
print(f'\nSimulation check  E[X] for Gamma(3,2):')
print(f'  theoretical alpha/beta = {3.0/2.0:.6f}')
print(f'  simulated mean         = {np.mean(samples):.6f}')
print(f'  difference             = {abs(np.mean(samples) - 3.0/2.0):.6f}')

SymPy integration by parts (closed form):
  E[X]   = int_0^inf x f(x) dx   = alpha/beta   (= alpha/beta)
  E[X^2] = int_0^inf x^2 f(x) dx = alpha*(alpha + 1)/beta**2   (= alpha(alpha+1)/beta^2)

Simulation check  E[X] for Gamma(3,2):
  theoretical alpha/beta = 1.500000
  simulated mean         = 1.500859
  difference             = 0.000859


# Inverse-CDF transform U -> X = -log(U)/lambda, then compare to the Exp(lambda) pdf.
rng = np.random.default_rng(42)
lam_e = 1.5
U = rng.uniform(0, 1, 100000)
X = -np.log(U) / lam_e                       # inverse-CDF construction
print(f'Inverse-CDF Exp(lambda={lam_e}):  X = -log(U)/lambda')
print(f'  sample mean = {X.mean():.4f}   theory 1/lambda = {1/lam_e:.4f}')
print(f'  sample var  = {X.var():.4f}   theory 1/lambda^2 = {1/lam_e**2:.4f}')
ks = stats.kstest(X, 'expon', args=(0, 1/lam_e))
print(f'  KS test vs Exp: stat = {ks.statistic:.4f}, p = {ks.pvalue:.3f}  (cannot reject)')

fig, ax = plt.subplots(figsize=(7.2, 3.8))
ax.hist(X, bins=60, range=(0, 5), density=True, color=BLUE, alpha=0.55,
        edgecolor='white', label='transformed $-\\log(U)/\\lambda$')
xs = np.linspace(0, 5, 300)
ax.plot(xs, lam_e * np.exp(-lam_e * xs), color=RED, lw=2.4,
        label=r'$\lambda e^{-\lambda x}$ density')
ax.set_xlabel('x'); ax.set_ylabel('density')
ax.set_title('Change of variables: the Jacobian reshapes Uniform into Exponential')
ax.legend(); plt.tight_layout(); plt.show()

Inverse-CDF Exp(lambda=1.5):  X = -log(U)/lambda
  sample mean = 0.6662   theory 1/lambda = 0.6667
  sample var  = 0.4480   theory 1/lambda^2 = 0.4444
  KS test vs Exp: stat = 0.0030, p = 0.331  (cannot reject)


# Running means: Normal converges to its mean; Cauchy wanders forever (no mean).
rng = np.random.default_rng(42)
M = 20000
normal_draws = rng.normal(0, 1, M)
cauchy_draws = stats.cauchy.rvs(size=M, random_state=rng)
run_norm = np.cumsum(normal_draws) / np.arange(1, M + 1)
run_cau = np.cumsum(cauchy_draws) / np.arange(1, M + 1)
print('Running-mean behaviour over', M, 'draws:')
print(f'  Normal(0,1):  running mean at n=M is {run_norm[-1]:+.4f}  '
      f'(range last 5000: [{run_norm[-5000:].min():+.3f}, {run_norm[-5000:].max():+.3f}])')
print(f'  Cauchy(0,1):  running mean at n=M is {run_cau[-1]:+.4f}  '
      f'(range last 5000: [{run_cau[-5000:].min():+.3f}, {run_cau[-5000:].max():+.3f}])')
print('  The Cauchy running mean does NOT converge: E[X] is an undefined (divergent) integral.')

fig, ax = plt.subplots(figsize=(7.6, 3.8))
k = np.arange(1, M + 1)
ax.plot(k, run_norm, color=BLUE, lw=1.3, label='Normal(0,1) running mean')
ax.plot(k, run_cau, color=RED, lw=1.0, alpha=0.85, label='Cauchy(0,1) running mean')
ax.axhline(0, color='k', lw=0.8, ls=':')
ax.set_xscale('log'); ax.set_ylim(-6, 6)
ax.set_xlabel('number of draws (log scale)'); ax.set_ylabel('running mean')
ax.set_title('Convergent vs divergent: a mean that exists, and one that does not')
ax.legend(); plt.tight_layout(); plt.show()

Running-mean behaviour over 20000 draws:
  Normal(0,1):  running mean at n=M is +0.0051  (range last 5000: [-0.010, +0.005])
  Cauchy(0,1):  running mean at n=M is -0.0422  (range last 5000: [-0.344, -0.017])
  The Cauchy running mean does NOT converge: E[X] is an undefined (divergent) integral.


# SymPy supplies the Taylor coefficients; we evaluate orders 1-4 numerically.
xs = sp.symbols('x')
print('SymPy series of log(1+x) about 0:', sp.series(sp.log(1 + xs), xs, 0, 5))

def taylor_log1px(x, order):
    return sum(((-1) ** (k + 1)) * x**k / k for k in range(1, order + 1))

print('\nTaylor approximations of log(1+x):')
print(f"{'x':>5}  {'exact':>9}  {'T1':>9}  {'T2':>9}  {'T3':>9}  {'T4':>9}")
print('-' * 58)
for x in [0.1, 0.3, 0.5, 0.8, 1.0, 1.5]:
    ex = np.log(1 + x); v = [taylor_log1px(x, k) for k in (1, 2, 3, 4)]
    print(f'{x:>5.1f}  {ex:>9.6f}  {v[0]:>9.6f}  {v[1]:>9.6f}  {v[2]:>9.6f}  {v[3]:>9.6f}')
print('\nabsolute errors |log(1+x) - Tn(x)|:')
print(f"{'x':>5}  {'|E1|':>9}  {'|E2|':>9}  {'|E3|':>9}  {'|E4|':>9}")
print('-' * 46)
for x in [0.1, 0.3, 0.5, 0.8, 1.0, 1.5]:
    ex = np.log(1 + x); e = [abs(ex - taylor_log1px(x, k)) for k in (1, 2, 3, 4)]
    print(f'{x:>5.1f}  {e[0]:>9.2e}  {e[1]:>9.2e}  {e[2]:>9.2e}  {e[3]:>9.2e}')

SymPy series of log(1+x) about 0: x - x**2/2 + x**3/3 - x**4/4 + O(x**5)

Taylor approximations of log(1+x):
    x      exact         T1         T2         T3         T4
----------------------------------------------------------
  0.1   0.095310   0.100000   0.095000   0.095333   0.095308
  0.3   0.262364   0.300000   0.255000   0.264000   0.261975
  0.5   0.405465   0.500000   0.375000   0.416667   0.401042
  0.8   0.587787   0.800000   0.480000   0.650667   0.548267
  1.0   0.693147   1.000000   0.500000   0.833333   0.583333
  1.5   0.916291   1.500000   0.375000   1.500000   0.234375

absolute errors |log(1+x) - Tn(x)|:
    x       |E1|       |E2|       |E3|       |E4|
----------------------------------------------
  0.1   4.69e-03   3.10e-04   2.32e-05   1.85e-06
  0.3   3.76e-02   7.36e-03   1.64e-03   3.89e-04
  0.5   9.45e-02   3.05e-02   1.12e-02   4.42e-03
  0.8   2.12e-01   1.08e-01   6.29e-02   3.95e-02
  1.0   3.07e-01   1.93e-01   1.40e-01   1.10e-01
  1.5   5.84e-01   5.41e-01   5.84e-01   6.82e-01


# Figure A.1: (a) the polynomials vs the exact curve; (b) error growth (log scale).
xx = np.linspace(-0.9, 1.8, 400)
fig, (a1, a2) = plt.subplots(1, 2, figsize=(11, 4.2))
a1.plot(xx, np.log(1 + xx), color='k', lw=2.6, label=r'$\log(1+x)$ (exact)')
for order, col in zip((1, 2, 3, 4), (RED, ORANGE, GREEN, BLUE)):
    a1.plot(xx, taylor_log1px(xx, order), lw=1.6, color=col, label=fr'$T_{order}$')
a1.axvline(1.0, color='gray', ls=':', lw=1.2, label=r'radius $|x|=1$')
a1.set_ylim(-3, 1.6); a1.set_xlabel('x'); a1.set_ylabel('value')
a1.set_title('Taylor polynomials of $\\log(1+x)$ about 0'); a1.legend(fontsize=8, loc='lower right')
xe = np.linspace(0.02, 0.98, 200)
for order, col in zip((1, 2, 3, 4), (RED, ORANGE, GREEN, BLUE)):
    a2.semilogy(xe, np.abs(np.log(1 + xe) - taylor_log1px(xe, order)), color=col,
                lw=1.8, label=fr'$|E_{order}|$')
a2.set_xlabel('x'); a2.set_ylabel('absolute error (log scale)')
a2.set_title(r'Error $\sim |x-a|^{\,n+1}$ (Lagrange remainder)'); a2.legend(fontsize=9)
plt.tight_layout(); plt.show()


# Delta method: Var(sqrt(Xbar)) ~ 1/(4n), independent of lambda. Simulated vs theory.
rng = np.random.default_rng(42)
lam = 3.0; n_reps = 50000
print('Delta method: Var(sqrt(Xbar)) for Poisson(lambda=3),  theory = 1/(4n)')
print(f"{'n':>6}  {'Var sim':>12}  {'1/(4n)':>10}  {'ratio':>8}")
print('-' * 42)
for n_ in [25, 100, 500]:
    sqrt_xbar = np.empty(n_reps)
    for i in range(n_reps):
        sqrt_xbar[i] = np.sqrt(np.mean(rng.poisson(lam, n_)))
    vs = np.var(sqrt_xbar); vt = 1.0 / (4 * n_)
    print(f'{n_:>6}  {vs:>12.6f}  {vt:>10.6f}  {vs/vt:>8.4f}')
print('  The simulated variance matches 1/(4n) to ~1%, and no longer depends on lambda.')

Delta method: Var(sqrt(Xbar)) for Poisson(lambda=3),  theory = 1/(4n)
     n       Var sim      1/(4n)     ratio
------------------------------------------
    25      0.009901    0.010000    0.9901
   100      0.002475    0.002500    0.9902
   500      0.000502    0.000500    1.0039
  The simulated variance matches 1/(4n) to ~1%, and no longer depends on lambda.


# Profile log-likelihood for a Gamma shape alpha (beta profiled out as alpha/xbar).
rng = np.random.default_rng(42)
n = 50
data = rng.gamma(4.0, 1/2.0, n)
xbar = np.mean(data)

def profile_loglik(alpha):
    if alpha <= 0:
        return -1e20
    beta = alpha / xbar
    return (n * (alpha * np.log(beta) - gammaln(alpha))
            + (alpha - 1) * np.sum(np.log(data)) - beta * np.sum(data))

res = minimize_scalar(lambda a: -profile_loglik(a), bounds=(0.5, 20), method='bounded')
alpha_hat = res.x; ll_max = profile_loglik(alpha_hat)
eps = 1e-5
J_obs = -(profile_loglik(alpha_hat + eps) - 2*profile_loglik(alpha_hat)
          + profile_loglik(alpha_hat - eps)) / eps**2
print(f'Profile log-likelihood (Gamma data, n={n}):  alpha_hat = {alpha_hat:.4f}, J = {J_obs:.4f}')
print(f"{'alpha':>7}  {'l exact':>11}  {'l quad':>11}  {'|diff|':>9}")
print('-' * 44)
for da in [-1.5, -1.0, -0.5, 0, 0.5, 1.0, 1.5]:
    a = alpha_hat + da
    le = profile_loglik(a); lq = ll_max - 0.5 * J_obs * da**2
    print(f'{a:>7.2f}  {le:>11.4f}  {lq:>11.4f}  {abs(le-lq):>9.4f}')

Profile log-likelihood (Gamma data, n=50):  alpha_hat = 6.6685, J = 0.5906
  alpha      l exact       l quad     |diff|
--------------------------------------------
   5.17     -52.5601     -52.4371     0.1230
   5.67     -52.1019     -52.0680     0.0339
   6.17     -51.8505     -51.8465     0.0040
   6.67     -51.7727     -51.7727     0.0000
   7.17     -51.8429     -51.8465     0.0036
   7.67     -52.0407     -52.0680     0.0273
   8.17     -52.3496     -52.4371     0.0875


# Wald statistic vs 2*log-likelihood-ratio: agree near the MLE, diverge far away.
print(f'Wald  J*(alpha_hat-alpha0)^2   vs   2*LR = 2[l(alpha_hat)-l(alpha0)]')
print(f"{'alpha0':>7}  {'Wald':>9}  {'2*LR':>9}  {'|diff|':>9}")
print('-' * 38)
for a0 in [3.0, 4.0, 5.0, 6.0, 7.0, 8.0]:
    wald = J_obs * (alpha_hat - a0)**2
    lr2 = 2 * (ll_max - profile_loglik(a0))
    print(f'{a0:>7.1f}  {wald:>9.4f}  {lr2:>9.4f}  {abs(wald-lr2):>9.4f}')

# Figure: exact profile log-lik vs its quadratic (Wald) approximation.
ag = np.linspace(alpha_hat - 3, alpha_hat + 3, 300)
le = np.array([profile_loglik(a) for a in ag])
lq = ll_max - 0.5 * J_obs * (ag - alpha_hat)**2
fig, ax = plt.subplots(figsize=(7.2, 4.0))
ax.plot(ag, le, color=BLUE, lw=2.4, label='exact profile log-likelihood')
ax.plot(ag, lq, color=RED, ls='--', lw=2.0, label='2nd-order (quadratic) approx')
ax.axvline(alpha_hat, color='k', ls=':', lw=1.2, label=fr'$\hat\alpha={alpha_hat:.2f}$')
ax.plot([alpha_hat], [ll_max], 'o', color='k', ms=7)
ax.set_ylim(ll_max - 8, ll_max + 0.6)
ax.set_xlabel(r'shape $\alpha$'); ax.set_ylabel('log-likelihood')
ax.set_title('Quadratic log-likelihood: exact near the MLE, degrades in the tails')
ax.legend(); plt.tight_layout(); plt.show()

Wald  J*(alpha_hat-alpha0)^2   vs   2*LR = 2[l(alpha_hat)-l(alpha0)]
 alpha0       Wald       2*LR     |diff|
--------------------------------------
    3.0     7.9483    13.2661     5.3178
    4.0     4.2056     5.8782     1.6725
    5.0     1.6442     1.9913     0.3471
    6.0     0.2639     0.2833     0.0194
    7.0     0.0649     0.0627     0.0022
    8.0     1.0471     0.9225     0.1245


# E[U(mu)] = 0 at the truth; nonzero (and signed toward the truth) elsewhere.
rng = np.random.default_rng(42)
n = 100000; mu_true = 5.0; sigma2 = 4.0
data = rng.normal(mu_true, np.sqrt(sigma2), n)
print(f'Score of Normal(mu, sigma^2=4),  n = {n:,}')
print(f"{'mu':>6}  {'E[U] theory':>14}  {'mean U sim':>14}")
print('-' * 40)
rows = []
for mu in [3.0, 4.0, 5.0, 6.0, 7.0]:
    mean_score = np.mean((data - mu) / sigma2)
    theory = (mu_true - mu) / sigma2
    rows.append((mu, theory, mean_score))
    print(f'{mu:>6.1f}  {theory:>14.6f}  {mean_score:>14.6f}')
print(f'\nAt true mu=5: mean score ~ 0 (sim noise {rows[2][2]:+.4f}); the score points toward the truth.')

# Figure: simulated mean score (points) on the theory line E[U(mu)] = (mu_true-mu)/sigma^2.
mug = np.linspace(2.5, 7.5, 200)
fig, ax = plt.subplots(figsize=(7.2, 3.9))
ax.plot(mug, (mu_true - mug) / sigma2, color=BLUE, lw=2.2, label=r'$\mathbb{E}[U(\mu)]=(\mu_0-\mu)/\sigma^2$')
ax.scatter([r[0] for r in rows], [r[2] for r in rows], color=RED, s=60, zorder=5,
           label='simulated mean score')
ax.axhline(0, color='k', lw=0.9, ls=':'); ax.axvline(mu_true, color=GREEN, lw=1.2, ls='--', label=r'true $\mu_0=5$')
ax.set_xlabel(r'$\mu$'); ax.set_ylabel('mean score'); ax.set_title('The score is centered at the true parameter')
ax.legend(); plt.tight_layout(); plt.show()

Score of Normal(mu, sigma^2=4),  n = 100,000
    mu     E[U] theory      mean U sim
----------------------------------------
   3.0        0.500000        0.497884
   4.0        0.250000        0.247884
   5.0        0.000000       -0.002116
   6.0       -0.250000       -0.252116
   7.0       -0.500000       -0.502116

At true mu=5: mean score ~ 0 (sim noise -0.0021); the score points toward the truth.


# Fisher information two ways: Var(U) (simulated) vs -E[l''] (analytic) for Exp(lambda).
rng = np.random.default_rng(42)
lam_true = 2.0; n_samples = 100000
samples = rng.exponential(1/lam_true, n_samples)
scores = 1/lam_true - samples
var_score = np.var(scores)
neg_E_lpp = 1 / lam_true**2
print(f'Fisher information equivalence: Exp(lambda={lam_true})')
print(f'  analytic I1(lambda) = 1/lambda^2        = {neg_E_lpp:.6f}')
print(f'  form 1: Var(U) from {n_samples:,} samples = {var_score:.6f}')
print(f'  form 2: -E[l\'\'(lambda)] = 1/lambda^2   = {neg_E_lpp:.6f}')
print(f'  ratio Var(U) / (-E[l\'\']) = {var_score/neg_E_lpp:.6f}  (~1, deviation is sim noise)')

Fisher information equivalence: Exp(lambda=2.0)
  analytic I1(lambda) = 1/lambda^2        = 0.250000
  form 1: Var(U) from 100,000 samples = 0.252651
  form 2: -E[l''(lambda)] = 1/lambda^2   = 0.250000
  ratio Var(U) / (-E[l'']) = 1.010606  (~1, deviation is sim noise)


# SymPy: score vector and Hessian of the per-observation Normal log-density.
mu, s2, x = sp.symbols('mu sigma2 x', real=True)
logf = -sp.log(2*sp.pi*s2)/2 - (x - mu)**2 / (2*s2)
g_mu = sp.simplify(sp.diff(logf, mu))
g_s2 = sp.simplify(sp.diff(logf, s2))
H = sp.simplify(sp.hessian(logf, (mu, s2)))
print('SymPy per-observation Normal score vector:')
print('  dl/dmu     =', g_mu, '   (sum -> n(xbar-mu)/sigma^2  ->  mu_hat = xbar)')
print('  dl/dsigma2 =', g_s2, '   (sum=0 -> sigma2_hat = mean((x-mu)^2))')
print('\nSymPy per-observation Hessian H (symmetric):')
print('  H[mu,mu]   =', H[0, 0])
print('  H[mu,s2]   =', H[0, 1], '  (E -> 0: mu and sigma^2 are orthogonal)')
print('  H[s2,s2]   =', H[1, 1])

SymPy per-observation Normal score vector:
  dl/dmu     = (-mu + x)/sigma2    (sum -> n(xbar-mu)/sigma^2  ->  mu_hat = xbar)
  dl/dsigma2 = (-sigma2 + (mu - x)**2)/(2*sigma2**2)    (sum=0 -> sigma2_hat = mean((x-mu)^2))

SymPy per-observation Hessian H (symmetric):
  H[mu,mu]   = -1/sigma2
  H[mu,s2]   = (mu - x)/sigma2**2   (E -> 0: mu and sigma^2 are orthogonal)
  H[s2,s2]   = (sigma2/2 - (mu - x)**2)/sigma2**3


# Analytic vs finite-difference Hessian of the FULL Normal log-likelihood at the MLE.
rng = np.random.default_rng(42)
n = 100; mu_true, s2_true = 5.0, 4.0
data = rng.normal(mu_true, np.sqrt(s2_true), n)
mu_hat = np.mean(data); s2_hat = np.var(data)

def normal_loglik(params):
    m, v = params
    if v <= 0:
        return -1e20
    return -n/2 * np.log(2*np.pi*v) - np.sum((data - m)**2) / (2*v)

H11, H12, H22 = -n / s2_hat, 0.0, -n / (2 * s2_hat**2)     # analytic at MLE
eps = 1e-5; th = np.array([mu_hat, s2_hat]); Hn = np.zeros((2, 2))
for i in range(2):
    for j in range(2):
        ei = np.zeros(2); ei[i] = eps; ej = np.zeros(2); ej[j] = eps
        Hn[i, j] = (normal_loglik(th+ei+ej) - normal_loglik(th+ei-ej)
                    - normal_loglik(th-ei+ej) + normal_loglik(th-ei-ej)) / (4*eps**2)
J = -np.array([[H11, H12], [H12, H22]]); cov = np.linalg.inv(J)
print(f'Normal Hessian (n={n}):  mu_hat={mu_hat:.4f}, sigma2_hat={s2_hat:.4f}')
print(f'  analytic  H = [[{H11:.4f}, {H12:.4f}], [{H12:.4f}, {H22:.4f}]]')
print(f'  numeric   H = [[{Hn[0,0]:.4f}, {Hn[0,1]:.4f}], [{Hn[1,0]:.4f}, {Hn[1,1]:.4f}]]')
print('  covariance (-H)^-1 vs theory:')
print(f'    Var(mu_hat)  = {cov[0,0]:.6f}  (theory sigma2/n   = {s2_hat/n:.6f})')
print(f'    Var(s2_hat)  = {cov[1,1]:.6f}  (theory 2sigma4/n  = {2*s2_hat**2/n:.6f})')
print(f'    Cov          = {cov[0,1]:.6f}  (theory 0 -> orthogonal parameters)')

Normal Hessian (n=100):  mu_hat=4.8995, sigma2_hat=2.3888
  analytic  H = [[-41.8624, 0.0000], [0.0000, -8.7623]]
  numeric   H = [[-41.8624, 0.0000], [0.0000, -8.7623]]
  covariance (-H)^-1 vs theory:
    Var(mu_hat)  = 0.023888  (theory sigma2/n   = 0.023888)
    Var(s2_hat)  = 0.114125  (theory 2sigma4/n  = 0.114125)
    Cov          = 0.000000  (theory 0 -> orthogonal parameters)


# SymPy: the chain rule collapses dl/deta to (y - mu) for the logistic link.
eta, y, m = sp.symbols('eta y mu')
mu_sig = 1/(1 + sp.exp(-eta))                      # mu = sigmoid(eta)
l_mu = y*sp.log(m) + (1 - y)*sp.log(1 - m)         # ell as a function of the mu symbol
dl_dmu = sp.simplify(sp.diff(l_mu, m))             # outer layer: (y-mu)/[mu(1-mu)]
dmu_deta = sp.simplify(sp.diff(mu_sig, eta))       # inner layer: mu(1-mu)
dl_deta = sp.diff(l_mu.subs(m, mu_sig), eta)       # composed score wrt eta
print('Chain rule for the logistic score, layer by layer:')
print('  dl/dmu   =', dl_dmu, '   (= (y-mu)/[mu(1-mu)])')
print('  dmu/deta =', dmu_deta, '   (= mu(1-mu), the sigmoid derivative)')
print('  dl/deta  =', sp.simplify(dl_deta), '   (chain rule: the mu(1-mu) factors cancel)')
print('  equals y - sigmoid(eta):', sp.simplify(dl_deta - (y - mu_sig)) == 0,
      '  ->  dl/deta = y - mu')

Chain rule for the logistic score, layer by layer:
  dl/dmu   = (mu - y)/(mu*(mu - 1))    (= (y-mu)/[mu(1-mu)])
  dmu/deta = 1/(4*cosh(eta/2)**2)    (= mu(1-mu), the sigmoid derivative)
  dl/deta  = (y + (y - 1)*exp(eta))/(exp(eta) + 1)    (chain rule: the mu(1-mu) factors cancel)
  equals y - sigmoid(eta): True   ->  dl/deta = y - mu


# Numeric: the assembled score X^T (y - mu) equals the finite-difference gradient.
rng = np.random.default_rng(42)
n, p = 200, 4
X = np.column_stack([np.ones(n), rng.normal(size=(n, p - 1))])
beta_true = np.array([0.3, -1.0, 0.7, 0.5])
prob = 1 / (1 + np.exp(-(X @ beta_true)))
y = (rng.uniform(size=n) < prob).astype(float)

def neg_loglik(b):
    z = X @ b
    return float(np.sum(np.log1p(np.exp(z)) - y * z))
def chain_rule_grad(b):
    mu = 1 / (1 + np.exp(-(X @ b)))
    return -X.T @ (y - mu)                    # gradient of the NEGATIVE log-lik

b0 = np.array([0.1, 0.2, -0.3, 0.4])
g_chain = chain_rule_grad(b0)
g_fd = approx_fprime(b0, neg_loglik, 1e-6)
print('GLM score via the multivariate chain rule  vs  finite differences (at a test beta):')
print(f"{'j':>3}  {'chain-rule X^T(mu-y)':>22}  {'finite-diff':>14}  {'|diff|':>10}")
print('-' * 56)
for j in range(p):
    print(f'{j:>3}  {g_chain[j]:>22.6f}  {g_fd[j]:>14.6f}  {abs(g_chain[j]-g_fd[j]):>10.2e}')
print(f'\nmax abs difference = {np.max(np.abs(g_chain - g_fd)):.2e}  '
      '(the chain rule reproduces the numerical gradient exactly)')

GLM score via the multivariate chain rule  vs  finite differences (at a test beta):
  j    chain-rule X^T(mu-y)     finite-diff      |diff|
--------------------------------------------------------
  0               -2.136106       -2.136083    2.36e-05
  1               54.386501       54.386525    2.37e-05
  2              -30.615793      -30.615771    2.18e-05
  3                2.332375        2.332395    2.00e-05

max abs difference = 2.37e-05  (the chain rule reproduces the numerical gradient exactly)


# Concave Normal log-lik (unique max) vs bimodal mixture log-lik (two local maxima).
rng = np.random.default_rng(42)
n = 50
data_normal = rng.normal(3.0, 1.0, n)
mu_hat = np.mean(data_normal)
print('Log-likelihood shape comparison')
print(f'  Normal model (n={n}): mu_hat = {mu_hat:.4f}')
print(f"    l''(mu) = -n/sigma^2 = {-n/1.0:.1f} < 0 for all mu  (strictly concave, unique max)")

n_mix = 50
mix = np.concatenate([rng.normal(-3, 0.5, n_mix//2), rng.normal(3, 0.5, n_mix//2)])
def mixture_ll(m):
    ld1 = stats.norm.logpdf(mix, -m, 0.5); ld2 = stats.norm.logpdf(mix, m, 0.5)
    mx = np.maximum(ld1, ld2)
    return np.sum(mx + np.log(0.5*np.exp(ld1-mx) + 0.5*np.exp(ld2-mx)))
mu_grid = np.linspace(-5, 5, 1000)
ll_vals = np.array([mixture_ll(m) for m in mu_grid])
loc = [(mu_grid[i], ll_vals[i]) for i in range(1, len(ll_vals)-1)
       if ll_vals[i] > ll_vals[i-1] and ll_vals[i] > ll_vals[i+1]]
print(f'  Mixture 0.5 N(-mu,0.25)+0.5 N(mu,0.25): {len(loc)} local maxima ->')
for m, ll in loc:
    print(f'    mu = {m:+.2f},  l(mu) = {ll:.2f}')
print('    NOT concave: Newton-Raphson converges to a different mode by initialization.')

Log-likelihood shape comparison
  Normal model (n=50): mu_hat = 3.0912
    l''(mu) = -n/sigma^2 = -50.0 < 0 for all mu  (strictly concave, unique max)
  Mixture 0.5 N(-mu,0.25)+0.5 N(mu,0.25): 2 local maxima ->
    mu = -2.97,  l(mu) = -61.17
    mu = +2.97,  l(mu) = -61.17
    NOT concave: Newton-Raphson converges to a different mode by initialization.


# Figure A.2: (a) concave Normal log-lik with two tangent lines lying ABOVE the curve;
#             (b) bimodal mixture log-lik with two equal maxima around +-3.
fig, (a1, a2) = plt.subplots(1, 2, figsize=(11, 4.2))
mg = np.linspace(1.5, 4.5, 300)
ll_norm = np.array([-0.5*np.sum((data_normal - m)**2) for m in mg])   # sigma^2=1, drop const
a1.plot(mg, ll_norm, color=BLUE, lw=2.4, label=r'$\ell(\mu)$ (Normal)')
for m0, col in [(2.4, RED), (3.8, GREEN)]:
    ll0 = -0.5*np.sum((data_normal - m0)**2)
    slope = np.sum(data_normal - m0)
    a1.plot(mg, ll0 + slope*(mg - m0), '--', color=col, lw=1.4)
    a1.plot([m0], [ll0], 'o', color=col, ms=6)
a1.axvline(mu_hat, color='k', ls=':', lw=1.1, label=fr'$\hat\mu={mu_hat:.2f}$')
a1.set_ylim(ll_norm.min()-2, ll_norm.max()+8)
a1.set_xlabel(r'$\mu$'); a1.set_ylabel('log-likelihood')
a1.set_title('Concave: tangents lie above the curve (unique max)'); a1.legend(fontsize=9)
a2.plot(mu_grid, ll_vals, color=PURPLE, lw=2.2, label='mixture log-likelihood')
for m, ll in loc:
    a2.plot([m], [ll], 'o', color=RED, ms=8)
    a2.annotate(fr'$\mu={m:+.1f}$', (m, ll), textcoords='offset points', xytext=(0, -16),
                ha='center', fontsize=9, color=RED)
a2.axvline(0, color='gray', ls=':', lw=1.1, label='valley (saddle) at 0')
a2.set_xlabel(r'$\mu$'); a2.set_ylabel('log-likelihood')
a2.set_title('Non-concave: two equal modes trap optimizers'); a2.legend(fontsize=9)
plt.tight_layout(); plt.show()


# Exact Normal log-lik vs its 2nd-order Taylor (quadratic) approximation at the MLE.
rng = np.random.default_rng(42)
n = 50; mu_true, s2_true = 5.0, 4.0
data = rng.normal(mu_true, np.sqrt(s2_true), n)
mu_hat = np.mean(data); s2_hat = np.var(data)
def normal_ll(m, v):
    return -n/2 * np.log(2*np.pi*v) - np.sum((data - m)**2) / (2*v)
ll_max = normal_ll(mu_hat, s2_hat)
J11 = n / s2_hat; J22 = n / (2 * s2_hat**2); J = np.array([[J11, 0], [0, J22]])
print(f'Multivariate Taylor: Normal(mu, sigma^2)  (n={n})')
print(f'  MLE: mu_hat={mu_hat:.4f}, sigma2_hat={s2_hat:.4f}')
print(f"{'(mu0, s20)':>16}  {'l exact':>10}  {'l quad':>10}  {'diff':>8}")
print('-' * 50)
for m0, v0 in [(mu_hat, s2_hat), (mu_hat+0.5, s2_hat), (mu_hat, s2_hat+1),
               (mu_hat+0.5, s2_hat+1), (mu_hat+1.0, s2_hat+2)]:
    le = normal_ll(m0, v0)
    d = np.array([m0 - mu_hat, v0 - s2_hat])
    lq = ll_max - 0.5 * d @ J @ d
    print(f'  ({m0:.2f}, {v0:.2f})  {le:>10.4f}  {lq:>10.4f}  {abs(le-lq):>8.4f}')

Multivariate Taylor: Normal(mu, sigma^2)  (n=50)
  MLE: mu_hat=5.1824, sigma2_hat=2.3138
      (mu0, s20)     l exact      l quad      diff
--------------------------------------------------
  (5.18, 2.31)    -91.9190    -91.9190    0.0000
  (5.68, 2.31)    -94.6202    -94.6202    0.0000
  (5.18, 3.31)    -93.3549    -94.2539    0.8990
  (5.68, 3.31)    -95.2410    -96.9551    1.7141
  (6.18, 4.31)   -101.6969   -112.0634   10.3665


# Figure A.3: contour plots of the exact log-lik (left) vs its quadratic Taylor (right).
mg = np.linspace(mu_hat - 1.6, mu_hat + 1.6, 160)
vg = np.linspace(max(0.4, s2_hat - 1.6), s2_hat + 2.6, 160)
MM, VV = np.meshgrid(mg, vg)
LL = np.empty_like(MM); LQ = np.empty_like(MM)
for i in range(MM.shape[0]):
    for j in range(MM.shape[1]):
        LL[i, j] = normal_ll(MM[i, j], VV[i, j])
        d = np.array([MM[i, j] - mu_hat, VV[i, j] - s2_hat])
        LQ[i, j] = ll_max - 0.5 * d @ J @ d
lev = np.linspace(ll_max - 12, ll_max - 0.2, 12)
fig, (a1, a2) = plt.subplots(1, 2, figsize=(11, 4.4), sharey=True)
for ax, Z, ttl in [(a1, LL, 'exact $\\ell(\\mu,\\sigma^2)$'),
                    (a2, LQ, '2nd-order Taylor (ellipses)')]:
    cf = ax.contourf(MM, VV, Z, levels=lev, cmap='viridis', extend='min')
    ax.contour(MM, VV, Z, levels=lev, colors='white', linewidths=0.4, alpha=0.6)
    ax.plot([mu_hat], [s2_hat], '*', color=RED, ms=16, label='MLE')
    ax.set_xlabel(r'$\mu$'); ax.set_title(ttl); ax.grid(False)
a1.set_ylabel(r'$\sigma^2$'); a1.legend(loc='upper right')
fig.colorbar(cf, ax=(a1, a2), shrink=0.85, label='log-likelihood')
plt.show()


# Jacobian of polar -> Cartesian: analytic det = r, vs central-difference det.
def transform(r, theta):
    return np.array([r*np.cos(theta), r*np.sin(theta)])
eps = 1e-7
print('Jacobian determinant: polar -> Cartesian')
print(f"{'r':>5}  {'theta':>8}  {'det J analytic':>15}  {'det J numeric':>15}  {'|diff|':>9}")
print('-' * 60)
for r, theta in [(1.0, 0.5), (2.0, 1.0), (0.5, np.pi/4), (3.0, np.pi/3)]:
    J = np.zeros((2, 2))
    J[:, 0] = (transform(r+eps, theta) - transform(r-eps, theta)) / (2*eps)
    J[:, 1] = (transform(r, theta+eps) - transform(r, theta-eps)) / (2*eps)
    det_num = np.linalg.det(J)
    print(f'{r:>5.1f}  {theta:>8.4f}  {r:>15.6f}  {det_num:>15.6f}  {abs(r-det_num):>9.2e}')
print('\ndet(J) = r for all (r, theta): area expands with radius -> element r dr dtheta.')

Jacobian determinant: polar -> Cartesian
    r     theta   det J analytic    det J numeric     |diff|
------------------------------------------------------------
  1.0    0.5000         1.000000         1.000000   5.03e-10
  2.0    1.0000         2.000000         2.000000   8.61e-12
  0.5    0.7854         0.500000         0.500000   3.97e-10
  3.0    1.0472         3.000000         3.000000   8.27e-09

det(J) = r for all (r, theta): area expands with radius -> element r dr dtheta.


# Figure: a uniform (r, theta) grid maps to Cartesian cells whose AREA grows with r.
fig, ax = plt.subplots(figsize=(5.6, 5.4))
rs = np.linspace(0.0, 3.0, 7)
ths = np.linspace(0, np.pi/2, 10)
for r in rs:
    tt = np.linspace(0, np.pi/2, 100)
    ax.plot(r*np.cos(tt), r*np.sin(tt), color=BLUE, lw=1.0, alpha=0.7)
for th in ths:
    ax.plot([0, 3*np.cos(th)], [0, 3*np.sin(th)], color=BLUE, lw=1.0, alpha=0.7)
# shade two equal-(dr,dtheta) cells at small vs large r to show area ~ r.
for (r0, col, lab) in [(0.5, ORANGE, 'small r: small area'), (2.5, RED, 'large r: large area')]:
    tt = np.linspace(np.pi/6, np.pi/6 + np.pi/9, 30)
    inner_r, outer_r = r0, r0 + 0.5
    xs = np.concatenate([inner_r*np.cos(tt), outer_r*np.cos(tt[::-1])])
    ys = np.concatenate([inner_r*np.sin(tt), outer_r*np.sin(tt[::-1])])
    ax.fill(xs, ys, color=col, alpha=0.65, label=lab)
ax.set_aspect('equal'); ax.set_xlim(-0.1, 3.1); ax.set_ylim(-0.1, 3.1); ax.grid(False)
ax.set_xlabel('x'); ax.set_ylabel('y')
ax.set_title(r'Polar grid: equal $dr\,d\theta$ cells, area $=r\,dr\,d\theta$')
ax.legend(loc='upper right', fontsize=9); plt.tight_layout(); plt.show()


# Solution 1.
rng = np.random.default_rng(seed=42)
n = 30
data = rng.binomial(1, 0.4, n); s = int(np.sum(data)); p_hat = s / n
print(f'n = {n}, s = {s}, p_hat = {p_hat:.4f}')
print('(a)  l\'(p)=s/p-(n-s)/(1-p),  l\'\'(p)=-s/p^2-(n-s)/(1-p)^2,  '
      'l\'\'\'(p)=2s/p^3-2(n-s)/(1-p)^3')

def loglik(p):
    if p <= 0 or p >= 1:
        return -1e20
    return s*np.log(p) + (n - s)*np.log(1 - p)
ll_hat = loglik(p_hat)
score_hat = s/p_hat - (n - s)/(1 - p_hat)
J_hat = s/p_hat**2 + (n - s)/(1 - p_hat)**2
print(f'(b)  l(p_hat)={ll_hat:.4f}, score={score_hat:.6f}, J(p_hat)={J_hat:.4f}')
in_range = [p for p in np.linspace(0.05, 0.85, 500)
            if abs(loglik(p) - (ll_hat - 0.5*J_hat*(p - p_hat)**2)) < 0.1]
print(f'     2nd-order within 0.1 of exact for p in [{min(in_range):.3f}, {max(in_range):.3f}]')

se = 1/np.sqrt(J_hat); z = 1.96
wald = (p_hat - z*se, p_hat + z*se)
wc = (s + z**2/2)/(n + z**2)
whw = (z/(n + z**2))*np.sqrt(s*(n - s)/n + z**2/4)
wilson = (wc - whw, wc + whw)
cp = (stats.beta.ppf(0.025, s, n - s + 1), stats.beta.ppf(0.975, s + 1, n - s))
print(f'(c)  Wald            [{wald[0]:.4f}, {wald[1]:.4f}]')
print(f'     Wilson          [{wilson[0]:.4f}, {wilson[1]:.4f}]')
print(f'     Clopper-Pearson [{cp[0]:.4f}, {cp[1]:.4f}]')

dp = 0.1; p_test = p_hat + dp
third = lambda p: 2*s/p**3 - 2*(n - s)/(1 - p)**3
worst_R2 = max(abs(third(c)/6 * dp**3) for c in np.linspace(p_hat, p_test, 100))
actual = abs(loglik(p_test) - (ll_hat - 0.5*J_hat*dp**2))
print(f'(d)  worst-case |R2| bound = {worst_R2:.6f},  actual error = {actual:.6f},  '
      f'conservative = {worst_R2 >= actual}')

n = 30, s = 16, p_hat = 0.5333
(a)  l'(p)=s/p-(n-s)/(1-p),  l''(p)=-s/p^2-(n-s)/(1-p)^2,  l'''(p)=2s/p^3-2(n-s)/(1-p)^3
(b)  l(p_hat)=-20.7277, score=0.000000, J(p_hat)=120.5357
     2nd-order within 0.1 of exact for p in [0.340, 0.680]
(c)  Wald            [0.3548, 0.7119]
     Wilson          [0.3614, 0.6977]
     Clopper-Pearson [0.3433, 0.7166]
(d)  worst-case |R2| bound = 0.073671,  actual error = 0.023986,  conservative = True


# Solution 2.
rng = np.random.default_rng(seed=42)
alpha0, beta0 = 3.0, 2.0; n_samples = 100000
print('(a)  U_alpha = log(beta) - psi(alpha) + log(x)   [needs digamma psi]')
print('     U_beta  = alpha/beta - x')
samples = rng.gamma(alpha0, 1/beta0, n_samples)
U_alpha = np.log(beta0) - digamma(alpha0) + np.log(samples)
U_beta = alpha0/beta0 - samples
print(f'(b)  mean U_alpha = {np.mean(U_alpha):+.6f}, mean U_beta = {np.mean(U_beta):+.6f}  (expect 0)')
I_analytic = np.array([[polygamma(1, alpha0), -1/beta0], [-1/beta0, alpha0/beta0**2]])
I_sim = np.cov(np.column_stack([U_alpha, U_beta]), rowvar=False)
print('(c)  Fisher information  -E[H]  (analytic):')
print(f'       [[{I_analytic[0,0]:.6f}, {I_analytic[0,1]:.6f}], '
      f'[{I_analytic[1,0]:.6f}, {I_analytic[1,1]:.6f}]]   [(1,1) uses trigamma psi\']')
print('     score covariance Cov(U) (simulated):')
print(f'       [[{I_sim[0,0]:.6f}, {I_sim[0,1]:.6f}], [{I_sim[1,0]:.6f}, {I_sim[1,1]:.6f}]]')
ratio = I_sim / I_analytic
print(f'     element-wise ratio ~ 1: [[{ratio[0,0]:.4f}, {ratio[0,1]:.4f}], '
      f'[{ratio[1,0]:.4f}, {ratio[1,1]:.4f}]]')

(a)  U_alpha = log(beta) - psi(alpha) + log(x)   [needs digamma psi]
     U_beta  = alpha/beta - x
(b)  mean U_alpha = -0.000841, mean U_beta = -0.000859  (expect 0)
(c)  Fisher information  -E[H]  (analytic):
       [[0.394934, -0.500000], [-0.500000, 0.750000]]   [(1,1) uses trigamma psi']
     score covariance Cov(U) (simulated):
       [[0.398013, -0.504284], [-0.504284, 0.757577]]
     element-wise ratio ~ 1: [[1.0078, 1.0086], [1.0086, 1.0101]]


# Solution 3.
rng = np.random.default_rng(seed=42)
print('(a)  inverse map X=UV, Y=U(1-V); J=[[V,U],[1-V,-U]]; det=-U; |det J|=U')
print('     f_{U,V}=e^{-u}*U = (u e^{-u}) * (1)  ->  U~Gamma(2,1) indep V~Uniform(0,1)')
n_sim = 100000
X = rng.exponential(1, n_sim); Y = rng.exponential(1, n_sim)
U = X + Y; V = X / (X + Y)
ks_U = stats.kstest(U, 'gamma', args=(2, 0, 1))
ks_V = stats.kstest(V, 'uniform')
print(f'(b)  U ~ Gamma(2,1):  KS={ks_U.statistic:.4f}, p={ks_U.pvalue:.4f}')
print(f'     V ~ Uniform(0,1): KS={ks_V.statistic:.4f}, p={ks_V.pvalue:.4f}')
print(f'     Corr(U,V) = {np.corrcoef(U, V)[0,1]:.6f}  (expect ~0; independence)')
a, b = 3.0, 2.0
Xa = rng.gamma(a, 1, n_sim); Yb = rng.gamma(b, 1, n_sim)
W = Xa / (Xa + Yb)                                  # SAME Xa in numerator and denominator
ks_W = stats.kstest(W, 'beta', args=(a, b))
print(f'(d)  X/(X+Y) ~ Beta({a:.0f},{b:.0f}):  KS={ks_W.statistic:.4f}, p={ks_W.pvalue:.4f}')
print(f'     mean={np.mean(W):.4f} (theory {a/(a+b):.4f}),  '
      f'var={np.var(W):.4f} (theory {a*b/((a+b)**2*(a+b+1)):.4f})')

(a)  inverse map X=UV, Y=U(1-V); J=[[V,U],[1-V,-U]]; det=-U; |det J|=U
     f_{U,V}=e^{-u}*U = (u e^{-u}) * (1)  ->  U~Gamma(2,1) indep V~Uniform(0,1)
(b)  U ~ Gamma(2,1):  KS=0.0020, p=0.7978
     V ~ Uniform(0,1): KS=0.0037, p=0.1223
     Corr(U,V) = 0.002447  (expect ~0; independence)
(d)  X/(X+Y) ~ Beta(3,2):  KS=0.0025, p=0.5639
     mean=0.6003 (theory 0.6000),  var=0.0398 (theory 0.0400)

§	Topic	Computational illustration
A.1	Univariate differentiation	Poisson score: analytic vs finite-difference vs SymPy; second-derivative test
A.2	Integration	$\mathbb{E}[X^2]$ for Gamma 3 ways; IBP $\mathbb{E}[X]=\alpha/\beta$; inverse-CDF; Cauchy has no mean
A.3	Taylor series	$\log(1+x)$ orders 1–4; delta method (variance-stabilizing $\sqrt{\bar X}$); quadratic log-lik
A.4	Leibniz rule	$\mathbb{E}[U(\theta)]=0$ (Normal); Fisher information equality (Exponential)
A.5	Gradients & Hessians	Normal score vector & Hessian (SymPy + numeric); GLM chain-rule score
A.6	Convexity	concave Normal log-lik (unique MLE) vs bimodal mixture (local traps)
A.7	Multivariate Taylor	quadratic approximation of $\ell(\mu,\sigma^2)$; the Wald statistic
A.8	Multiple integrals / Jacobian	polar coordinates $\det J = r$; $X/(X+Y)\sim\text{Beta}$

Calculus tool	This notebook	Where it returns in the course
Chain rule	GLM score $\mathbf X^\top(\mathbf y-\boldsymbol\mu)$ (A.5)	Ch 3.2 MLE score, Ch 3.5 GLM/IRLS
Logarithmic differentiation	Poisson score $\sum x_i/\lambda - n$ (A.1)	Ch 3.2 score, App E Cramér-Rao
Second-derivative test	$J=-\ell''$, observed information (A.1)	Ch 3.2 MLE verification
Integration by parts	$\mathbb{E}[X]=\alpha/\beta$ (A.2)	App D moments, Ch 2.4 MGFs
Change of variables	inverse-CDF Exp; polar $\det J=r$ (A.2, A.8)	Ch 2.4 inverse-CDF, Box-Muller
Taylor (1st order)	$\operatorname{Var}(\sqrt{\bar X})=1/4n$ (A.3)	App E delta method, Ch 4.3 bootstrap SE
Taylor (2nd order)	quadratic log-lik, Wald vs LR (A.3, A.7)	Ch 3.2 Newton/Wald, Ch 5.5 Laplace
Leibniz rule	$\mathbb{E}[U]=0$; $\operatorname{Var}(U)=-\mathbb{E}[\ell'']$ (A.4)	App E score & Fisher info
Gradient / Hessian	Normal score & Hessian, orthogonality (A.5)	Ch 3.2 multivariate MLE, Ch 3.5 IRLS
Jacobian determinant	$X/(X+Y)\sim\text{Beta}$ (A.8, Ex 3)	Ch 2.4 multivariate transforms
Convexity	concave Normal vs bimodal mixture (A.6)	Ch 3.1 exponential families

Appendix A — Calculus Review¶

Learning outcomes¶

Section map (all core — calculus the rest of the course assumes)¶

Section A.1 — Univariate Differentiation¶

Logarithmic differentiation and the score function¶

Higher derivatives and the second-derivative test¶

Section A.2 — Integration¶

Integration by parts: $\mathbb{E}[X]=\alpha/\beta$ for the Gamma¶

Change of variables (univariate): the inverse-CDF method¶

Improper integrals and convergence: the Cauchy has no mean¶

Section A.3 — Taylor Series and Approximation¶

First-order Taylor = the delta method¶

Second-order Taylor = the quadratic log-likelihood¶

Section A.4 — Leibniz Rule (Differentiation Under the Integral)¶

Fisher information equality¶

Section A.5 — Gradients, Hessians, and the Chain Rule¶

The multivariate chain rule: a GLM score vector¶

Section A.6 — Convexity and Optimization¶

Section A.7 — Multivariate Taylor Expansion¶

Section A.8 — Multiple Integrals and the Jacobian Determinant¶

Practice Problems¶

Summary & connections forward¶