import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import time
from statistics import mean, pvariance
from collections import Counter
from scipy import stats
from numpy.random import SeedSequence, default_rng
%matplotlib inline

np.random.seed(418)            # reproducibility (STYLE.md contract)
random.seed(418)
plt.rcParams.update({
    'figure.figsize': (7.2, 4.2), 'figure.dpi': 110,
    'axes.grid': True, 'grid.alpha': 0.25,
    'axes.spines.top': False, 'axes.spines.right': False,
    'font.size': 11,
})
BLUE, RED, GREEN, ORANGE, PURPLE = '#4C72B0', '#C44E52', '#55A868', '#DD8452', '#8172B3'

import scipy
print('environment ready · numpy', np.__version__, '· scipy', scipy.__version__,
      '· python', sys.version.split()[0])

environment ready · numpy 1.26.4 · scipy 1.15.3 · python 3.10.14


# Reproducibility: same seed -> identical sequence (the whole point of a PRNG).
random.seed(42)
run1 = [random.random() for _ in range(5)]
random.seed(42)                       # reset to the same seed
run2 = [random.random() for _ in range(5)]
random.seed(123)                      # a different seed
run3 = [random.random() for _ in range(5)]

print('Run 1 (seed 42):', [f'{x:.4f}' for x in run1])
print('Run 2 (seed 42):', [f'{x:.4f}' for x in run2])
print('  identical?    ', run1 == run2)
print('Run 3 (seed 123):', [f'{x:.4f}' for x in run3], '  <- different seed, different stream')

# State save / restore: pause and resume the EXACT same stream.
random.seed(42)
_ = [random.random() for _ in range(100)]   # advance the state
saved = random.getstate()                    # capture the full Mersenne-Twister state
batch1 = [random.random() for _ in range(5)]
random.setstate(saved)                       # rewind to the saved point
batch2 = [random.random() for _ in range(5)]
print('\nState save/restore -> batches match:', batch1 == batch2)

Run 1 (seed 42): ['0.6394', '0.0250', '0.2750', '0.2232', '0.7365']
Run 2 (seed 42): ['0.6394', '0.0250', '0.2750', '0.2232', '0.7365']
  identical?     True
Run 3 (seed 123): ['0.0524', '0.0872', '0.4072', '0.1077', '0.9012']   <- different seed, different stream

State save/restore -> batches match: True


# Figure: do PCG64 draws look uniform AND independent?
rng = np.random.default_rng(2718)
u = rng.random(5000)

# uniformity via a chi-square goodness-of-fit over 25 equal bins;
# independence via the lag-1 autocorrelation (should be ~0).
k = 25
counts, _ = np.histogram(u, bins=k, range=(0, 1))
expected = len(u) / k
chi2 = ((counts - expected) ** 2 / expected).sum()
chi2_crit = stats.chi2.ppf(0.95, k - 1)
lag1 = np.corrcoef(u[:-1], u[1:])[0, 1]
print(f'Uniformity  : chi-square = {chi2:.1f}  (5% critical = {chi2_crit:.1f}) -> '
      f'{"PASS" if chi2 < chi2_crit else "FAIL"}')
print(f'Independence: lag-1 autocorrelation = {lag1:+.4f}  (target ~0)')

fig, axes = plt.subplots(1, 2, figsize=(10.5, 3.8))
axes[0].hist(u, bins=k, range=(0, 1), color=BLUE, edgecolor='white')
axes[0].axhline(expected, color=RED, ls='--', lw=1.6, label=f'expected n/k = {expected:.0f}')
axes[0].set_xlabel('value'); axes[0].set_ylabel('count')
axes[0].set_title(f'Uniformity: 5000 PCG64 draws over {k} bins'); axes[0].legend()
axes[1].scatter(u[:-1], u[1:], s=5, alpha=0.18, color=GREEN)
axes[1].set_xlabel(r'$u_t$'); axes[1].set_ylabel(r'$u_{t+1}$')
axes[1].set_title(f'Independence: lag-1 plot (r = {lag1:+.4f})')
plt.tight_layout(); plt.show()

Uniformity  : chi-square = 24.7  (5% critical = 36.4) -> PASS
Independence: lag-1 autocorrelation = -0.0003  (target ~0)


random.seed(42)
n = 10000

# U(0,1)
u01 = [random.random() for _ in range(n)]
print(f'U(0,1)  n={n}:  mean = {mean(u01):.5f} (theory 0.5)   '
      f'var = {pvariance(u01):.5f} (theory 0.08333)')

# U(a,b)
a, b = 5, 15
uab = [random.uniform(a, b) for _ in range(n)]
print(f'U({a},{b}):       mean = {mean(uab):.4f} (theory {(a+b)/2})        '
      f'var = {pvariance(uab):.4f} (theory {(b-a)**2/12:.4f})')

# discrete uniform (a die)
dice = [random.randint(1, 6) for _ in range(n)]
print(f'randint(1,6):  mean = {mean(dice):.4f} (theory 3.5)   '
      f'counts = {dict(sorted(Counter(dice).items()))}')

U(0,1)  n=10000:  mean = 0.50025 (theory 0.5)   var = 0.08276 (theory 0.08333)
U(5,15):       mean = 10.0248 (theory 10.0)        var = 8.4013 (theory 8.3333)
randint(1,6):  mean = 3.5003 (theory 3.5)   counts = {1: 1647, 2: 1679, 3: 1646, 4: 1722, 5: 1664, 6: 1642}


random.seed(42)

# choice: a single random element
volunteers = ['Alice', 'Bob', 'Carol', 'Deepak', 'Eve']
print("today's volunteer:", random.choice(volunteers))

# choices: weighted sampling WITH replacement (a loaded die)
faces = [1, 2, 3, 4, 5, 6]
weights = [0.05, 0.10, 0.15, 0.20, 0.25, 0.25]   # biased toward high faces
rolls = random.choices(faces, weights=weights, k=10000)
print('\nloaded die — empirical vs target probability:')
for face, target in zip(faces, weights):
    print(f'  face {face}: {rolls.count(face)/len(rolls):.3f}  (target {target:.2f})')

# sample: WITHOUT replacement (a 5-card poker hand)
deck = [f'{r}{s}' for s in 'SHDC' for r in 'A23456789TJQK']
print('\npoker hand (sample, no replacement):', random.sample(deck, 5))

# shuffle: in place -> works only on MUTABLE sequences (lists)
items = list(range(10))
random.shuffle(items)
print('shuffled list:', items)
text = 'HELLO'                              # strings are immutable: convert first
letters = list(text); random.shuffle(letters)
print('shuffled string:', ''.join(letters))

today's volunteer: Alice

loaded die — empirical vs target probability:
  face 1: 0.046  (target 0.05)
  face 2: 0.106  (target 0.10)
  face 3: 0.144  (target 0.15)
  face 4: 0.203  (target 0.20)
  face 5: 0.255  (target 0.25)
  face 6: 0.246  (target 0.25)

poker hand (sample, no replacement): ['AC', '6C', 'TC', 'KC', '3S']
shuffled list: [3, 8, 2, 0, 7, 9, 4, 6, 1, 5]
shuffled string: OHLLE


random.seed(42)
n = 10000

# (name, sampler, theory mean, theory variance)
specs = [
    ('Normal(0,1)',       lambda: random.gauss(0, 1),         0.0,        1.0),
    ('Exponential(lam=1)', lambda: random.expovariate(1),      1.0,        1.0),
    ('Gamma(a=2, b=3)',   lambda: random.gammavariate(2, 3),  2*3,        2*3**2),
    ('Beta(2,5)',         lambda: random.betavariate(2, 5),   2/7,        (2*5)/(49*8)),
]
rows = []
for name, samp, mu_th, var_th in specs:
    xs = [samp() for _ in range(n)]
    rows.append([name, mean(xs), mu_th, pvariance(xs), var_th])
tab = pd.DataFrame(rows, columns=['distribution', 'sample_mu', 'theory_mu',
                                  'sample_var', 'theory_var'])
print(tab.round(4).to_string(index=False))
print('\nSample moments track theory -> the parameterizations are correct.')
print('Reminder: expovariate uses RATE (mean = 1/rate); gammavariate uses SCALE.')

      distribution  sample_mu  theory_mu  sample_var  theory_var
       Normal(0,1)    -0.0117     0.0000      0.9991      1.0000
Exponential(lam=1)     1.0057     1.0000      0.9780      1.0000
   Gamma(a=2, b=3)     5.9748     6.0000     17.4618     18.0000
         Beta(2,5)     0.2848     0.2857      0.0253      0.0255

Sample moments track theory -> the parameterizations are correct.
Reminder: expovariate uses RATE (mean = 1/rate); gammavariate uses SCALE.


# The modern Generator API: create an explicit, seeded generator.
rng = np.random.default_rng(42)

uniforms = rng.random(10)            # 10 U(0,1)
normals = rng.standard_normal(10)    # 10 N(0,1)
integers = rng.integers(1, 7, 10)    # 10 dice rolls in {1,...,6}  (high EXCLUSIVE)
print('uniforms:', uniforms[:5].round(4))
print('normals :', normals[:5].round(4))
print('integers:', integers)
print('\nWhy explicit generators: no hidden global state, pass rng to functions,')
print('independent streams per worker, and PCG64 beats Mersenne Twister.')

uniforms: [0.774  0.4389 0.8586 0.6974 0.0942]
normals : [0.8794 0.7778 0.066  1.1272 0.4675]
integers: [1 5 5 3 1 6 3 6 5 5]

Why explicit generators: no hidden global state, pass rng to functions,
independent streams per worker, and PCG64 beats Mersenne Twister.


# Performance: a Python loop vs one vectorized NumPy call (1,000,000 draws).
N = 1_000_000

random.seed(42)
t = time.perf_counter()
_py = [random.random() for _ in range(N)]
py_time = time.perf_counter() - t

rng = np.random.default_rng(42)
t = time.perf_counter()
_np = rng.random(N)
np_time = time.perf_counter() - t

speedup = py_time / np_time
print(f'Python loop : {py_time:.4f} s')
print(f'NumPy call  : {np_time:.4f} s')
print(f'Speedup     : {speedup:.1f}x  (typical 5-10x for simple draws; more for heavier ones)')

fig, ax = plt.subplots(figsize=(6.0, 3.6))
bars = ax.bar(['Python\nloop', 'NumPy\nGenerator'], [py_time, np_time],
              color=[RED, GREEN], edgecolor='white', width=0.6)
for b, v in zip(bars, [py_time, np_time]):
    ax.text(b.get_x()+b.get_width()/2, v, f'{v*1000:.1f} ms', ha='center', va='bottom', fontsize=10)
ax.set_ylabel('time for 1,000,000 draws (s)')
ax.set_title(f'Vectorization wins: NumPy ~{speedup:.0f}x faster')
plt.tight_layout(); plt.show()

Python loop : 0.0660 s
NumPy call  : 0.0059 s
Speedup     : 11.2x  (typical 5-10x for simple draws; more for heavier ones)


rng = np.random.default_rng(42)
n = 10000

normal = rng.normal(loc=100, scale=15, size=n)      # N(100, 15^2)
exponential = rng.exponential(scale=2, size=n)       # Exp, mean = 2
gamma = rng.gamma(shape=2, scale=3, size=n)          # Gamma(k=2, theta=3), mean 6
beta = rng.beta(a=2, b=5, size=n)                    # Beta(2,5), mean 2/7
binomial = rng.binomial(n=20, p=0.3, size=n)         # Binom(20, 0.3), mean 6
poisson = rng.poisson(lam=5, size=n)                 # Poisson(5), mean=var=5

print(f'N(100,15^2): mean={normal.mean():.2f}  std={normal.std():.2f}  (theory 100, 15)')
print(f'Exp(scale=2): mean={exponential.mean():.3f}  (theory 2)')
print(f'Gamma(2,3):  mean={gamma.mean():.3f}  (theory 6)')
print(f'Beta(2,5):   mean={beta.mean():.4f}  (theory {2/7:.4f})')
print(f'Binom(20,.3):mean={binomial.mean():.3f}  (theory 6)')
print(f'Poisson(5):  mean={poisson.mean():.3f}  var={poisson.var():.3f}  (theory 5, 5)')

N(100,15^2): mean=99.85  std=15.09  (theory 100, 15)
Exp(scale=2): mean=2.010  (theory 2)
Gamma(2,3):  mean=6.034  (theory 6)
Beta(2,5):   mean=0.2851  (theory 0.2857)
Binom(20,.3):mean=5.987  (theory 6)
Poisson(5):  mean=4.993  var=5.001  (theory 5, 5)


# Figure: six-distribution gallery, histogram + theoretical density / pmf overlay.
fig, axes = plt.subplots(2, 3, figsize=(12, 6.4))

axes[0,0].hist(normal, bins=40, density=True, color=BLUE, alpha=0.6, edgecolor='white')
xx = np.linspace(normal.min(), normal.max(), 200)
axes[0,0].plot(xx, stats.norm.pdf(xx, 100, 15), color='k', lw=2)
axes[0,0].set_title('Normal(100, 15)')

axes[0,1].hist(exponential, bins=40, density=True, color=ORANGE, alpha=0.6, edgecolor='white')
xx = np.linspace(0, exponential.max(), 200)
axes[0,1].plot(xx, stats.expon.pdf(xx, scale=2), color='k', lw=2)
axes[0,1].set_title('Exponential(scale=2)')

axes[0,2].hist(gamma, bins=40, density=True, color=GREEN, alpha=0.6, edgecolor='white')
xx = np.linspace(0, gamma.max(), 200)
axes[0,2].plot(xx, stats.gamma.pdf(xx, a=2, scale=3), color='k', lw=2)
axes[0,2].set_title('Gamma(2, 3)')

axes[1,0].hist(beta, bins=40, density=True, color=PURPLE, alpha=0.6, edgecolor='white')
xx = np.linspace(0, 1, 200)
axes[1,0].plot(xx, stats.beta.pdf(xx, 2, 5), color='k', lw=2)
axes[1,0].set_title('Beta(2, 5)')

kk = np.arange(0, binomial.max()+1)
axes[1,1].hist(binomial, bins=np.arange(-0.5, binomial.max()+1.5), density=True,
               color=RED, alpha=0.6, edgecolor='white')
axes[1,1].plot(kk, stats.binom.pmf(kk, 20, 0.3), 'ko-', lw=1.5, ms=4)
axes[1,1].set_title('Binomial(20, 0.3)')

kk = np.arange(0, poisson.max()+1)
axes[1,2].hist(poisson, bins=np.arange(-0.5, poisson.max()+1.5), density=True,
               color='#937860', alpha=0.6, edgecolor='white')
axes[1,2].plot(kk, stats.poisson.pmf(kk, 5), 'ko-', lw=1.5, ms=4)
axes[1,2].set_title('Poisson(5)')

for ax in axes.ravel():
    ax.set_ylabel('density')
fig.suptitle('Sampling from common distributions (10,000 draws each, theory overlaid)', y=1.02)
plt.tight_layout(); plt.show()


rng = np.random.default_rng(42)
rhos = [-0.8, 0.0, 0.8]
fig, axes = plt.subplots(1, 3, figsize=(12, 3.9))
for ax, rho in zip(axes, rhos):
    cov = [[1.0, rho], [rho, 1.0]]
    s = rng.multivariate_normal([0, 0], cov, size=4000)
    emp = np.corrcoef(s.T)[0, 1]
    ax.scatter(s[:, 0], s[:, 1], s=6, alpha=0.18, color=BLUE)
    ax.set_title(f'rho = {rho:+.1f}   (sample r = {emp:+.3f})')
    ax.set_xlabel('x1'); ax.set_ylabel('x2')
    ax.set_xlim(-4, 4); ax.set_ylim(-4, 4)
fig.suptitle('Bivariate normal: correlation tilts the cloud', y=1.03)
plt.tight_layout(); plt.show()

# Dirichlet: each row is a probability vector (sums to 1).
alpha = [2, 5, 3]
ds = rng.dirichlet(alpha, size=5)
print('Dirichlet(alpha=[2,5,3]) draws (each row sums to 1):')
for i, row in enumerate(ds):
    print(f'  {i+1}: [{row[0]:.3f}, {row[1]:.3f}, {row[2]:.3f}]  sum={row.sum():.6f}')
print('theory mean = alpha/sum(alpha) =', (np.array(alpha)/sum(alpha)).round(3))

Dirichlet(alpha=[2,5,3]) draws (each row sums to 1):
  1: [0.158, 0.510, 0.332]  sum=1.000000
  2: [0.171, 0.237, 0.592]  sum=1.000000
  3: [0.146, 0.496, 0.359]  sum=1.000000
  4: [0.118, 0.132, 0.750]  sum=1.000000
  5: [0.252, 0.643, 0.105]  sum=1.000000
theory mean = alpha/sum(alpha) = [0.2 0.5 0.3]


rng = np.random.default_rng(42)
data = np.array([10, 20, 30, 40, 50])

boot = rng.choice(data, size=10, replace=True)              # WITH replacement (bootstrap)
subset = rng.choice(data, size=3, replace=False)            # WITHOUT replacement
weighted = rng.choice(data, size=10, replace=True, p=[0.1, 0.1, 0.1, 0.3, 0.4])
print('bootstrap (with replacement):', boot)
print('subset    (no replacement)  :', subset)
print('weighted  (favors 40, 50)   :', weighted)

X = np.arange(15).reshape(5, 3)
boot_rows = rng.choice(X, size=5, replace=True, axis=0)     # bootstrap whole rows
print('\noriginal rows:\n', X)
print('bootstrap rows:\n', boot_rows)

# Figure: what "with replacement" means -- duplicates appear, and a bootstrap of the
# mean reconstructs the sampling distribution.
rng = np.random.default_rng(7)
pop = np.array([10, 20, 30, 40, 50])
one_draw = rng.choice(pop, size=len(pop), replace=True)
cnt = Counter(one_draw)
boot_means = [rng.choice(pop, size=len(pop), replace=True).mean() for _ in range(5000)]

fig, axes = plt.subplots(1, 2, figsize=(11, 3.8))
axes[0].bar([str(v) for v in pop], [cnt.get(v, 0) for v in pop], color=BLUE, edgecolor='white')
axes[0].set_title(f'One bootstrap draw of {pop.tolist()}\n(some values repeat, some vanish)')
axes[0].set_xlabel('value'); axes[0].set_ylabel('times selected')
axes[1].hist(boot_means, bins=30, color=GREEN, edgecolor='white')
axes[1].axvline(pop.mean(), color=RED, lw=2, label=f'population mean = {pop.mean():.0f}')
axes[1].set_title('5000 bootstrap means'); axes[1].set_xlabel('resample mean'); axes[1].legend()
plt.tight_layout(); plt.show()

bootstrap (with replacement): [10 40 40 30 30 50 10 40 20 10]
subset    (no replacement)  : [20 40 50]
weighted  (favors 40, 50)   : [20 40 40 50 50 50 40 30 40 10]

original rows:
 [[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]
 [12 13 14]]
bootstrap rows:
 [[ 9 10 11]
 [12 13 14]
 [12 13 14]
 [ 3  4  5]
 [ 9 10 11]]


def mc_pi(child_seed, n_points=1_000_000):
    '''Estimate pi from one worker's independent PCG64 stream.'''
    rng = default_rng(child_seed)
    x, y = rng.random(n_points), rng.random(n_points)
    return 4 * np.sum(x**2 + y**2 <= 1) / n_points

master = SeedSequence(42)
child_seeds = master.spawn(4)              # 4 provably-independent child seeds
estimates = [mc_pi(s) for s in child_seeds]
print('pi from 4 independent workers:', [f'{e:.6f}' for e in estimates])
print(f'combined mean = {np.mean(estimates):.6f}   true pi = {np.pi:.6f}   '
      f'error = {abs(np.mean(estimates)-np.pi):.2e}')
print('\nNEVER share one Generator across threads/processes: that gives race conditions,')
print('correlated streams, and non-reproducible results. Spawn a child per worker.')

pi from 4 independent workers: ['3.142308', '3.141240', '3.141972', '3.139968']
combined mean = 3.141372   true pi = 3.141593   error = 2.21e-04

NEVER share one Generator across threads/processes: that gives race conditions,
correlated streams, and non-reproducible results. Spawn a child per worker.


# The frozen-distribution pattern: IQ ~ N(100, 15).
iq = stats.norm(loc=100, scale=15)
print(f'mean = {iq.mean()}   var = {iq.var()}   median = {iq.median()}')
print(f'P(IQ <= 130) = cdf(130) = {iq.cdf(130):.4f}')
print(f'P(IQ >  130) = sf(130)  = {iq.sf(130):.4f}')
print(f'90th percentile = ppf(0.90) = {iq.ppf(0.90):.2f}')
print(f'5 samples = {iq.rvs(size=5, random_state=42).round(1)}')

mean = 100.0   var = 225.0   median = 100.0
P(IQ <= 130) = cdf(130) = 0.9772
P(IQ >  130) = sf(130)  = 0.0228
90th percentile = ppf(0.90) = 119.22
5 samples = [107.5  97.9 109.7 122.8  96.5]


# Figure: the unified interface on one Gamma(a=2, scale=2) -- PDF, CDF, PPF, RVS.
g = stats.gamma(a=2, scale=2)
xx = np.linspace(0, 16, 300)
qq = np.linspace(0.001, 0.999, 300)
samp = g.rvs(size=5000, random_state=0)

fig, axes = plt.subplots(2, 2, figsize=(11, 6.6))
axes[0,0].plot(xx, g.pdf(xx), color=BLUE, lw=2.2); axes[0,0].set_title('(A) pdf  f(x)')
axes[0,0].fill_between(xx, g.pdf(xx), alpha=0.15, color=BLUE)
axes[0,1].plot(xx, g.cdf(xx), color=GREEN, lw=2.2); axes[0,1].set_title('(B) cdf  F(x) = P(X<=x)')
axes[1,0].plot(qq, g.ppf(qq), color=ORANGE, lw=2.2); axes[1,0].set_title('(C) ppf  F^{-1}(q) (inverts the CDF)')
axes[1,0].set_xlabel('probability q')
axes[1,1].hist(samp, bins=40, density=True, color=PURPLE, alpha=0.55, edgecolor='white')
axes[1,1].plot(xx, g.pdf(xx), color='k', lw=2); axes[1,1].set_title('(D) rvs: 5000 samples vs pdf')
fig.suptitle('SciPy distribution methods on one Gamma(2, 2) object', y=1.02)
plt.tight_layout(); plt.show()


want_mean = 2.0
correct = stats.expon(scale=2)          # scale = mean -> right
wrong = stats.expon(scale=0.5)          # this has mean 0.5 -> a classic bug
print(f'stats.expon(scale=2): sample mean = {correct.rvs(10000, random_state=42).mean():.3f}  '
      f'(want {want_mean}) -> correct')
print(f'stats.expon(scale=0.5): sample mean = {wrong.rvs(10000, random_state=42).mean():.3f}  '
      f'(want {want_mean}) -> WRONG (scale != rate)')

ref = pd.DataFrame([
    ['Normal',      'stats.norm(loc=mu, scale=sigma)', 'scale is sigma, NOT variance'],
    ['Exponential', 'stats.expon(scale=beta)',         'mean = scale = 1/rate'],
    ['Gamma',       'stats.gamma(a=k, scale=theta)',   'a is shape; mean = k*theta'],
    ['Poisson',     'stats.poisson(mu=lam)',           'mu is the rate lambda'],
    ['NegBinomial', 'stats.nbinom(n, p)',              'counts FAILURES before n successes'],
], columns=['distribution', 'SciPy call', 'gotcha'])
print('\n' + ref.to_string(index=False))

stats.expon(scale=2): sample mean = 1.955  (want 2.0) -> correct
stats.expon(scale=0.5): sample mean = 0.489  (want 2.0) -> WRONG (scale != rate)

distribution                      SciPy call                             gotcha
      Normal stats.norm(loc=mu, scale=sigma)       scale is sigma, NOT variance
 Exponential         stats.expon(scale=beta)              mean = scale = 1/rate
       Gamma   stats.gamma(a=k, scale=theta)         a is shape; mean = k*theta
     Poisson           stats.poisson(mu=lam)              mu is the rate lambda
 NegBinomial              stats.nbinom(n, p) counts FAILURES before n successes


rng = np.random.default_rng(42)
true = stats.gamma(a=2, scale=2)
data = true.rvs(1000, random_state=rng)

a_hat, loc_hat, scale_hat = stats.gamma.fit(data, floc=0)   # fix location at 0
print(f'fitted: shape = {a_hat:.3f}, scale = {scale_hat:.3f}   (true: shape 2.000, scale 2.000)')

fitted = stats.gamma(a=a_hat, loc=loc_hat, scale=scale_hat)
D, p = stats.kstest(data, fitted.cdf)
print(f'KS test: D = {D:.4f}, p = {p:.4f}  -> no evidence of misfit')
print(f'95th percentile: fitted = {fitted.ppf(0.95):.2f}, empirical = {np.percentile(data, 95):.2f}')
print('\nCaveat: parameters were estimated from the SAME data being tested, so the plain')
print('KS p-value is optimistic (the Lilliefors problem); use a parametric bootstrap for a')
print('calibrated test.')

fitted: shape = 2.002, scale = 1.966   (true: shape 2.000, scale 2.000)
KS test: D = 0.0170, p = 0.9289  -> no evidence of misfit
95th percentile: fitted = 9.33, empirical = 9.43

Caveat: parameters were estimated from the SAME data being tested, so the plain
KS p-value is optimistic (the Lilliefors problem); use a parametric bootstrap for a
calibrated test.


# Combine: SciPy for definition/analysis, NumPy for fast sampling.
dist = stats.gamma(a=2, scale=3)                 # SciPy: theory side
theory_mean = dist.mean()
crit_95 = dist.ppf(0.95)

rng = np.random.default_rng(42)                  # NumPy: fast sampling side
samples = rng.gamma(shape=2, scale=3, size=100_000)

print(f'theory mean (SciPy)   = {theory_mean}')
print(f'sample mean (NumPy)   = {samples.mean():.4f}')
print(f'95th pct: SciPy ppf   = {crit_95:.3f}   NumPy empirical = {np.percentile(samples, 95):.3f}')
print('-> same model, two libraries, agreeing to sampling precision.')

theory mean (SciPy)   = 6.0
sample mean (NumPy)   = 6.0044
95th pct: SciPy ppf   = 14.232   NumPy empirical = 14.249
-> same model, two libraries, agreeing to sampling precision.


# (a) Mixture sampler -- vectorized with np.where.
def sample_mixture(rng, size):
    '''Sample from 0.3*N(-2,1) + 0.7*N(3,0.5).  rng is a numpy Generator.'''
    component1 = rng.random(size) < 0.3        # True -> component 1 N(-2,1)
    return np.where(component1,
                    rng.normal(-2, 1, size),
                    rng.normal(3, 0.5, size))

rng_test = np.random.default_rng(42)
test = sample_mixture(rng_test, 10000)
print(f'10,000-sample check: mean = {test.mean():.4f} (theory 1.5000), '
      f'std = {test.std():.4f} (theory 2.3927)')

10,000-sample check: mean = 1.4935 (theory 1.5000), std = 2.3977 (theory 2.3927)


# (b) Independent streams via SeedSequence.spawn(); verify near-zero cross-correlation.
parent_seed = 12345
ss = SeedSequence(parent_seed)
child_seeds = ss.spawn(4)
rngs = [default_rng(s) for s in child_seeds]

n_per_worker = 25000
worker_samples = [sample_mixture(r, n_per_worker) for r in rngs]

corr = np.corrcoef(worker_samples)
print(f'parent seed {parent_seed}, 4 workers x {n_per_worker} samples')
print('cross-worker correlation matrix:')
print(np.array2string(corr, precision=4, suppress_small=True))
print(f'expected noise SD ~ 1/sqrt(n) = {1/np.sqrt(n_per_worker):.4f} -> off-diagonals are pure noise.')

parent seed 12345, 4 workers x 25000 samples
cross-worker correlation matrix:
[[ 1.     -0.0046 -0.0067 -0.0105]
 [-0.0046  1.      0.0005 -0.0021]
 [-0.0067  0.0005  1.      0.0051]
 [-0.0105 -0.0021  0.0051  1.    ]]
expected noise SD ~ 1/sqrt(n) = 0.0063 -> off-diagonals are pure noise.


# (c) Estimate P(X > 0) with a Monte-Carlo standard error.
all_samples = np.concatenate(worker_samples)
n_total = len(all_samples)
p_hat = np.mean(all_samples > 0)
se = np.sqrt(p_hat * (1 - p_hat) / n_total)
print(f'total samples : {n_total:,}')
print(f'P(X > 0) est. : {p_hat:.4f}   MC SE = {se:.6f}')
print(f'95% CI        : ({p_hat - 1.96*se:.4f}, {p_hat + 1.96*se:.4f})')

def mixture_cdf(x):
    return 0.3 * stats.norm.cdf(x, -2, 1) + 0.7 * stats.norm.cdf(x, 3, 0.5)
p_theory = 1 - mixture_cdf(0)
print(f'theoretical   : {p_theory:.4f}   |est - theory| = {abs(p_hat - p_theory):.4f} '
      f'(~{abs(p_hat-p_theory)/se:.1f} SE)')

# Figure: the mixture density (P(X>0) shaded) and the 100,000-sample histogram.
xx = np.linspace(-6, 6, 600)
dens = 0.3 * stats.norm.pdf(xx, -2, 1) + 0.7 * stats.norm.pdf(xx, 3, 0.5)
fig, axes = plt.subplots(1, 2, figsize=(11.5, 4.0))
axes[0].plot(xx, dens, color='k', lw=2, label='mixture density')
axes[0].plot(xx, 0.3*stats.norm.pdf(xx,-2,1), '--', color=BLUE, lw=1.4, label='0.3 N(-2,1)')
axes[0].plot(xx, 0.7*stats.norm.pdf(xx,3,0.5), '--', color=ORANGE, lw=1.4, label='0.7 N(3,0.5)')
axes[0].fill_between(xx, dens, where=(xx > 0), color=GREEN, alpha=0.3, label=f'P(X>0)={p_theory:.3f}')
axes[0].set_title('True mixture density'); axes[0].legend(fontsize=8)
axes[1].hist(all_samples, bins=80, density=True, color=PURPLE, alpha=0.55, edgecolor='white')
axes[1].plot(xx, dens, color='k', lw=2)
axes[1].set_title(f'{n_total:,} samples from 4 parallel workers')
for ax in axes: ax.set_xlabel('x'); ax.set_ylabel('density')
plt.tight_layout(); plt.show()

total samples : 100,000
P(X > 0) est. : 0.7107   MC SE = 0.001434
95% CI        : (0.7079, 0.7135)
theoretical   : 0.7068   |est - theory| = 0.0038 (~2.7 SE)


# (d) Reproducibility: same parent seed -> bit-identical results.
ss_replay = SeedSequence(parent_seed)
rngs_replay = [default_rng(s) for s in ss_replay.spawn(4)]
worker_replay = [sample_mixture(r, n_per_worker) for r in rngs_replay]
identical = all(np.array_equal(a, b) for a, b in zip(worker_samples, worker_replay))
print('results identical on replay:', identical)
print('worker 0 first 5 (run 1):', worker_samples[0][:5].round(6))
print('worker 0 first 5 (run 2):', worker_replay[0][:5].round(6))

results identical on replay: True
worker 0 first 5 (run 1): [ 2.888635  3.413628 -2.454651 -1.194422  3.103237]
worker 0 first 5 (run 2): [ 2.888635  3.413628 -2.454651 -1.194422  3.103237]


# (e) Why a SHARED generator breaks reproducibility: the result depends on the order
# in which workers happen to draw -- a race condition in real parallel code.
shared = np.random.default_rng(42)
run1_w0 = sample_mixture(shared, 100)   # worker 0 draws first ...
_       = sample_mixture(shared, 100)   # ... then worker 1

shared = np.random.default_rng(42)      # same seed, different scheduling
_       = sample_mixture(shared, 100)   # worker 1 draws first ...
run2_w0 = sample_mixture(shared, 100)   # ... then worker 0

print(f'shared generator, worker 0 mean when it draws FIRST : {run1_w0.mean():+.4f}')
print(f'shared generator, worker 0 mean when it draws SECOND: {run2_w0.mean():+.4f}')
print(f'difference = {abs(run1_w0.mean() - run2_w0.mean()):.4f}  -> NOT reproducible')
print('\nFix: SeedSequence.spawn() gives every worker its own deterministic stream,')
print('independent of execution order (parts b-d).')

shared generator, worker 0 mean when it draws FIRST : +1.4742
shared generator, worker 0 mean when it draws SECOND: +1.7813
difference = 0.3072  -> NOT reproducible

Fix: SeedSequence.spawn() gives every worker its own deterministic stream,
independent of execution order (parts b-d).

§	Topic	What you run
F.1	The Python ecosystem at a glance	`random` vs NumPy vs SciPy decision table
F.2	Understanding pseudo-randomness	reproducibility demo · Mersenne Twister vs PCG64 · uniformity + independence figure
F.3	The standard library `random`	basic draws · sequence ops · distribution generators · seed/state
F.4	NumPy: fast vectorized sampling	`Generator` API · performance figure · distribution gallery · multivariate · sampling utilities · parallel streams
F.5	SciPy stats: the complete toolkit	frozen distributions · 4-panel methods figure · parameterization · Gamma fit + KS test
F.6	Library selection guide	when to use which (and how to combine them)
F.7	Practice problem	reproducibility & parallel simulation (full worked solution)

Category	Library	Why you might pick it	Best for
Standard library	`random`	Always available; quick scalar draws; simple shuffles; 11 classic distributions	Teaching, prototyping, dependency-free scripts
Scientific computing	`numpy.random`	Vectorized C core; ~5–10× faster for simple draws; array output; 35+ distributions	Monte Carlo, bootstrap, large-scale simulation
Statistical modeling	`scipy.stats`	100+ distributions; PDF/CDF/PPF/fitting; statistical tests	Distribution fitting, hypothesis testing
Probabilistic programming	PyMC, Stan	High-level model syntax; MCMC & VI engines	Complex Bayesian models (Chapter 5)

	Python `random` (Mersenne Twister, MT19937)	NumPy `Generator` (PCG64)
Period	$2^{19937}-1 \approx 10^{6001}$	$2^{128}$
State size	624 × 32-bit ints (≈19,968 bits)	256 bits
Statistical quality	fails 2 linear-complexity tests of BigCrush	passes BigCrush
Parallel streams	manual	`SeedSequence.spawn()` (jumpable)
Crypto-secure?	no (use `secrets` for that)	no

Function	Output	Notes
`random.random()`	`float`	Uniform on $[0,1)$; the fundamental building block
`random.uniform(a, b)`	`float`	Uniform on $[a,b]$; implemented as `a + (b-a)*random()`
`random.randint(a, b)`	`int`	Discrete uniform on $\{a,\dots,b\}$ — inclusive both ends
`random.randrange(start, stop, step)`	`int`	Like `range()` but a random element — half-open

Function	Returns	Replacement?	Weights?	When to use
`choice(seq)`	one element	—	no	pick a single winner / coin-flip a label
`choices(seq, k=, weights=)`	list of `k`	with	yes	bootstrap, Monte-Carlo from a categorical
`sample(seq, k)`	`k` unique	without	no	deal cards, a CV hold-out subset
`shuffle(seq)`	`None` (in place)	—	—	permute a deck / treatment order

Appendix F — Python Random Generation¶

Learning outcomes¶

Section map¶

F.1 — The Python ecosystem at a glance¶

F.2 — Understanding pseudo-random number generation¶

F.3 — The standard library: the `random` module¶

Random operations on sequences¶

Distribution generators (and the parameterization trap)¶

F.4 — NumPy: fast vectorized random sampling¶

Univariate distributions¶

Multivariate distributions¶

Sampling utilities¶

Parallel random streams¶

F.5 — SciPy stats: the complete statistical toolkit¶

Parameterization: the most common error source¶

A complete analysis: fit a Gamma and test the fit¶

F.6 — Bringing it together: a library selection guide¶

F.7 — Practice problem: reproducibility & parallel simulation¶

Summary & looking ahead¶

Method	Math	Meaning
`pdf(x)` / `pmf(x)`	$f(x)$ or $P(X=x)$	density / mass
`cdf(x)`	$F(x)=P(X\le x)$	cumulative
`sf(x)`	$1-F(x)$	survival
`ppf(q)`	$F^{-1}(q)$	quantile (inverse CDF)
`rvs(size)`	—	random samples
`mean()`, `var()`, `std()`	$E[X]$, $\mathrm{Var}(X)$	moments
`fit(data)`	$\hat\theta_{\text{MLE}}$	maximum-likelihood fit

Need	Reach for	One-liner
A few random numbers	`random`	`random.random()`, `random.choice(seq)`
Fast arrays of draws	NumPy `Generator`	`rng = np.random.default_rng(42); rng.normal(size=n)`
PDF / CDF / quantile / fit	`scipy.stats`	`d = stats.norm(0,1); d.cdf(x); d.ppf(q)`
Parallel independent streams	`SeedSequence`	`[default_rng(s) for s in SeedSequence(42).spawn(k)]`

Appendix F — Python Random Generation¶

Learning outcomes¶

Section map¶

F.1 — The Python ecosystem at a glance¶

F.2 — Understanding pseudo-random number generation¶

F.3 — The standard library: the random module¶

Random operations on sequences¶

Distribution generators (and the parameterization trap)¶

F.4 — NumPy: fast vectorized random sampling¶

Univariate distributions¶

Multivariate distributions¶

Sampling utilities¶

Parallel random streams¶

F.5 — SciPy stats: the complete statistical toolkit¶

Parameterization: the most common error source¶

A complete analysis: fit a Gamma and test the fit¶

F.6 — Bringing it together: a library selection guide¶

F.7 — Practice problem: reproducibility & parallel simulation¶

Summary & looking ahead¶

F.3 — The standard library: the `random` module¶