import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

np.random.seed(418)            # reproducibility (STYLE.md contract)
plt.rcParams.update({
    "figure.figsize": (7.0, 4.2), "figure.dpi": 110,
    "axes.grid": True, "grid.alpha": 0.25,
    "axes.spines.top": False, "axes.spines.right": False,
    "font.size": 11,
})
BLUE, RED, GREEN, ORANGE, GRAY = "#4C72B0", "#C44E52", "#55A868", "#DD8452", "#7f7f7f"

# Course data bucket: the least-squares section loads the Chapter 3 Advertising set.
BASE = ("https://pqyjaywwccbnqpwgeiuv.supabase.co/storage/v1/object/public/"
        "STAT%20418%20Images/Data/Chapter3/")
def url(name):
    return BASE + name

print("environment ready · numpy", np.__version__, "· pandas", pd.__version__)

environment ready · numpy 1.26.4 · pandas 2.3.3


# Inner product, norm, angle, and the Cauchy-Schwarz inequality.
x = np.array([3.0, 1.0, 4.0])
y = np.array([2.0, 7.0, 1.0])

dot = x @ y                       # = 3*2 + 1*7 + 4*1
nx, ny = np.linalg.norm(x), np.linalg.norm(y)
cos_ang = dot / (nx * ny)

print(f"x . y            = {dot:.4f}      (3*2 + 1*7 + 4*1)")
print(f"||x|| = sqrt(26) = {nx:.4f}")
print(f"||y|| = sqrt(54) = {ny:.4f}")
print(f"Cauchy-Schwarz : |x.y| = {abs(dot):.4f}  <=  ||x||*||y|| = {nx*ny:.4f}   -> {abs(dot) <= nx*ny}")
print(f"angle(x, y)      = {np.degrees(np.arccos(cos_ang)):.2f} deg")

# An orthogonal pair: inner product is exactly zero.
u = np.array([1.0, 0.0, -1.0]); v = np.array([1.0, 5.0, 1.0])
print(f"u . v            = {u @ v:.1f}   -> orthogonal: {np.isclose(u @ v, 0.0)}")

x . y            = 17.0000      (3*2 + 1*7 + 4*1)
||x|| = sqrt(26) = 5.0990
||y|| = sqrt(54) = 7.3485
Cauchy-Schwarz : |x.y| = 17.0000  <=  ||x||*||y|| = 37.4700   -> True
angle(x, y)      = 63.02 deg
u . v            = 0.0   -> orthogonal: True


# Orthogonal matrices preserve lengths and angles (Q^T Q = I), so they never
# amplify error -- the reason QR is numerically safer than forming X^T X.
theta = np.pi / 4
Q = np.array([[np.cos(theta), -np.sin(theta)],
              [np.sin(theta),  np.cos(theta)]])   # a rotation by 45 degrees
w = np.array([3.0, 4.0])

print("Q^T Q =\n", np.round(Q.T @ Q, 12))
print(f"||w||   = {np.linalg.norm(w):.4f}")
print(f"||Q w|| = {np.linalg.norm(Q @ w):.4f}   (length preserved by the rotation)")

Q^T Q =
 [[ 1. -0.]
 [-0.  1.]]
||w||   = 5.0000
||Q w|| = 5.0000   (length preserved by the rotation)


# Multiplication, transpose rule, trace, rank, determinant.
A = np.array([[1., 2., 3.],
              [4., 5., 6.],
              [7., 8., 9.]])
B = np.array([[1., 0., 2.],
              [0., 1., 1.],
              [1., 1., 0.]])

print("A @ B =\n", A @ B)
print(f"\n(AB)^T == B^T A^T : {np.allclose((A @ B).T, B.T @ A.T)}   (order reverses)")
print(f"AB == BA          : {np.allclose(A @ B, B @ A)}   (not commutative)")
print(f"trace(A)          = {np.trace(A):.1f}")
print(f"rank(A)           = {np.linalg.matrix_rank(A)}   (row3 = 2*row2 - row1, so only 2 independent)")
print(f"det(A)            = {np.linalg.det(A):.3e}   (~0  ->  singular, not invertible)")
print(f"det(B)            = {np.linalg.det(B):.1f}   (nonzero  ->  invertible)")
print(f"det(AB)=det(A)det(B): {np.isclose(np.linalg.det(A @ B), np.linalg.det(A)*np.linalg.det(B))}")

A @ B =
 [[ 4.  5.  4.]
 [10. 11. 13.]
 [16. 17. 22.]]

(AB)^T == B^T A^T : True   (order reverses)
AB == BA          : False   (not commutative)
trace(A)          = 15.0
rank(A)           = 2   (row3 = 2*row2 - row1, so only 2 independent)
det(A)            = 0.000e+00   (~0  ->  singular, not invertible)
det(B)            = -3.0   (nonzero  ->  invertible)
det(AB)=det(A)det(B): True


# Solve A x = b WITHOUT forming the inverse (faster + more stable; STYLE pitfall).
M = np.array([[4., 2.],
              [2., 3.]])
b = np.array([1.0, 1.0])

x_solve = np.linalg.solve(M, b)        # LU factorization internally
x_inv   = np.linalg.inv(M) @ b         # forms the full inverse first (avoid)
print(f"np.linalg.solve(M, b) = {x_solve}")
print(f"np.linalg.inv(M) @ b  = {x_inv}   (same answer, worse numerics)")
print(f"agree: {np.allclose(x_solve, x_inv)}")

# Trace cyclic property tr(ABC) = tr(CAB), and the inner-product identity x'Ax = tr(A x x').
rng = np.random.default_rng(0)
P, Qm, R = rng.standard_normal((3, 3)), rng.standard_normal((3, 3)), rng.standard_normal((3, 3))
print(f"\ntr(PQR) = tr(RPQ) : {np.isclose(np.trace(P @ Qm @ R), np.trace(R @ P @ Qm))}   (cyclic)")
xv = rng.standard_normal(2)
print(f"x'Mx == tr(M x x'): {np.isclose(xv @ M @ xv, np.trace(M @ np.outer(xv, xv)))}   (inner-product identity)")

np.linalg.solve(M, b) = [0.125 0.25 ]
np.linalg.inv(M) @ b  = [0.125 0.25 ]   (same answer, worse numerics)
agree: True

tr(PQR) = tr(RPQ) : True   (cyclic)
x'Mx == tr(M x x'): True   (inner-product identity)


# Symmetric -> spectral decomposition;  positive definite -> all eigenvalues > 0.
A = np.array([[4., 2.],
              [2., 3.]])                       # symmetric
vals, Qv = np.linalg.eigh(A)                   # eigh: symmetric solver
print(f"eigenvalues          = {vals.round(4)}   (both > 0  ->  positive definite)")
print(f"reconstruction error : {np.max(np.abs(A - Qv @ np.diag(vals) @ Qv.T)):.2e}   (A = Q Lambda Q^T)")

# Verify x'Ax > 0 for 1000 random x (definition of PD).
rng = np.random.default_rng(42)
quad_min = min(float(z @ A @ z) for z in rng.standard_normal((1000, 2)))
print(f"min x'Ax over 1000 random x = {quad_min:.4f}  (> 0, confirms PD)")

# Eigenvectors of a symmetric matrix are orthogonal:
print(f"q1 . q2 = {Qv[:,0] @ Qv[:,1]:.2e}  (orthogonal eigenvectors)")

eigenvalues          = [1.4384 5.5616]   (both > 0  ->  positive definite)
reconstruction error : 8.88e-16   (A = Q Lambda Q^T)
min x'Ax over 1000 random x = 0.0113  (> 0, confirms PD)
q1 . q2 = 0.00e+00  (orthogonal eigenvectors)


# Idempotent matrices are projections: eigenvalues are 0/1 and rank == trace.
rng = np.random.default_rng(42)
n, p = 8, 3
Xr = rng.standard_normal((n, p))
Pmat = Xr @ np.linalg.solve(Xr.T @ Xr, Xr.T)   # projection onto col-space of Xr

print(f"P^2 == P (idempotent) : {np.allclose(Pmat @ Pmat, Pmat)}")
print(f"P symmetric           : {np.allclose(Pmat, Pmat.T)}")
ev = np.linalg.eigvalsh(Pmat)
print(f"eigenvalues (rounded) : {ev.round(3)}")
print(f"  # near 0 = {np.sum(np.abs(ev) < 1e-9)},  # near 1 = {np.sum(np.abs(ev-1) < 1e-9)}")
print(f"rank(P) = {np.linalg.matrix_rank(Pmat)} == trace(P) = {np.trace(Pmat):.1f} == p = {p}")

P^2 == P (idempotent) : True
P symmetric           : True
eigenvalues (rounded) : [-0. -0. -0.  0.  0.  1.  1.  1.]
  # near 0 = 5,  # near 1 = 3
rank(P) = 3 == trace(P) = 3.0 == p = 3


# Stochastic (transition) matrix: rows are probabilities; stationary pi is the
# left eigenvector with eigenvalue 1, and P^k converges to rows of pi.
Pt = np.array([[0.7, 0.2, 0.1],
               [0.1, 0.6, 0.3],
               [0.2, 0.3, 0.5]])
print(f"row sums = {Pt.sum(axis=1)}   (each row is a probability distribution)")

# Left eigenvector for eigenvalue 1  ->  pi^T P = pi^T.
w, V = np.linalg.eig(Pt.T)
pi = np.real(V[:, np.argmin(np.abs(w - 1.0))]);  pi = pi / pi.sum()
print(f"stationary pi          = {pi.round(4)}")
print(f"pi^T P == pi^T         : {np.allclose(pi @ Pt, pi)}")

P50 = np.linalg.matrix_power(Pt, 50)
print(f"rows of P^50 (-> pi)   = {P50[0].round(4)}")
print(f"second-largest |lambda| = {np.sort(np.abs(np.linalg.eigvals(Pt)))[-2]:.4f}  (governs convergence speed)")

row sums = [1. 1. 1.]   (each row is a probability distribution)
stationary pi          = [0.3235 0.3824 0.2941]
pi^T P == pi^T         : True
rows of P^50 (-> pi)   = [0.3235 0.3824 0.2941]
second-largest |lambda| = 0.5414  (governs convergence speed)


# Schur complement: block determinant identity det(A) = det(A11) det(S).
rng = np.random.default_rng(7)
G = rng.standard_normal((5, 5)); A = G @ G.T + np.eye(5)   # symmetric PD, 2+3 blocks
A11, A12 = A[:2, :2], A[:2, 2:]
A21, A22 = A[2:, :2], A[2:, 2:]
S = A22 - A21 @ np.linalg.solve(A11, A12)                  # Schur complement of A11
print(f"det(A)                 = {np.linalg.det(A):.4f}")
print(f"det(A11) * det(S)      = {np.linalg.det(A11) * np.linalg.det(S):.4f}")
print(f"block determinant holds: {np.isclose(np.linalg.det(A), np.linalg.det(A11)*np.linalg.det(S))}")

det(A)                 = 588.6852
det(A11) * det(S)      = 588.6852
block determinant holds: True


# Sherman-Morrison-Woodbury: a rank-3 update, verified against direct inversion.
rng = np.random.default_rng(42)
n, k = 100, 3
Am = rng.standard_normal((n, n)); Am = Am @ Am.T + np.eye(n)   # PD
U = rng.standard_normal((n, k))
C = np.eye(k)
Vm = rng.standard_normal((k, n))

direct = np.linalg.inv(Am + U @ C @ Vm)                       # invert the full n x n
Ainv = np.linalg.inv(Am)
inner = np.linalg.inv(np.linalg.inv(C) + Vm @ Ainv @ U)       # only a k x k inverse
smw = Ainv - Ainv @ U @ inner @ Vm @ Ainv

print(f"n = {n}, rank-{k} update")
print(f"max |direct - SMW| = {np.max(np.abs(direct - smw)):.2e}   (k x k inverse reproduces the n x n one)")

n = 100, rank-3 update
max |direct - SMW| = 1.03e-15   (k x k inverse reproduces the n x n one)


# A 2-D picture of projection: column space = a line; y splits into y_hat + e.
a = np.array([2.0, 1.0])                      # spans the (1-D) column space
yv = np.array([1.0, 3.0])                     # the response vector
proj = (a @ yv) / (a @ a) * a                 # orthogonal projection of y onto span(a)
resid = yv - proj                             # residual, perpendicular to a

fig, ax = plt.subplots(figsize=(5.6, 5.2))
s = np.array([-0.6, 1.9])
ax.plot(s * a[0], s * a[1], color=GRAY, lw=1.5, label="column space  C(X)")
ax.annotate("", xy=yv,   xytext=(0, 0),  arrowprops=dict(arrowstyle="->", color=BLUE,  lw=2.2))
ax.annotate("", xy=proj, xytext=(0, 0),  arrowprops=dict(arrowstyle="->", color=GREEN, lw=2.2))
ax.annotate("", xy=yv,   xytext=proj,    arrowprops=dict(arrowstyle="->", color=RED,   lw=2.2))
ax.text(*(yv + 0.06),   r"$\mathbf{y}$",       color=BLUE,  fontsize=13)
ax.text(proj[0]+0.06, proj[1]-0.22, r"$\hat{\mathbf{y}}=\mathbf{Hy}$", color=GREEN, fontsize=13)
ax.text((yv[0]+proj[0])/2+0.05, (yv[1]+proj[1])/2, r"$\mathbf{e}\perp\mathbf{X}$", color=RED, fontsize=13)
# little right-angle marker at the foot of the projection
d1 = a / np.linalg.norm(a); d2 = resid / np.linalg.norm(resid); h = 0.18
corner = proj + h*d1
ax.plot(*np.array([proj+h*d1, proj+h*d1+h*d2, proj+h*d2]).T, color="k", lw=1)
ax.scatter([0], [0], color="k", zorder=5); ax.text(-0.18, -0.05, "0")
ax.set_xlim(-1.4, 2.6); ax.set_ylim(-0.6, 3.4); ax.set_aspect("equal")
ax.set_title("Least squares = orthogonal projection onto C(X)")
ax.legend(loc="lower right"); plt.tight_layout(); plt.show()
print(f"check  e . a = {resid @ a:.2e}  (residual orthogonal to the column space)")

check  e . a = 0.00e+00  (residual orthogonal to the column space)


# REAL DATA: solve the Chapter 3 Advertising OLS by the normal equations directly.
adv = pd.read_csv(url("Advertising.csv"))
Xmat = np.column_stack([np.ones(len(adv)),
                        adv["TV"].to_numpy(),
                        adv["Radio"].to_numpy(),
                        adv["Newspaper"].to_numpy()])
yv = adv["Sales"].to_numpy()
n, p = Xmat.shape

# Normal equations:  (X^T X) beta = X^T y   -- solved WITHOUT forming an inverse.
beta = np.linalg.solve(Xmat.T @ Xmat, Xmat.T @ yv)
names = ["intercept", "TV", "Radio", "Newspaper"]
print(f"Advertising OLS by normal equations   (n = {n}, p = {p})")
for nm, b in zip(names, beta):
    print(f"  beta[{nm:9s}] = {b:+.5f}")

# Cross-check 1: numpy lstsq (SVD) gives the same coefficients.
beta_ls = np.linalg.lstsq(Xmat, yv, rcond=None)[0]
print(f"\nmax |normal-eq - lstsq(SVD)|      = {np.max(np.abs(beta - beta_ls)):.2e}")

# Cross-check 2: statsmodels OLS from Chapter 3 -> identical betas.
import statsmodels.formula.api as smf
sm_fit = smf.ols("Sales ~ TV + Radio + Newspaper", data=adv).fit()
print(f"max |normal-eq - statsmodels (Ch3)| = {np.max(np.abs(beta - sm_fit.params.values)):.2e}")
print(f"  (Ch3 read-off: TV beta ~ {beta[1]:.4f}, Newspaper ~ {beta[3]:.4f} -- effectively zero)")

Advertising OLS by normal equations   (n = 200, p = 4)
  beta[intercept] = +2.93889
  beta[TV       ] = +0.04576
  beta[Radio    ] = +0.18853
  beta[Newspaper] = -0.00104

max |normal-eq - lstsq(SVD)|      = 2.04e-14
max |normal-eq - statsmodels (Ch3)| = 1.33e-14
  (Ch3 read-off: TV beta ~ 0.0458, Newspaper ~ -0.0010 -- effectively zero)


# Hat matrix on the real design: symmetric, idempotent, trace = p, residual orthogonal.
H = Xmat @ np.linalg.solve(Xmat.T @ Xmat, Xmat.T)   # n x n projection
yhat = H @ yv
e = yv - yhat
lev = np.diag(H)

print(f"H symmetric                 : {np.allclose(H, H.T)}")
print(f"H idempotent (H^2 = H)      : {np.allclose(H @ H, H)}")
print(f"trace(H) = {np.trace(H):.4f}  ==  p = {p}   (effective degrees of freedom)")
print(f"sum of leverages            = {lev.sum():.4f}  ==  p")
print(f"mean leverage = p/n         = {lev.mean():.4f}   (avg = {p/n:.4f})")
print(f"max leverage h_ii           = {lev.max():.4f}  at market #{int(np.argmax(lev))}")
print(f"orthogonality  max|X^T e|   = {np.max(np.abs(Xmat.T @ e)):.2e}   (residual perpendicular to every column)")

# Pythagoras in n dimensions:  ||y||^2 = ||y_hat||^2 + ||e||^2.
print(f"\nPythagoras: ||y||^2 = {yv@yv:.2f},  ||yhat||^2 + ||e||^2 = {yhat@yhat + e@e:.2f}")
R2 = 1 - (e @ e) / np.sum((yv - yv.mean())**2)
print(f"R^2 = 1 - RSS/TSS = {R2:.4f}   (matches the Chapter 3 OLS fit)")

H symmetric                 : True
H idempotent (H^2 = H)      : True
trace(H) = 4.0000  ==  p = 4   (effective degrees of freedom)
sum of leverages            = 4.0000  ==  p
mean leverage = p/n         = 0.0200   (avg = 0.0200)
max leverage h_ii           = 0.0863  at market #16
orthogonality  max|X^T e|   = 7.17e-11   (residual perpendicular to every column)

Pythagoras: ||y||^2 = 44743.25,  ||yhat||^2 + ||e||^2 = 44743.25
R^2 = 1 - RSS/TSS = 0.8972   (matches the Chapter 3 OLS fit)


# Figure: fitted (projected) vs actual sales -- the projection that minimizes RSS.
fig, ax = plt.subplots(figsize=(5.8, 5.2))
ax.scatter(yhat, yv, s=22, color=BLUE, alpha=0.6, edgecolor="white")
lims = [min(yv.min(), yhat.min()) - 1, max(yv.max(), yhat.max()) + 1]
ax.plot(lims, lims, color=RED, lw=1.6, ls="--", label="perfect fit (y = y_hat)")
ax.set_xlim(lims); ax.set_ylim(lims); ax.set_aspect("equal")
ax.set_xlabel(r"fitted  $\hat{y}=\mathbf{Hy}$  (sales)")
ax.set_ylabel("actual sales")
ax.set_title(f"Advertising OLS projection   ($R^2$ = {R2:.3f})")
ax.legend(loc="upper left"); plt.tight_layout(); plt.show()


# Eigendecomposition of a covariance matrix: eigenvectors are the principal axes,
# eigenvalues are the variance along each axis.
Sigma = np.array([[2.0, 1.2],
                  [1.2, 1.0]])
vals, vecs = np.linalg.eigh(Sigma)            # ascending eigenvalues
print(f"eigenvalues (variances) = {vals.round(4)}")
print(f"principal direction (largest var) = {vecs[:, -1].round(4)}")
# Matrix functions via the spectrum: a symmetric square root.
Sig_sqrt = vecs @ np.diag(np.sqrt(vals)) @ vecs.T
print(f"(Sigma^1/2)^2 == Sigma : {np.allclose(Sig_sqrt @ Sig_sqrt, Sigma)}")

# Draw the 2-sd covariance ellipse with its eigen-axes over a sample cloud.
rng = np.random.default_rng(418)
L = np.linalg.cholesky(Sigma)
pts = (rng.standard_normal((600, 2)) @ L.T)
tt = np.linspace(0, 2*np.pi, 200)
ell = vecs @ np.diag(2*np.sqrt(vals)) @ np.array([np.cos(tt), np.sin(tt)])
fig, ax = plt.subplots(figsize=(5.6, 5.2))
ax.scatter(pts[:, 0], pts[:, 1], s=10, alpha=0.25, color=BLUE)
ax.plot(ell[0], ell[1], color="k", lw=2, label="2-sd ellipse")
for i, col in [(-1, RED), (0, GREEN)]:
    ax.annotate("", xy=2*np.sqrt(vals[i])*vecs[:, i], xytext=(0, 0),
                arrowprops=dict(arrowstyle="->", color=col, lw=2.4))
ax.text(2.6, 1.6, r"$\mathbf{q}_1\,(\lambda_1)$", color=RED)
ax.text(-1.8, 1.4, r"$\mathbf{q}_2\,(\lambda_2)$", color=GREEN)
ax.set_aspect("equal"); ax.set_title("Eigendecomposition: principal axes of a covariance")
ax.legend(loc="lower right"); plt.tight_layout(); plt.show()

eigenvalues (variances) = [0.2 2.8]
principal direction (largest var) = [-0.8321 -0.5547]
(Sigma^1/2)^2 == Sigma : True


# Cholesky decomposition Sigma = L L^T is the matrix square root used to GENERATE
# correlated normals:  X = L Z  has Var(X) = L I L^T = Sigma.
Sig3 = np.array([[1.0, 0.8, 0.3],
                 [0.8, 1.0, 0.5],
                 [0.3, 0.5, 1.0]])
L = np.linalg.cholesky(Sig3)
print("L (lower triangular) =\n", L.round(4))
print(f"reconstruction error : {np.max(np.abs(Sig3 - L @ L.T)):.2e}")

rng = np.random.default_rng(42)
Z = rng.standard_normal((3, 100000))
Xc = L @ Z
print("sample covariance of X = L Z (-> Sigma):\n", np.cov(Xc).round(3))

# Figure: independent Z (left) vs correlated X = L Z (right), rho = 0.8.
L2 = np.linalg.cholesky(np.array([[1.0, 0.8], [0.8, 1.0]]))
Z2 = rng.standard_normal((2, 1500)); X2 = L2 @ Z2
fig, axes = plt.subplots(1, 2, figsize=(9.4, 4.6))
for ax, data, ttl, col in [(axes[0], Z2, r"independent  $\mathbf{Z}\sim N(0,\mathbf{I})$", GRAY),
                           (axes[1], X2, r"correlated  $\mathbf{X}=\mathbf{LZ}$  ($\rho=0.8$)", BLUE)]:
    ax.scatter(data[0], data[1], s=8, alpha=0.3, color=col)
    ax.set_xlim(-4, 4); ax.set_ylim(-4, 4); ax.set_aspect("equal"); ax.set_title(ttl)
plt.tight_layout(); plt.show()

L (lower triangular) =
 [[1.     0.     0.    ]
 [0.8    0.6    0.    ]
 [0.3    0.4333 0.8498]]
reconstruction error : 1.11e-16
sample covariance of X = L Z (-> Sigma):
 [[1.007 0.805 0.3  ]
 [0.805 1.005 0.504]
 [0.3   0.504 1.   ]]


# QR decomposition: numerically stable regression without forming X^T X.
rng = np.random.default_rng(42)
n, p = 100, 5
Xq = rng.standard_normal((n, p))
yq = Xq @ np.ones(p) + rng.standard_normal(n)

Qm, Rm = np.linalg.qr(Xq)                       # X = Q R, Q^T Q = I
beta_qr = np.linalg.solve(Rm, Qm.T @ yq)        # triangular solve, R beta = Q^T y
beta_ne = np.linalg.solve(Xq.T @ Xq, Xq.T @ yq) # direct normal equations
print(f"Q^T Q == I            : {np.allclose(Qm.T @ Qm, np.eye(p))}")
print(f"max |QR - normal eq|  = {np.max(np.abs(beta_qr - beta_ne)):.2e}")
# Leverage falls straight out of Q:  h_ii = ||row_i of Q||^2 = diag(Q Q^T).
lev_qr = np.sum(Qm**2, axis=1)
lev_H = np.diag(Xq @ np.linalg.solve(Xq.T @ Xq, Xq.T))
print(f"leverage via Q vs H   : max diff = {np.max(np.abs(lev_qr - lev_H)):.2e}")

Q^T Q == I            : True
max |QR - normal eq|  = 6.66e-16
leverage via Q vs H   : max diff = 5.55e-17


# SVD: the universal decomposition. sigma_i^2 are the eigenvalues of X^T X; the
# condition number is sigma_max/sigma_min; and ridge shrinks each SVD direction.
rng = np.random.default_rng(42)
n, p = 100, 5
Xs = rng.standard_normal((n, p))
U, sig, Vt = np.linalg.svd(Xs, full_matrices=False)
print(f"singular values   = {sig.round(3)}")
print(f"condition number  = sigma_max/sigma_min = {sig[0]/sig[-1]:.2f}")
print(f"sigma^2 == eig(X^T X): {np.allclose(np.sort(sig**2), np.sort(np.linalg.eigvalsh(Xs.T @ Xs)))}")

ys = Xs @ np.ones(p) + rng.standard_normal(n)
lam = 1.0
beta_ridge = Vt.T @ ((sig / (sig**2 + lam)) * (U.T @ ys))   # ridge via SVD
df_ridge = np.sum(sig**2 / (sig**2 + lam))                  # effective d.o.f.
print(f"ridge(lambda=1) effective df = {df_ridge:.2f}   (< p = {p})")

# Figure: shrinkage factor sigma^2/(sigma^2+lambda) by SVD direction, several lambda.
fig, ax = plt.subplots(figsize=(7.0, 4.2))
idx = np.arange(1, p + 1)
for lam, col in [(0.01, GREEN), (1.0, BLUE), (10.0, ORANGE), (100.0, RED)]:
    ax.plot(idx, sig**2 / (sig**2 + lam), "o-", color=col, lw=2, label=fr"$\lambda={lam:g}$")
ax.set_xticks(idx); ax.set_ylim(-0.03, 1.05)
ax.set_xlabel("SVD direction (decreasing singular value $\\rightarrow$)")
ax.set_ylabel(r"shrinkage  $\sigma_i^2/(\sigma_i^2+\lambda)$")
ax.set_title("Ridge shrinks weak (small-$\\sigma$) directions the most")
ax.legend(); plt.tight_layout(); plt.show()

singular values   = [11.379 10.055  9.283  9.116  7.749]
condition number  = sigma_max/sigma_min = 1.47
sigma^2 == eig(X^T X): True
ridge(lambda=1) effective df = 4.94   (< p = 5)


# Var(A X) = A Var(X) A^T, checked by Monte Carlo, then used for the OLS variance.
rng = np.random.default_rng(418)
Sig = np.array([[2.0, 0.6, 0.2],
                [0.6, 1.0, 0.3],
                [0.2, 0.3, 1.5]])
Amap = np.array([[1.0, 2.0, 0.0],
                 [0.0, 1.0, -1.0]])
L = np.linalg.cholesky(Sig)
Xd = rng.standard_normal((400000, 3)) @ L.T      # Var(X) = Sigma
Yd = Xd @ Amap.T                                 # Y = A X
print("empirical Var(AX) =\n", np.cov(Yd.T).round(3))
print("A Sigma A^T       =\n", (Amap @ Sig @ Amap.T).round(3))
print(f"max diff = {np.max(np.abs(np.cov(Yd.T) - Amap @ Sig @ Amap.T)):.3f}\n")

# OLS variance Var(beta_hat) = sigma^2 (X^T X)^-1 is the same rule with A = (X^T X)^-1 X^T.
n, p = 1000, 3
Xo = rng.standard_normal((n, p)); true_beta = np.array([1.0, 2.0, -1.0]); sigma = 0.5
yo = Xo @ true_beta + sigma * rng.standard_normal(n)
XtX_inv = np.linalg.inv(Xo.T @ Xo)
bhat = XtX_inv @ Xo.T @ yo
s2 = np.sum((yo - Xo @ bhat)**2) / (n - p)
se = np.sqrt(np.diag(s2 * XtX_inv))
print(f"beta_hat        = {bhat.round(4)}   (true = {true_beta})")
print(f"standard errors = {se.round(4)}   (~ sigma/sqrt(n) = {sigma/np.sqrt(n):.4f})")

empirical Var(AX) =
 [[8.392 1.791]
 [1.791 1.896]]
A Sigma A^T       =
 [[8.4 1.8]
 [1.8 1.9]]
max diff = 0.009

beta_hat        = [ 1.0163  1.9958 -0.984 ]   (true = [ 1.  2. -1.])
standard errors = [0.0158 0.0169 0.0171]   (~ sigma/sqrt(n) = 0.0158)


# Chi-square from an idempotent quadratic form:  Z^T P Z ~ chi^2_r  (r = rank P).
rng = np.random.default_rng(418)
nz, r = 12, 4
Xrnd = rng.standard_normal((nz, r))
Pproj = Xrnd @ np.linalg.solve(Xrnd.T @ Xrnd, Xrnd.T)   # symmetric idempotent, rank r
Zd = rng.standard_normal((300000, nz))
quad = np.einsum("ij,jk,ik->i", Zd, Pproj, Zd)          # z^T P z for each draw
print(f"rank(P) = {np.linalg.matrix_rank(Pproj)}  (target chi^2 degrees of freedom)")
print(f"mean(z'Pz) = {quad.mean():.3f}   (chi^2_{r} mean  = {r})")
print(f"var(z'Pz)  = {quad.var():.3f}   (chi^2_{r} var   = {2*r})")

rank(P) = 4  (target chi^2 degrees of freedom)
mean(z'Pz) = 3.997   (chi^2_4 mean  = 4)
var(z'Pz)  = 7.997   (chi^2_4 var   = 8)


# Gradient identities, verified against central finite differences.
rng = np.random.default_rng(0)
Asym = np.array([[4.0, 1.0],
                 [1.0, 3.0]])               # symmetric
avec = np.array([2.0, -1.0])
x0 = np.array([1.0, -2.0])
eps = 1e-6

def fd_grad(f, x):
    g = np.zeros_like(x)
    for k in range(len(x)):
        d = np.zeros_like(x); d[k] = eps
        g[k] = (f(x + d) - f(x - d)) / (2 * eps)
    return g

g_lin = fd_grad(lambda v: avec @ v, x0)
g_quad = fd_grad(lambda v: v @ Asym @ v, x0)
print(f"grad(a^T x): numeric {g_lin.round(5)}  vs  a            = {avec}")
print(f"grad(x^T A x): numeric {g_quad.round(5)}  vs  2 A x = {2 * Asym @ x0}")

# Derivative of log-det:  d/dt logdet(A0 + t E) = tr(A0^-1 E)  at t = 0.
M = rng.standard_normal((4, 4)); A0 = M @ M.T + 4 * np.eye(4)
E = rng.standard_normal((4, 4)); E = E + E.T
ld = lambda t: np.linalg.slogdet(A0 + t * E)[1]
print(f"\nd/dt log det: numeric {(ld(eps) - ld(-eps)) / (2*eps):.5f}  vs  tr(A^-1 E) = {np.trace(np.linalg.solve(A0, E)):.5f}")

grad(a^T x): numeric [ 2. -1.]  vs  a            = [ 2. -1.]
grad(x^T A x): numeric [  4. -10.]  vs  2 A x = [  4. -10.]

d/dt log det: numeric 0.04060  vs  tr(A^-1 E) = 0.04060


# Jacobian and the multivariate delta method: Var(g) ~ J Var(theta) J^T.
def g(th):                                   # g: R^2 -> R^2
    return np.array([th[0] * th[1], th[0]**2])
theta = np.array([2.0, 3.0])
Sig_th = np.array([[0.10, 0.02],
                   [0.02, 0.05]])            # Var(theta_hat)

# Analytic Jacobian J = d g / d theta^T.
J = np.array([[theta[1], theta[0]],
              [2*theta[0], 0.0]])
var_delta = J @ Sig_th @ J.T

# Monte-Carlo check by pushing samples of theta_hat through g.
rng = np.random.default_rng(418)
samp = rng.multivariate_normal(theta, Sig_th, size=400000)
gv = np.column_stack([samp[:, 0]*samp[:, 1], samp[:, 0]**2])
print("Jacobian J =\n", J)
print("delta-method  J Sig J^T =\n", var_delta.round(4))
print("Monte-Carlo Var(g)      =\n", np.cov(gv.T).round(4))
print(f"max diff = {np.max(np.abs(var_delta - np.cov(gv.T))):.3f}  (close: first-order Taylor is good here)")

Jacobian J =
 [[3. 2.]
 [4. 0.]]
delta-method  J Sig J^T =
 [[1.34 1.36]
 [1.36 1.6 ]]
Monte-Carlo Var(g)      =
 [[1.3426 1.3603]
 [1.3603 1.6155]]
max diff = 0.016  (close: first-order Taylor is good here)


# Condition number squaring, and QR staying stable where the normal equations rot.
rng = np.random.default_rng(42)
n, p = 100, 5
Xw = rng.standard_normal((n, p))                       # well conditioned
Xi = rng.standard_normal((n, p))
Xi[:, 4] = Xi[:, 3] + 1e-8 * rng.standard_normal(n)    # column 5 ~ column 4 (near collinear)

for tag, Xx in [("well-conditioned", Xw), ("near-collinear ", Xi)]:
    kX = np.linalg.cond(Xx); kXtX = np.linalg.cond(Xx.T @ Xx)
    print(f"{tag}: kappa(X) = {kX:.3e},  kappa(X^T X) = {kXtX:.3e}  (~ kappa(X)^2 = {kX**2:.3e})")

# The tiny singular value is what blows up X^T X:
print(f"\nsingular values of the collinear X: {np.linalg.svd(Xi, compute_uv=False).round(8)}")

# QR still solves cleanly on the ill-conditioned design.
yq = rng.standard_normal(n)
Qm, Rm = np.linalg.qr(Xi)
beta_qr = np.linalg.solve(Rm, Qm.T @ yq)
print(f"QR residual norm on collinear X = {np.linalg.norm(yq - Xi @ beta_qr):.4f}  (solves fine)")

well-conditioned: kappa(X) = 1.468e+00,  kappa(X^T X) = 2.156e+00  (~ kappa(X)^2 = 2.156e+00)
near-collinear : kappa(X) = 2.285e+08,  kappa(X^T X) = 5.893e+15  (~ kappa(X)^2 = 5.221e+16)

singular values of the collinear X: [1.45507003e+01 1.08176175e+01 9.69189333e+00 8.86428910e+00
 6.00000000e-08]
QR residual norm on collinear X = 9.6664  (solves fine)


# Figure: singular-value spectra -- a well-conditioned design vs a collinear one.
sv_w = np.linalg.svd(Xw, compute_uv=False)
sv_i = np.linalg.svd(Xi, compute_uv=False)
fig, ax = plt.subplots(figsize=(7.0, 4.2))
idx = np.arange(1, p + 1)
ax.semilogy(idx, sv_w, "o-", color=BLUE, lw=2, label=f"well-conditioned ($\\kappa$={sv_w[0]/sv_w[-1]:.1f})")
ax.semilogy(idx, sv_i, "s-", color=RED, lw=2, label=f"near-collinear ($\\kappa$={sv_i[0]/sv_i[-1]:.1e})")
ax.set_xticks(idx); ax.set_xlabel("singular value index"); ax.set_ylabel(r"$\sigma_i$ (log scale)")
ax.set_title("Collinearity creates a tiny singular value (huge condition number)")
ax.legend(); plt.tight_layout(); plt.show()


# Solution 1.
rng = np.random.default_rng(42)
n, p = 50, 3
X = np.column_stack([np.ones(n), rng.standard_normal(n), rng.standard_normal(n)])
H = X @ np.linalg.solve(X.T @ X, X.T)
ev = np.linalg.eigvalsh(H)
print(f"(a) symmetric={np.allclose(H, H.T)}, idempotent={np.allclose(H @ H, H)}, "
      f"trace={np.trace(H):.4f} (=p={p})")
print(f"    eigenvalues: {np.sum(np.abs(ev) < 1e-9)} near 0, {np.sum(np.abs(ev-1) < 1e-9)} near 1")

lev = np.diag(H)
print(f"(b) leverage: min={lev.min():.4f} (>= 1/n={1/n:.4f}), max={lev.max():.4f} (<= 1), "
      f"mean={lev.mean():.4f} (=p/n)")

# (c) outlier at a high-leverage location.
beta_true = np.array([2.0, -1.5, 0.8])
y = X @ beta_true + 0.5 * rng.standard_normal(n)
Xo, yo = X.copy(), y.copy()
Xo[0, 1] = 8.0; Xo[0, 2] = 8.0; yo[0] = 50.0
Ho = Xo @ np.linalg.solve(Xo.T @ Xo, Xo.T)
beta_o = np.linalg.solve(Xo.T @ Xo, Xo.T @ yo)
yhat0 = (Xo @ beta_o)[0]
print(f"(c) outlier leverage h_00={np.diag(Ho)[0]:.4f}; y_0=50.00, y_hat_0={yhat0:.4f} "
      f"(fit dragged {100*np.diag(Ho)[0]:.1f}% toward the outlier)")

# (d) orthogonality.
e = y - H @ y
print(f"(d) max|X^T e| = {np.max(np.abs(X.T @ e)):.2e}; sum(e)={np.sum(e):.2e} (orthogonal to ones)")

(a) symmetric=True, idempotent=True, trace=3.0000 (=p=3)
    eigenvalues: 47 near 0, 3 near 1
(b) leverage: min=0.0203 (>= 1/n=0.0200), max=0.1834 (<= 1), mean=0.0600 (=p/n)
(c) outlier leverage h_00=0.8254; y_0=50.00, y_hat_0=40.8597 (fit dragged 82.5% toward the outlier)
(d) max|X^T e| = 4.44e-15; sum(e)=4.44e-15 (orthogonal to ones)


# Solution 2.
rng = np.random.default_rng(42)
n, p = 100, 5
X = rng.standard_normal((n, p))
X[:, 4] = X[:, 3] + 1e-6 * rng.standard_normal(n)
beta_true = np.array([1.0, -2.0, 1.5, 0.5, -0.3])
y = X @ beta_true + 0.1 * rng.standard_normal(n)

cX, cXtX = np.linalg.cond(X), np.linalg.cond(X.T @ X)
print(f"(a) kappa(X)={cX:.3e}, kappa(X^T X)={cXtX:.3e}, kappa(X)^2={cX**2:.3e}")

b_normal = np.linalg.inv(X.T @ X) @ X.T @ y
Qm, Rm = np.linalg.qr(X); b_qr = np.linalg.solve(Rm, Qm.T @ y)
b_lstsq = np.linalg.lstsq(X, y, rcond=None)[0]
print(f"(b) beta_4 estimates: normal={b_normal[3]:.1f}, qr={b_qr[3]:.1f}, lstsq={b_lstsq[3]:.1f} (wild!)")
print(f"    but beta_4+beta_5: true={beta_true[3]+beta_true[4]:.3f}, "
      f"est={b_lstsq[3]+b_lstsq[4]:.3f} (identifiable)")
print(f"    residual norms identical: {np.allclose(np.linalg.norm(y-X@b_normal), np.linalg.norm(y-X@b_lstsq))}")

U, sig, Vt = np.linalg.svd(X, full_matrices=False)
print(f"    singular values: {sig.round(8)}  (sigma_5 is ~1e-6)")
print("\n(c) ridge via SVD:")
print(f"    {'lambda':>8} {'df':>7} {'max|err|':>10}")
for lam in [0.001, 0.1, 1.0, 10.0]:
    b_r = Vt.T @ ((sig / (sig**2 + lam)) * (U.T @ y))
    print(f"    {lam:>8.3f} {np.sum(sig**2/(sig**2+lam)):>7.4f} {np.max(np.abs(b_r - beta_true)):>10.4e}")
print(f"    OLS max error was {np.max(np.abs(b_lstsq - beta_true)):.3e} -> ridge cuts it to ~0.40")

(a) kappa(X)=2.045e+06, kappa(X^T X)=4.180e+12, kappa(X)^2=4.184e+12
(b) beta_4 estimates: normal=-1261.6, qr=-1262.7, lstsq=-1262.7 (wild!)
    but beta_4+beta_5: true=0.200, est=0.196 (identifiable)
    residual norms identical: True
    singular values: [1.49647921e+01 9.96665833e+00 9.12872408e+00 8.24476672e+00
 7.32000000e-06]  (sigma_5 is ~1e-6)

(c) ridge via SVD:
      lambda      df   max|err|
       0.001  4.0000 4.0226e-01
       0.100  3.9959 4.0229e-01
       1.000  3.9592 4.0311e-01
      10.000  3.6304 4.1043e-01
    OLS max error was 1.263e+03 -> ridge cuts it to ~0.40


# Solution 3.
rng = np.random.default_rng(42)
n, p = 80, 4
X = np.column_stack([np.ones(n), rng.standard_normal((n, p - 1))])
beta_true = np.array([3.0, -1.0, 2.0, 0.5])
y = X @ beta_true + 0.5 * rng.standard_normal(n)

H = X @ np.linalg.solve(X.T @ X, X.T)
e = y - H @ y
lev = np.diag(H)
loo_formula = e / (1 - lev)                  # SMW shortcut: one fit only

loo_brute = np.zeros(n)                       # refit n times
for i in range(n):
    Xi_, yi_ = np.delete(X, i, 0), np.delete(y, i, 0)
    b_i = np.linalg.lstsq(Xi_, yi_, rcond=None)[0]
    loo_brute[i] = y[i] - X[i] @ b_i

print(f"max |formula - brute force| = {np.max(np.abs(loo_formula - loo_brute)):.2e}  (machine precision)")
print(f"LOO-CV MSE (formula)     = {np.mean(loo_formula**2):.6f}")
print(f"LOO-CV MSE (brute force) = {np.mean(loo_brute**2):.6f}")
print(f"agreement to 1e-12: {np.allclose(loo_formula, loo_brute, atol=1e-12)}  "
      f"-> O(n p^2) instead of O(n^2 p^2)")

max |formula - brute force| = 4.36e-15  (machine precision)
LOO-CV MSE (formula)     = 0.288641
LOO-CV MSE (brute force) = 0.288641
agreement to 1e-12: True  -> O(n p^2) instead of O(n^2 p^2)

§	Topic	What we run
B.1	Vectors & inner products	dot / norm / Cauchy–Schwarz; orthogonal matrices preserve length
B.2	Matrix operations	multiply, `solve` vs `inv`, trace, determinant, rank
B.3	Special structures	symmetric, idempotent, positive-definite, stochastic
B.4	Block matrices	Schur complement + Sherman–Morrison–Woodbury
B.5	Projections & least squares	Advertising normal equations + hat matrix (→ Ch 3)
B.6	Decompositions	eigen, Cholesky (sampling), QR, SVD (ridge)
B.7	Quadratic forms & covariance	$\mathrm{Var}(\mathbf{A}\mathbf{X})$, $\chi^2$ from idempotents
B.8	Matrix calculus	gradients, log-det, Jacobian / delta method
B.9	Numerical considerations	condition number, why never form $\mathbf{X}^\top\mathbf{X}$
—	Exercises	hat matrix · ill-conditioned ridge · block / LOOCV

Decomposition	Needs	Primary use in this course
Eigen $\mathbf{Q}\boldsymbol\Lambda\mathbf{Q}^\top$	symmetric	PCA, Markov convergence, definiteness
Cholesky $\mathbf{L}\mathbf{L}^\top$	symmetric PD	sampling correlated normals, PD solves
QR $\mathbf{Q}\mathbf{R}$	full column rank	stable regression, leverage
SVD $\mathbf{U}\boldsymbol\Sigma\mathbf{V}^\top$	any matrix	ridge, condition number, rank

Task	Naive (avoid)	Stable (prefer)
Solve $\mathbf{X}^\top\mathbf{X}\beta=\mathbf{X}^\top\mathbf{y}$	`inv(X.T@X)@X.T@y`	`np.linalg.lstsq(X, y)` (SVD)
Leverage $h_{ii}$	`diag(X@inv(X.T@X)@X.T)`	`Q,R=qr(X); sum(Q**2,1)`
PD solve $\mathbf{A}\mathbf{x}=\mathbf{b}$	`inv(A)@b`	`solve(A,b)` / `cho_solve`

Idea	Formula	Where it shows up
Orthogonal projection	$\mathbf{H}=\mathbf{X}(\mathbf{X}^\top\mathbf{X})^{-1}\mathbf{X}^\top$	OLS fitted values (Ch 3)
Normal equations	$\mathbf{X}^\top\mathbf{X}\hat\beta=\mathbf{X}^\top\mathbf{y}$	Advertising fit, machine-precision match
Variance of a linear map	$\mathrm{Var}(\mathbf{A}\mathbf{X})=\mathbf{A}\mathrm{Var}(\mathbf{X})\mathbf{A}^\top$	$\sigma^2(\mathbf{X}^\top\mathbf{X})^{-1}$, delta method
Eigen / spectral	$\mathbf{A}=\mathbf{Q}\boldsymbol\Lambda\mathbf{Q}^\top$	PCA, Markov convergence (Ch 5)
Cholesky	$\boldsymbol\Sigma=\mathbf{L}\mathbf{L}^\top$	sampling correlated normals (Ch 2)
SVD / condition number	$\mathbf{X}=\mathbf{U}\boldsymbol\Sigma\mathbf{V}^\top$, $\kappa=\sigma_1/\sigma_p$	ridge shrinkage, stability
SMW rank update	$(\mathbf{A}+\mathbf{u}\mathbf{v}^\top)^{-1}=\cdots$	LOOCV shortcut (Ch 4)

Appendix B — Linear Algebra for Data Science¶

Learning outcomes¶

Section map¶

Section B.1 — Vectors and Inner Products¶

Section B.2 — Matrix Operations and Properties¶

Section B.3 — Special Matrix Structures¶

Section B.4 — Block Matrices¶

Section B.5 — Column Space, Projections, and Least Squares¶

Section B.6 — Matrix Decompositions¶

Section B.7 — Quadratic Forms and Covariance¶

Section B.8 — Matrix Calculus¶

Section B.9 — Numerical Considerations¶

Exercises¶

Summary & connections¶