import json
import urllib.request
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (cohen_kappa_score, accuracy_score, f1_score,
                             confusion_matrix, classification_report)
from sklearn.metrics.pairwise import cosine_similarity
%matplotlib inline

# Seed NumPy's legacy global RNG once, up front (STYLE.md reproducibility contract).
np.random.seed(418)

# House style applied to every figure that follows.
_FIGURE_DEFAULTS = {
    'figure.figsize': (7.2, 4.2),
    'figure.dpi': 110,
    'axes.grid': True,
    'grid.alpha': 0.25,
    'axes.spines.top': False,
    'axes.spines.right': False,
    'font.size': 11,
}
plt.rcParams.update(_FIGURE_DEFAULTS)

# Named palette shared by the plotting cells.
BLUE = '#4C72B0'
RED = '#C44E52'
GREEN = '#55A868'
ORANGE = '#DD8452'
PURPLE = '#8172B3'

# Course data bucket: the shared Chapter 6 reviews corpus (no repo, no API key).
CORPUS_URL = ('https://pqyjaywwccbnqpwgeiuv.supabase.co/storage/v1/object/public/'
              'STAT%20418%20Images/Data/Chapter6/reviews.jsonl')

# Download inside a context manager so the HTTP response (and its socket) is
# closed as soon as the body has been read, even if decoding fails.
with urllib.request.urlopen(CORPUS_URL, timeout=120) as _response:
    raw = _response.read().decode('utf-8')

# JSON Lines records are separated by '\n' only. str.splitlines() would also
# split on U+2028/U+2029/NEL/VT/FF, which may legally appear unescaped inside a
# JSON string and would cut a record in half. A trailing '\r' (CRLF files) is
# plain JSON whitespace, so json.loads tolerates it.
reviews = [json.loads(line) for line in raw.split('\n') if line.strip()]
df = pd.DataFrame(reviews)

# A light HTML/whitespace clean (the full pipeline is built in Section 6.3).
import re as _re
def light_clean(t):
    """Replace HTML-style tags with a space and squeeze whitespace runs.

    ``t`` is converted with ``str`` first, so non-string values are cleaned as
    their string form. Leading and trailing whitespace is removed.
    """
    without_tags = _re.sub(r'<[^>]+>', ' ', str(t))   # e.g. <br /> becomes a space
    squeezed = _re.sub(r'\s+', ' ', without_tags)
    return squeezed.strip()
# Clean every review in place, then record its whitespace-delimited word count.
df['text'] = df['text'].map(light_clean)
df['n_words'] = df['text'].str.split().map(len)

# Corpus summary: size, label balance, rating balance, categories, typical length.
sentiment_counts = df['sentiment'].value_counts().to_dict()
rating_counts = dict(sorted(df['rating'].value_counts().items()))
category_names = list(df['category'].unique())
median_words = int(df['n_words'].median())

print(f'reviews loaded: {len(df)}')
print(f'by sentiment : {sentiment_counts}')
print(f'by rating    : {rating_counts}')
print(f'categories   : {category_names}')
print(f'median length: {median_words} words')

reviews loaded: 9000
by sentiment : {'positive': 3600, 'negative': 3600, 'neutral': 1800}
by rating    : {1: 1800, 2: 1800, 3: 1800, 4: 1800, 5: 1800}
categories   : ['Video_Games', 'Electronics', 'Beauty_and_Personal_Care', 'Cell_Phones_and_Accessories', 'Office_Products', 'Software']
median length: 29 words

# Webbook-only cell: it needs the external `genai_studio` package and a reachable
# model backend, so the replies below are recorded examples, not offline output.
from genai_studio import GenAIStudio
ai = GenAIStudio()                 # client object reused by the later `ai.*` cells
ai.select_model("gemma3:12b")

# ai.chat(...) is printed directly; the recorded reply is shown in the comment.
print(ai.chat("What is the Central Limit Theorem? Answer in two sentences."))
# -> "The sampling distribution of the sample mean approaches a normal
#     distribution as n grows, regardless of the population's shape. This
#     underpins confidence intervals and tests when the population is unknown."

# ai.chat_complete(...) returns an object whose model name and token counts are
# read as attributes below.
resp = ai.chat_complete("Explain correlation vs causation in one paragraph.")
print(resp.model, resp.prompt_tokens, resp.completion_tokens, resp.total_tokens)
# -> gemma3:12b 18 112 130


# Attention as weighted voting (a tiny, self-contained numeric illustration).
# Toy 4-d vectors for "I sat by the river bank" are hand-built so the focus word
# 'bank' is geometrically closest to 'river'; scaled dot-product attention (the
# transformer's formula) then hands 'river' the largest share of the vote.
sentence = ['I', 'sat', 'by', 'the', 'river', 'bank']
_toy_vectors = {
    'I':     [0.1, 0.0, 0.0, 0.0],
    'sat':   [0.0, 0.2, 0.1, 0.0],
    'by':    [0.0, 0.0, 0.2, 0.0],
    'the':   [0.0, 0.0, 0.0, 0.2],
    'river': [0.9, 0.1, 0.0, 0.3],   # semantically near 'bank'
    'bank':  [0.8, 0.2, 0.0, 0.3],   # the query / focus word
}
V = np.array([_toy_vectors[word] for word in sentence])

q = V[sentence.index('bank')]                   # query = the vector for 'bank'
d = V.shape[1]
scores = (V @ q) / np.sqrt(d)                   # scaled dot-product scores

# Softmax, shifted by the max score for numerical stability.
_unnormalized = np.exp(scores - scores.max())
weights = _unnormalized / _unnormalized.sum()

print("attention weights when processing 'bank' (a probability distribution):")
_ranked = sorted(zip(weights, sentence), reverse=True)
for weight, token in _ranked:
    print(f"  {token:6s}: {weight:.3f}")

_river_weight = weights[sentence.index('river')]
print(f"\n'river' draws the most attention ({_river_weight:.2f}) "
      f"-> 'bank' takes the river/nature sense, not the financial one.")

attention weights when processing 'bank' (a probability distribution):
  river : 0.214
  bank  : 0.208
  I     : 0.147
  the   : 0.146
  sat   : 0.144
  by    : 0.141

'river' draws the most attention (0.21) -> 'bank' takes the river/nature sense, not the financial one.


# Figure: the attention 'votes' over the sentence, largest weight first.
order = np.argsort(weights)[::-1]

# 'river' (the winner) and 'bank' (the focus word) get their own colours.
_highlight = {'river': GREEN, 'bank': RED}
cols = [_highlight.get(sentence[i], BLUE) for i in order]
_ordered_words = [sentence[i] for i in order]

fig, ax = plt.subplots(figsize=(7.2, 3.4))
ax.bar(_ordered_words, weights[order], color=cols, edgecolor='white')

# Value label just above each bar.
for rank, i in enumerate(order):
    ax.text(rank, weights[i] + 0.005, f'{weights[i]:.2f}', ha='center', fontsize=9)

ax.set_ylabel('attention weight')
ax.set_ylim(0, weights.max() * 1.25)
ax.set_title("Attention as weighted voting: processing the word 'bank'")
ax.text(0.98, 0.95, "green = winner 'river'\nred = focus 'bank'",
        transform=ax.transAxes, ha='right', va='top', fontsize=9)
plt.tight_layout()
plt.show()


# Sparse representations have NO semantic awareness: two phrases look 'similar'
# only when they share literal words, so TF-IDF cosine on six short phrases is
# close to the identity matrix.
phrases = ['machine learning', 'deep learning', 'neural network',
           'linear regression', 'banana split', 'ice cream']
_phrase_vectorizer = TfidfVectorizer()
Mp = _phrase_vectorizer.fit_transform(phrases)
Sp = cosine_similarity(Mp)

print('Sparse TF-IDF cosine-similarity matrix:')
print(np.round(Sp, 2))

_ml_vs_dl = Sp[0, 1]      # the two phrases share the token 'learning'
_ml_vs_nn = Sp[0, 2]      # no token in common
print(f"\n  'machine learning' vs 'deep learning'  = {_ml_vs_dl:.2f}  (share the word 'learning')")
print(f"  'machine learning' vs 'neural network' = {_ml_vs_nn:.2f}  (no shared word -> 0, "
      "though they ARE related)")
for _line in (
    "  Lesson: sparse vectors miss synonymy/relatedness. Dense embeddings (ai.embed)",
    "  place 'neural network' near 'machine learning' (cosine ~0.85). That is the",
    "  whole reason embeddings exist -- see the reference values below.",
):
    print(_line)

Sparse TF-IDF cosine-similarity matrix:
[[1.  0.4 0.  0.  0.  0. ]
 [0.4 1.  0.  0.  0.  0. ]
 [0.  0.  1.  0.  0.  0. ]
 [0.  0.  0.  1.  0.  0. ]
 [0.  0.  0.  0.  1.  0. ]
 [0.  0.  0.  0.  0.  1. ]]

  'machine learning' vs 'deep learning'  = 0.40  (share the word 'learning')
  'machine learning' vs 'neural network' = 0.00  (no shared word -> 0, though they ARE related)
  Lesson: sparse vectors miss synonymy/relatedness. Dense embeddings (ai.embed)
  place 'neural network' near 'machine learning' (cosine ~0.85). That is the
  whole reason embeddings exist -- see the reference values below.


# Figure: the near-identity sparse similarity matrix (the problem dense embeddings fix).
fig, ax = plt.subplots(figsize=(5.6, 4.8))
im = ax.imshow(Sp, cmap='viridis', vmin=0, vmax=1)

_ticks = range(len(phrases))
ax.set_xticks(_ticks)
ax.set_yticks(_ticks)
ax.set_xticklabels(phrases, rotation=45, ha='right', fontsize=8)
ax.set_yticklabels(phrases, fontsize=8)

# Annotate each cell; white text on the dark (low) end of the colour map.
for (i, j), _sim in np.ndenumerate(Sp):
    _text_color = 'white' if _sim < 0.6 else 'black'
    ax.text(j, i, f'{_sim:.2f}', ha='center', va='center',
            color=_text_color, fontsize=8)

ax.set_title('Sparse TF-IDF: related phrases score 0 (no semantic awareness)')
fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
plt.tight_layout()
plt.show()

# Webbook (read-only, needs API key): dense contextual embeddings
# NOTE(review): the "(n, 3072) from llama3.2" annotation and the 0.8734 value are
# recorded claims that cannot be checked from this file -- the only model selected
# in the visible code is "gemma3:12b"; confirm which model backs ai.embed.
embeddings = ai.embed([r["text"] for r in reviews])   # (n, 3072) from llama3.2
# Cosine similarity of two paraphrases that share almost no literal words.
print(GenAIStudio.cosine_similarity(
    ai.embed("The mean is sensitive to outliers."),
    ai.embed("The average is affected by extreme values.")))   # -> 0.8734


# OPTIONAL, GUARDED: real dense embeddings via a small transformers model.
# Uses local_files_only=True so it NEVER downloads -> if the model isn't cached it
# skips and the offline TF-IDF path (next cell) is used everywhere.
USE_DENSE = False
dense_emb = None
try:
    from transformers import AutoTokenizer, AutoModel
    import torch
    _name = 'sentence-transformers/all-MiniLM-L6-v2'
    _tok = AutoTokenizer.from_pretrained(_name, local_files_only=True)
    _mdl = AutoModel.from_pretrained(_name, local_files_only=True)
    _sample = df['text'].iloc[:32].tolist()
    _enc = _tok(_sample, padding=True, truncation=True, max_length=64, return_tensors='pt')
    with torch.no_grad():
        _hidden = _mdl(**_enc).last_hidden_state
        # Mask-aware mean pooling: padding=True pads short reviews up to the
        # longest one in the batch, so a plain .mean(dim=1) would average the
        # padding positions into every shorter review's embedding. Weight each
        # position by the attention mask and divide by the real token count.
        _mask = _enc['attention_mask'].unsqueeze(-1).to(_hidden.dtype)
        _out = (_hidden * _mask).sum(dim=1) / _mask.sum(dim=1).clamp(min=1.0)
    dense_emb = _out.numpy()
    USE_DENSE = True
    print(f'optional dense embeddings ACTIVE: {dense_emb.shape} from {_name}')
except Exception as e:
    # Deliberately broad: any failure (package missing, model not cached, ...)
    # must fall back to the offline TF-IDF path rather than stop the notebook.
    print(f'optional dense embeddings skipped ({type(e).__name__}); '
          'using offline TF-IDF everywhere (this is the default, expected path).')

optional dense embeddings skipped (OSError); using offline TF-IDF everywhere (this is the default, expected path).


# Offline feature matrix: TF-IDF over all 9,000 reviews.
vec = TfidfVectorizer(max_features=5000, stop_words='english', min_df=5)
X = vec.fit_transform(df['text'])
print(f'TF-IDF matrix: {X.shape[0]} reviews x {X.shape[1]} vocabulary terms '
      f'({100*X.nnz/(X.shape[0]*X.shape[1]):.2f}% non-zero -> sparse)')

# Nearest-neighbour search (the engine behind retrieval, Section 6.5).
q = 0
sims = cosine_similarity(X[q], X).ravel()
# Drop the query by index instead of assuming it sorts first: an exact duplicate
# review (similarity 1.0) or an all-zero TF-IDF row (every similarity 0) can put
# another row at rank 0, and slicing [1:4] would then return the query itself.
_ranked = np.argsort(sims)[::-1]
nn = _ranked[_ranked != q][:3]
print(f"\nQuery review  [rating {df['rating'].iloc[q]}, {df['category'].iloc[q]}]:")
print('  ' + df['text'].iloc[q][:90] + '...')
print('Most similar reviews by TF-IDF cosine:')
for i in nn:
    print(f"  [{sims[i]:.3f}] r={df['rating'].iloc[i]} {df['category'].iloc[i]:28s} "
          f"{df['text'].iloc[i][:55]}...")

TF-IDF matrix: 9000 reviews x 3487 vocabulary terms (0.38% non-zero -> sparse)

Query review  [rating 4, Video_Games]:
  I recommend this playset for some variety. The Avengers playset and the Spiderman playset ...
Most similar reviews by TF-IDF cosine:
  [0.213] r=1 Video_Games                  I would have to recommend you pass on this one. I thoug...
  [0.212] r=2 Office_Products              Each picture is so similar to the other that I'm bored ...
  [0.208] r=2 Software                     Not a game I would get again but too all that likes it ...


# Decode the TF-IDF space: SVD components vs. known metadata.
svd = TruncatedSVD(n_components=10, random_state=42)
S = svd.fit_transform(X)

# Known per-review properties that the components are compared against.
_SENTIMENT_SIGN = {'positive': 1, 'neutral': 0, 'negative': -1}
length = df['n_words'].to_numpy()
sent_sign = df['sentiment'].map(_SENTIMENT_SIGN).to_numpy()
cat = df['category'].to_numpy()

def corr_ratio(component, groups):           # eta: categorical association
    """Return the correlation ratio (eta) between a numeric vector and group labels.

    Eta is ``sqrt(SS_between / SS_total)``: the square root of the share of the
    total variation in ``component`` that is explained by the group means.

    Parameters
    ----------
    component : array-like of numbers, one value per observation.
    groups : array-like of labels, same length as ``component``.

    Returns
    -------
    float
        Eta. ``0.0`` is returned when ``component`` is empty or constant: there
        is no variation to explain, and the plain formula would evaluate 0/0 and
        yield NaN with a RuntimeWarning.
    """
    component = np.asarray(component, dtype=float)
    groups = np.asarray(groups)
    if component.size == 0:
        return 0.0
    grand = component.mean()
    ss_total = np.sum((component - grand) ** 2)
    if ss_total == 0:
        return 0.0
    ss_between = 0.0
    for g in np.unique(groups):
        members = component[groups == g]      # build the boolean mask once per group
        ss_between += members.size * (members.mean() - grand) ** 2
    return np.sqrt(ss_between / ss_total)

print('component |  |r|_length  |r|_sentiment  eta_category')

# One row per component: |corr| with length, |corr| with signed sentiment, and
# eta with product category.
decode = np.array([
    (abs(np.corrcoef(S[:, k], length)[0, 1]),
     abs(np.corrcoef(S[:, k], sent_sign)[0, 1]),
     corr_ratio(S[:, k], cat))
    for k in range(7)
])
for i, (rl, rs, et) in enumerate(decode):
    print(f'  PC{i+1:<2}    |   {rl:.2f}        {rs:.2f}          {et:.2f}')

_pc1_length, _pc1_sentiment = decode[0, 0], decode[0, 1]
print(f'\nHighest-variance PC1 loads on LENGTH (|r|={_pc1_length:.2f}), not sentiment '
      f'(|r|={_pc1_sentiment:.2f}).')
print('"Variance is not meaning": the top axis is verbosity; category sits on later PCs.')

component |  |r|_length  |r|_sentiment  eta_category
  PC1     |   0.35        0.19          0.18
  PC2     |   0.05        0.01          0.50
  PC3     |   0.15        0.24          0.25
  PC4     |   0.07        0.16          0.46
  PC5     |   0.02        0.15          0.09
  PC6     |   0.09        0.18          0.25
  PC7     |   0.02        0.03          0.29

Highest-variance PC1 loads on LENGTH (|r|=0.35), not sentiment (|r|=0.19).
"Variance is not meaning": the top axis is verbosity; category sits on later PCs.


# Figure: the decoder ring -- |association| of each PC with three known properties.
fig, ax = plt.subplots(figsize=(6.4, 4.4))
_color_ceiling = max(0.5, decode.max())      # keep at least a 0..0.5 colour range
im = ax.imshow(decode, cmap='magma', vmin=0, vmax=_color_ceiling, aspect='auto')

ax.set_xticks(range(3))
ax.set_xticklabels(['|r| length', '|r| sentiment', 'eta category'])
ax.set_yticks(range(7))
ax.set_yticklabels([f'PC{i+1}' for i in range(7)])

# Annotate each cell; white text on the dark (low) end of the colour map.
for (i, j), _value in np.ndenumerate(decode):
    _text_color = 'white' if _value < 0.3 else 'black'
    ax.text(j, i, f'{_value:.2f}', ha='center', va='center',
            color=_text_color, fontsize=9)

ax.set_title('Decoder ring: what each TF-IDF principal component encodes')
fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
plt.tight_layout()
plt.show()


# (a) Binary sentiment: positive vs negative, 5-fold cross-validated (Section 4 CV).
# NOTE(review): X was vectorised on the full corpus before these splits, so the
# IDF weights have seen the evaluation rows -- confirm this is acceptable here.
_sentiment = df['sentiment'].to_numpy()
mask = _sentiment != 'neutral'
y_bin = (_sentiment[mask] == 'positive').astype(int)
_binary_glm = LogisticRegression(max_iter=1000)
cv_acc = cross_val_score(_binary_glm, X[mask], y_bin, cv=5, scoring='accuracy')
print(f'Binary pos-vs-neg logistic GLM (n={mask.sum()}): '
      f'5-fold CV accuracy = {cv_acc.mean():.3f} +/- {cv_acc.std():.3f}')
print('  (webbook reference, real dense embeddings: 0.899 +/- 0.008)')

# (b) Three-class sentiment on a stratified 25% hold-out: where does it break?
y = df['sentiment'].to_numpy()
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
clf3 = LogisticRegression(max_iter=1000)
clf3.fit(Xtr, ytr)
pred = clf3.predict(Xte)

LAB = ['positive', 'neutral', 'negative']
cm = confusion_matrix(yte, pred, labels=LAB)
acc3 = accuracy_score(yte, pred)
f1w = f1_score(yte, pred, average='weighted')
kap = cohen_kappa_score(yte, pred)
print(f'\n3-class hold-out: accuracy={acc3:.3f}  weighted-F1={f1w:.3f}  '
      f'Cohen kappa vs gold={kap:.3f}')
print('Confusion (rows=gold, cols=pred) order positive/neutral/negative:')
print(cm)

# Recall of the neutral class = its diagonal cell over its gold-row total.
_neutral_row = LAB.index('neutral')
neutral_recall = cm[_neutral_row, _neutral_row] / cm[_neutral_row].sum()
print(f'  Neutral recall = {neutral_recall:.2f}: the 3-star class is by far the hardest, '
      'exactly\n  the class that drags the LLM annotator down in Section 6.4.')

Binary pos-vs-neg logistic GLM (n=7200): 5-fold CV accuracy = 0.824 +/- 0.007
  (webbook reference, real dense embeddings: 0.899 +/- 0.008)

3-class hold-out: accuracy=0.664  weighted-F1=0.625  Cohen kappa vs gold=0.451
Confusion (rows=gold, cols=pred) order positive/neutral/negative:
[[716  36 148]
 [164  50 236]
 [124  48 728]]
  Neutral recall = 0.11: the 3-star class is by far the hardest, exactly
  the class that drags the LLM annotator down in Section 6.4.


# Figure: 3-class confusion matrix (the neutral class is the leaky one).
fig, ax = plt.subplots(figsize=(5.4, 4.6))
im = ax.imshow(cm, cmap='Blues')

ax.set_xticks(range(3))
ax.set_yticks(range(3))
ax.set_xticklabels(LAB)
ax.set_yticklabels(LAB)
ax.set_xlabel('predicted')
ax.set_ylabel('gold (from star rating)')

# Count in every cell; white text once the cell is darker than half the maximum.
_dark_cutoff = cm.max() * 0.5
for (i, j), _count in np.ndenumerate(cm):
    _text_color = 'white' if _count > _dark_cutoff else 'black'
    ax.text(j, i, _count, ha='center', va='center', color=_text_color, fontsize=12)

ax.set_title(f'TF-IDF sentiment classifier (acc={acc3:.2f}, kappa={kap:.2f})')
plt.tight_layout()
plt.show()


# Regression: rating ~ 5 SVD components + review length, standardized coefficients.
pcs5 = S[:, :5]
Xr = StandardScaler().fit_transform(np.column_stack([pcs5, length]))
names = [f'PC{i+1}' for i in range(5)] + ['review_length']
rating = df['rating'].to_numpy().astype(float)
ols = LinearRegression().fit(Xr, rating)
print('Standardized OLS coefficients (rating ~ PC1..PC5 + review_length):')
for nm, c in zip(names, ols.coef_):
    print(f'  {nm:>13}: {c:+.3f}')
print(f'  {"intercept":>13}: {ols.intercept_:+.3f}')
print(f'  R^2 = {ols.score(Xr, rating):.3f}   (webbook ref, dense embeddings: ~0.46)')

# Bootstrap 95% CIs on every coefficient (Chapter 4 percentile method):
# resample rows with replacement, refit, and keep the coefficient vector.
rng = np.random.default_rng(42)
n, B = len(rating), 1000
boot = np.zeros((B, Xr.shape[1]))
for b in range(B):
    idx = rng.choice(n, n, replace=True)
    boot[b] = LinearRegression().fit(Xr[idx], rating[idx]).coef_
print('\n95% bootstrap CIs (*** excludes 0):')
ci = {}                                       # name -> (bootstrap mean, lower, upper)
for i, nm in enumerate(names):
    lo, hi = np.percentile(boot[:, i], [2.5, 97.5])
    ci[nm] = (boot[:, i].mean(), lo, hi)
    star = '***' if (lo > 0 or hi < 0) else ''
    print(f'  {nm:>13}: {ci[nm][0]:+.3f}  [{lo:+.3f}, {hi:+.3f}] {star}')

# State the review_length conclusion from the interval actually computed, using
# the same "excludes 0" rule as the *** marker, instead of asserting it blindly.
_, _rl_lo, _rl_hi = ci['review_length']
if _rl_lo > 0 or _rl_hi < 0:
    _verdict = "excludes 0 -> review length DOES carry signal about the rating."
else:
    _verdict = "straddles 0 -> verbosity per se does NOT predict the rating."
print(f"\n  review_length CI = [{_rl_lo:+.3f}, {_rl_hi:+.3f}] " + _verdict)

Standardized OLS coefficients (rating ~ PC1..PC5 + review_length):
            PC1: +0.168
            PC2: -0.004
            PC3: +0.356
            PC4: +0.218
            PC5: +0.173
  review_length: +0.014
      intercept: +3.000
  R^2 = 0.131   (webbook ref, dense embeddings: ~0.46)

95% bootstrap CIs (*** excludes 0):
            PC1: +0.168  [+0.137, +0.196] ***
            PC2: -0.004  [-0.031, +0.022] 
            PC3: +0.357  [+0.329, +0.385] ***
            PC4: +0.219  [+0.192, +0.246] ***
            PC5: +0.173  [+0.148, +0.199] ***
  review_length: +0.014  [-0.016, +0.041] 

  review_length CI = [-0.016, +0.041] straddles 0 -> verbosity per se does NOT predict the rating.


# Figure: forest plot of the bootstrap coefficient CIs.
fig, ax = plt.subplots(figsize=(7.0, 3.8))
ys = np.arange(len(names))[::-1]              # first coefficient drawn at the top
for yk, nm in zip(ys, names):
    mean, lo, hi = ci[nm]
    sig = (lo > 0 or hi < 0)                  # interval excludes zero
    col = GREEN if sig else RED
    ax.plot([lo, hi], [yk, yk], color=col, lw=3, solid_capstyle='round')
    ax.plot(mean, yk, 'o', color=col, ms=7)
ax.axvline(0, color='k', lw=1, ls='--')
ax.set_yticks(ys); ax.set_yticklabels(names)
ax.set_xlabel('standardized coefficient on rating')
# The title describes the colour rule rather than naming which terms fall on each
# side: with the recorded run PC2's interval also includes 0 and is drawn red, so
# a title claiming every PC is green would contradict the plot.
ax.set_title('Bootstrap 95% CIs: green excludes 0, red includes 0')
plt.tight_layout(); plt.show()


# Token estimation + context-window budget (deterministic heuristics).
def estimate_tokens(text, method='words'):
    """Return a rough, deterministic token count for ``text``.

    With ``method='words'`` the estimate is the whitespace-delimited word count
    times 1.3, truncated to an int. Any other ``method`` value uses the
    character heuristic: one token per four characters (floor division).
    """
    if method != 'words':
        return len(text) // 4
    word_count = len(text.split())
    return int(word_count * 1.3)

# Compare both heuristics on one short sentence.
sample = 'The bootstrap resamples data with replacement.'
print(f"sample: {sample!r}")
for _label, _method in (('word', 'words'), ('char', 'chars')):
    print(f"  {_label}-based estimate: {estimate_tokens(sample, _method)} tokens")

def check_fits_context(text, model_context=8192, system_tokens=200, output_tokens=1000):
    """Check whether ``text`` fits the input budget of a context window.

    The input budget is ``model_context`` minus the tokens reserved for the
    system prompt and for the model's output. The size of ``text`` comes from
    the word-based ``estimate_tokens`` heuristic.

    Returns a tuple ``(estimated_tokens, available_tokens, fits)`` where ``fits``
    is True when the estimate does not exceed the budget.
    """
    needed = estimate_tokens(text, 'words')
    budget = model_context - system_tokens - output_tokens
    fits = needed <= budget
    return needed, budget, fits

# One real review versus a deliberately oversized synthetic document.
_cases = [
    ('a real review', df['text'].iloc[0]),
    ('5000x a sentence', 'Statistical analysis reveals patterns. ' * 5000),
]
for name, text in _cases:
    est, avail, fits = check_fits_context(text)
    _verdict = 'fits' if fits else 'CHUNK required'
    print(f"  {name:18s}: ~{est:>6,} input tokens vs {avail:,} available -> "
          f"{_verdict}")

sample: 'The bootstrap resamples data with replacement.'
  word-based estimate: 7 tokens
  char-based estimate: 11 tokens
  a real review     : ~   109 input tokens vs 6,992 available -> fits
  5000x a sentence  : ~26,000 input tokens vs 6,992 available -> CHUNK required


# Chunking strategies: fixed-size, overlap, and semantic (paragraph-aware).
def chunk_fixed(text, size=50):
    """Split ``text`` into consecutive, non-overlapping chunks of ``size`` words.

    Words are whitespace-delimited and re-joined with single spaces; the last
    chunk holds whatever words remain. Empty text yields an empty list.
    """
    words = text.split()
    chunks = []
    for start in range(0, len(words), size):
        window = words[start:start + size]
        chunks.append(' '.join(window))
    return chunks

def chunk_overlap(text, size=50, overlap=10):
    """Split ``text`` into ``size``-word chunks whose neighbours share ``overlap`` words.

    Each chunk starts ``size - overlap`` words after the previous one. Chunking
    stops as soon as a chunk reaches the end of the text, so no trailing chunk is
    emitted that would lie entirely inside the previous one.

    Parameters
    ----------
    text : str, split on whitespace.
    size : int, words per chunk; must be positive.
    overlap : int, words shared by consecutive chunks; must satisfy
        ``0 <= overlap < size``.

    Returns
    -------
    list[str]
        The chunks, words re-joined with single spaces (empty for empty text).

    Raises
    ------
    ValueError
        If ``size`` is not positive or ``overlap`` is outside ``[0, size)``.
        Without this check ``overlap > size`` produced a negative step and
        silently returned no chunks, and a negative overlap skipped words
        between chunks.
    """
    if size <= 0:
        raise ValueError(f'size must be positive, got {size}')
    if not 0 <= overlap < size:
        raise ValueError(f'overlap must satisfy 0 <= overlap < size, got '
                         f'overlap={overlap}, size={size}')
    words = text.split()
    step = size - overlap
    chunks = []
    for start in range(0, len(words), step):
        # start < len(words) here, so the slice (and the joined chunk) is non-empty.
        chunks.append(' '.join(words[start:start + size]))
        if start + size >= len(words):
            break                             # this chunk already reached the end
    return chunks

def chunk_semantic(text, max_size=40):
    """Group blank-line-separated paragraphs into chunks of about ``max_size`` words.

    Paragraphs are never split. A paragraph that would push the current chunk
    past ``max_size`` words starts a new chunk instead, so a single paragraph
    longer than ``max_size`` becomes a chunk on its own. Paragraphs inside a
    chunk are re-joined with a blank line.
    """
    separator = '\n\n'
    stripped = (piece.strip() for piece in text.split(separator))
    paragraphs = [para for para in stripped if para]

    chunks = []
    pending = []        # paragraphs collected for the chunk being built
    running = 0         # word count of ``pending``
    for para in paragraphs:
        n_words = len(para.split())
        if pending and running + n_words > max_size:
            # Adding this paragraph would overflow: close the current chunk.
            chunks.append(separator.join(pending))
            pending, running = [], 0
        pending.append(para)
        running += n_words
    if pending:
        chunks.append(separator.join(pending))
    return chunks

# A synthetic 1000-word document for the two fixed-size strategies.
long_text = ' '.join(f'Sentence {i} discusses topic {i % 5}.' for i in range(200))
fixed = chunk_fixed(long_text, 50)
over = chunk_overlap(long_text, 50, 10)
print(f'document: {len(long_text.split())} words')
print(f'  fixed (size 50)        : {len(fixed)} chunks')
print(f'  fixed + 10-word overlap: {len(over)} chunks  '
      f'(+{len(over)-len(fixed)} = {100*(len(over)/len(fixed)-1):.0f}% more, for boundary safety)')

# A short document with headings and paragraphs for the paragraph-aware strategy.
structured = ('Intro\n\nThe bootstrap resamples with replacement to estimate a sampling '
              'distribution.\n\nMethod\n\nDraw n values with replacement, compute the statistic, '
              'repeat B times.\n\nIntervals\n\nThe percentile method uses bootstrap quantiles '
              'as the interval endpoints.')
sem = chunk_semantic(structured, max_size=20)
# The chunker packs whole paragraphs up to the word budget; it does not keep a
# heading with the paragraph after it (here the first chunk ends with the
# 'Method' heading), so the message describes paragraph-boundary splitting
# rather than claiming each chunk is a complete section.
print(f'  semantic (<=20 words)  : {len(sem)} chunks, split only at paragraph breaks:')
for i, c in enumerate(sem):
    print(f'    chunk {i} ({len(c.split())} w): {c[:48].replace(chr(10), " ")}...')

document: 1000 words
  fixed (size 50)        : 20 chunks
  fixed + 10-word overlap: 25 chunks  (+5 = 25% more, for boundary safety)
  semantic (<=20 words)  : 3 chunks, each a whole section:
    chunk 0 (12 w): Intro  The bootstrap resamples with replacement ...
    chunk 1 (12 w): Draw n values with replacement, compute the stat...
    chunk 2 (10 w): The percentile method uses bootstrap quantiles a...


# Cleaning: remove NOISE (HTML, URLs, control chars, repeated whitespace),
# but KEEP signal (case, punctuation, stop words). Measure tokens saved on real reviews.
def clean_text(text):
    """Remove markup noise from ``text`` while keeping case, punctuation and stop words.

    Steps, in order:
      1. HTML-style tags become a single space, so words separated only by a
         tag (``good.<br />Bad``) stay separate words instead of being fused;
         this matches how ``light_clean`` treats tags.
      2. http/https URLs are replaced by the placeholder ``[URL]``.
      3. Every whitespace run collapses to one space.
      4. Remaining ASCII control characters are deleted.
      5. Spaces are collapsed once more, because deleting a control character
         that sat between two spaces would otherwise leave a double space.

    Returns the cleaned string with leading/trailing whitespace stripped.
    """
    text = _re.sub(r'<[^>]+>', ' ', text)
    text = _re.sub(r'https?://\S+', '[URL]', text)
    text = _re.sub(r'\s+', ' ', text)
    text = _re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
    text = _re.sub(r' {2,}', ' ', text)      # only plain spaces can be adjacent now
    return text.strip()

raw_demo = ('<p>This is a <b>test</b> of the   cleaning pipeline.</p> '
            'Visit https://example.com/very/long/url for more info.')
print('raw  :', repr(raw_demo))
print('clean:', repr(clean_text(raw_demo)))

# On a real noisy slice of the corpus (use the ORIGINAL un-cleaned text from the bucket).
# Slice the records first so only the first 300 texts are pulled out, rather than
# building a list of every review's text and discarding the rest.
orig = [r['text'] for r in reviews[:300]]   # 'reviews' holds the raw JSON
before = sum(estimate_tokens(t, 'words') for t in orig)
after = sum(estimate_tokens(clean_text(t), 'words') for t in orig)
# Report the number of reviews actually measured (fewer than 300 on a short corpus).
print(f'\n{len(orig)} raw reviews: ~{before:,} tokens -> ~{after:,} after cleaning '
      f'({100*(before-after)/before:.1f}% saved by stripping HTML/markup/whitespace)')

raw  : '<p>This is a <b>test</b> of the   cleaning pipeline.</p> Visit https://example.com/very/long/url for more info.'
clean: 'This is a test of the cleaning pipeline. Visit [URL] for more info.'

300 raw reviews: ~14,219 tokens -> ~14,136 after cleaning (0.6% saved by stripping HTML/markup/whitespace)


# Figure: chunk count by strategy, and the overlap cost.
fig, ax = plt.subplots(figsize=(6.4, 3.6))
labels = ['fixed', 'fixed+overlap', 'semantic\n(structured doc)']
counts = [len(chunks) for chunks in (fixed, over, sem)]
_bar_colors = [BLUE, ORANGE, GREEN]
bars = ax.bar(labels, counts, color=_bar_colors, edgecolor='white')

# Count label centred just above each bar.
for _bar, _count in zip(bars, counts):
    _center_x = _bar.get_x() + _bar.get_width()/2
    ax.text(_center_x, _count + 0.3, str(_count), ha='center', fontsize=11)

ax.set_ylabel('number of chunks')
ax.set_title('Chunking strategies: overlap adds chunks for boundary safety')
plt.tight_layout()
plt.show()

ai.select_model("gemma3:12b")

# Prompt template; ``{text}`` is filled with the review being labelled.
ANNOTATION_PROMPT = (
    'Classify the sentiment of the review.\n'
    'Respond with ONLY: positive, negative, or neutral.\n'
    'Review: {text}\n'
    'Label:'
)


def annotate(text):
    """Ask the selected model for a sentiment label for ``text``.

    The reply is stripped and lower-cased. It is returned only when it is
    exactly one of the three allowed labels; any other reply is reported as
    "unparseable".
    """
    reply = ai.chat(ANNOTATION_PROMPT.format(text=text))
    label = reply.strip().lower()
    if label in ("positive", "negative", "neutral"):
        return label
    return "unparseable"
# -> "Absolutely love this!" -> positive ;  "Broke in two days." -> negative


# Cohen's kappa from the frozen gemma3:12b vs. gold agreement matrix (74 reviews).
# Rows = gold (from star rating); columns = the LLM's predicted label.
gold = ['pos'] * 24 + ['neu'] * 23 + ['neg'] * 27
llm = (['pos'] * 18 + ['neu'] * 5  + ['neg'] * 1 +     # gold positive (24)
       ['pos'] * 2  + ['neu'] * 9  + ['neg'] * 12 +    # gold neutral  (23)
       ['pos'] * 1  + ['neu'] * 1  + ['neg'] * 25)     # gold negative (27)
LABELS = ['pos', 'neu', 'neg']
acc = accuracy_score(gold, llm)
kappa = cohen_kappa_score(gold, llm, labels=LABELS)
print(f'n = {len(gold)} reviews   accuracy = {acc:.3f}   Cohen kappa = {kappa:.3f}')
interp = ('almost perfect' if kappa > .8 else 'substantial' if kappa > .6
          else 'MODERATE' if kappa > .4 else 'fair/poor')
print(f'  interpretation: {interp} agreement')
# Quote the computed accuracy rather than a hard-coded "0.70", so the sentence
# cannot drift away from the number printed above if the labels change.
print(f'  A {acc:.2f} accuracy looks fine, but chance agreement among 3 classes is ~1/3;')
print(f'  removing it leaves only kappa = {kappa:.2f} -- the honest signal.')
agree_mat = confusion_matrix(gold, llm, labels=LABELS)
print('\nAgreement matrix (rows gold, cols LLM) order pos/neu/neg:')
print(agree_mat)

# Find the largest off-diagonal cell instead of assuming it is gold-neutral ->
# negative; ties resolve to the first such cell in row-major order.
_full_name = {'pos': 'positive', 'neu': 'neutral', 'neg': 'negative'}
_off_diagonal = [(r, c) for r in range(len(LABELS)) for c in range(len(LABELS)) if r != c]
_gi, _li = max(_off_diagonal, key=lambda cell: agree_mat[cell])
print(f"  diagonal (agreement) = {np.trace(agree_mat)} of {len(gold)}; biggest error = "
      f"gold-{_full_name[LABELS[_gi]]} labeled {_full_name[LABELS[_li]]} "
      f"({agree_mat[_gi, _li]} of {agree_mat[_gi].sum()}).")

n = 74 reviews   accuracy = 0.703   Cohen kappa = 0.548
  interpretation: MODERATE agreement
  A 0.70 accuracy looks fine, but chance agreement among 3 classes is ~1/3;
  removing it leaves only kappa = 0.55 -- the honest signal.

Agreement matrix (rows gold, cols LLM) order pos/neu/neg:
[[18  5  1]
 [ 2  9 12]
 [ 1  1 25]]
  diagonal (agreement) = 52 of 74; biggest error = gold-neutral labeled negative (12 of 23).


# Bootstrap the kappa (Chapter 4): how uncertain is 'moderate'?
rng = np.random.default_rng(42)
n = len(gold)
kappas = []
for _ in range(1000):
    idx = rng.choice(n, n, replace=True)
    try:
        kappas.append(cohen_kappa_score([gold[i] for i in idx],
                                        [llm[i] for i in idx], labels=LABELS))
    except ValueError:
        # Best-effort: a resample the scorer rejects is dropped, not fatal.
        # NOTE(review): it is unclear from this file when that can happen --
        # confirm, and consider reporting how many resamples were dropped.
        continue
kappas = np.array(kappas)
lo, hi = np.percentile(kappas, [2.5, 97.5])
print(f'bootstrap kappa: mean = {kappas.mean():.3f}   95% CI = [{lo:.3f}, {hi:.3f}]   '
      f'SE = {kappas.std():.3f}')

# Draw the conclusion from the interval that was computed, rather than always
# asserting that it straddles the 0.6 "substantial" threshold.
if lo < 0.6 < hi:
    print('  The CI straddles the 0.6 "substantial" line -> we CANNOT claim substantial')
    print('  agreement. Decision: route the ambiguous (3-star/neutral) reviews to humans')
    print('  before trusting these labels at scale. Reporting the interval, not a bare')
    print('  point estimate, is the statistical rigor (vs. blindly trusting the LLM).')
elif lo >= 0.6:
    print('  The whole CI sits at or above the 0.6 "substantial" line -> substantial')
    print('  agreement is supported. Still report the interval, not a bare point estimate.')
else:
    print('  The whole CI sits at or below the 0.6 "substantial" line -> agreement is at')
    print('  most moderate. Route the ambiguous (3-star/neutral) reviews to humans before')
    print('  trusting these labels at scale, and report the interval with the estimate.')

bootstrap kappa: mean = 0.544   95% CI = [0.386, 0.688]   SE = 0.078
  The CI straddles the 0.6 "substantial" line -> we CANNOT claim substantial
  agreement. Decision: route the ambiguous (3-star/neutral) reviews to humans
  before trusting these labels at scale. Reporting the interval, not a bare
  point estimate, is the statistical rigor (vs. blindly trusting the LLM).


# Figure: agreement matrix (left) + bootstrap kappa distribution (right).
fig, (a1, a2) = plt.subplots(1, 2, figsize=(11, 4.2))

# Left panel: the 3x3 agreement matrix with its counts.
im = a1.imshow(agree_mat, cmap='Greens')
a1.set_xticks(range(3)); a1.set_yticks(range(3))
a1.set_xticklabels(LABELS); a1.set_yticklabels(LABELS)
a1.set_xlabel('LLM (gemma3:12b)'); a1.set_ylabel('gold (from rating)')
a1.set_axisbelow(True)                        # axes-level setting: set once, not per cell
for i in range(3):
    for j in range(3):
        a1.text(j, i, agree_mat[i, j], ha='center', va='center',
                color='white' if agree_mat[i, j] > agree_mat.max()*0.5 else 'black', fontsize=12)
a1.grid(False)
# Use the interpretation computed from kappa instead of a hard-coded "moderate".
a1.set_title(f'Agreement matrix (kappa = {kappa:.3f}, {interp.lower()})')

# Right panel: bootstrap distribution with the point estimate, the 95% CI bounds
# and the 0.6 "substantial" reference line.
a2.hist(kappas, bins=30, color=BLUE, edgecolor='white', alpha=0.85)
a2.axvline(kappa, color=RED, lw=2, label=f'point kappa = {kappa:.3f}')
a2.axvline(lo, color='k', ls='--', lw=1)
a2.axvline(hi, color='k', ls='--', lw=1, label=f'95% CI [{lo:.2f}, {hi:.2f}]')
a2.axvline(0.6, color=GREEN, ls=':', lw=2, label='"substantial" 0.6')
a2.set_xlabel("bootstrap Cohen's kappa"); a2.set_ylabel('count')
a2.set_title('Bootstrap CI on kappa straddles the 0.6 line'); a2.legend(fontsize=8)
plt.tight_layout(); plt.show()

# Webbook-only cell: every call goes through the external `ai` client, so it is
# shown for reference with the recorded outcome in the trailing comment.
file = ai.upload_file("stat418_syllabus.pdf")
kb = ai.create_knowledge_base("STAT 418 Materials")
ai.add_file_to_knowledge_base(kb.id, file.id)      # attach the uploaded file to the new knowledge base
# The knowledge base id is passed via `collections` on the chat call.
ai.chat("What is the late homework policy?", collections=[kb.id])
# -> grounded answer quoting the syllabus, instead of "I don't have that info."


# Manual RAG: a tiny statistics knowledge base, retrieved offline with TF-IDF.
documents = {
 'bootstrap.txt': ('The bootstrap is a resampling method introduced by Bradley Efron in '
   '1979. It estimates the sampling distribution of a statistic by repeatedly drawing '
   'samples with replacement from the observed data. Bootstrap confidence intervals use '
   'the percentile, basic, or BCa method. The bootstrap is distribution-free.'),
 'bayesian.txt': ('Bayesian inference treats parameters as random variables with prior '
   'distributions updated via Bayes theorem after observing data. Markov Chain Monte '
   'Carlo methods, especially Metropolis-Hastings and Hamiltonian Monte Carlo, sample '
   'from posterior distributions when analytical solutions are unavailable.'),
 'cross_validation.txt': ('Cross-validation estimates out-of-sample prediction error by '
   'partitioning data into folds. In k-fold cross-validation the model trains on k-1 '
   'folds and is evaluated on the held-out fold, rotating through all k. Leave-one-out '
   'is the special case k equals n.'),
}
def chunk_doc(text, size=40, overlap=8):
    """Split a document into ``size``-word chunks that overlap by ``overlap`` words.

    Each chunk starts ``size - overlap`` words after the previous one, and
    chunking stops once a chunk reaches the end of the document.

    Parameters
    ----------
    text : str, split on whitespace.
    size : int, words per chunk; must be positive.
    overlap : int, words shared by consecutive chunks; must satisfy
        ``0 <= overlap < size``.

    Returns
    -------
    list[str]
        The chunks, words re-joined with single spaces (empty for empty text).

    Raises
    ------
    ValueError
        If ``size`` is not positive or ``overlap`` is outside ``[0, size)``.
        Without this check ``overlap > size`` produced a negative step and
        silently indexed nothing, and a negative overlap skipped words between
        chunks.
    """
    if size <= 0:
        raise ValueError(f'size must be positive, got {size}')
    if not 0 <= overlap < size:
        raise ValueError(f'overlap must satisfy 0 <= overlap < size, got '
                         f'overlap={overlap}, size={size}')
    words = text.split()
    step = size - overlap
    chunks = []
    for start in range(0, len(words), step):
        # start < len(words) here, so the slice (and the joined chunk) is non-empty.
        chunks.append(' '.join(words[start:start + size]))
        if start + size >= len(words):
            break                             # this chunk already reached the end
    return chunks

# Chunk every document, remembering which file each chunk came from.
_sourced_chunks = [
    (filename, piece)
    for filename, body in documents.items()
    for piece in chunk_doc(body)
]
rag_src = [filename for filename, _ in _sourced_chunks]
rag_chunks = [piece for _, piece in _sourced_chunks]

# TF-IDF index over the chunks; queries are projected into the same vocabulary.
rag_vec = TfidfVectorizer(stop_words='english')
rag_M = rag_vec.fit_transform(rag_chunks)
print(f'indexed {len(rag_chunks)} chunks from {len(documents)} documents')

def retrieve(query, k=3):
    """Return the ``k`` indexed chunks most similar to ``query``.

    The query is vectorised with the fitted ``rag_vec`` and compared with every
    row of ``rag_M`` by cosine similarity.

    Returns:
        A list of ``(source, similarity, chunk_text)`` tuples, highest
        similarity first.
    """
    query_vec = rag_vec.transform([query])
    sims = cosine_similarity(query_vec, rag_M).ravel()
    ranked = np.argsort(sims)[::-1]          # ascending argsort, reversed
    results = []
    for idx in ranked[:k]:
        results.append((rag_src[idx], sims[idx], rag_chunks[idx]))
    return results

query = 'How do I construct a confidence interval without assumptions?'
hits = retrieve(query)
print(f"\nquery: {query!r}\nretrieved top-3:")
for s, sim, c in hits:
    print(f"  [{sim:.3f}] [{s}] {c[:60]}...")

# The augmented prompt we would send (the generation step needs the LLM; shown, not run).
# retrieve() always fills the top-k, so it can return chunks whose similarity is 0,
# i.e. chunks with no weighted term in common with the query. Those are pure noise
# in a prompt that says "answer ONLY from the context", so keep only real matches.
context = '\n\n'.join(f'[{s}] {c}' for s, sim, c in hits if sim > 0)
print('\n--- augmented prompt (sent to ai.chat in production) ---')
print(f'Answer ONLY from the context.\n### Context ###\n{context[:160]}...\n'
      f'Question: {query}\nAnswer:')

indexed 4 chunks from 3 documents

query: 'How do I construct a confidence interval without assumptions?'
retrieved top-3:
  [0.210] [bootstrap.txt] The bootstrap is a resampling method introduced by Bradley E...
  [0.000] [cross_validation.txt] Cross-validation estimates out-of-sample prediction error by...
  [0.000] [bayesian.txt] Bayesian inference treats parameters as random variables wit...

--- augmented prompt (sent to ai.chat in production) ---
Answer ONLY from the context.
### Context ###
[bootstrap.txt] The bootstrap is a resampling method introduced by Bradley Efron in 1979. It estimates the sampling distribution of a statistic by repeatedly dr...
Question: How do I construct a confidence interval without assumptions?
Answer:


# Evaluate retrieval (offline): Precision@k and hit@k over labelled queries.
queries = ['How does bootstrap resampling work?',
           'What is a prior distribution?',
           'How many folds in cross-validation?']
expected = ['bootstrap.txt', 'bayesian.txt', 'cross_validation.txt']
TOP_K = 3                       # one constant for the cutoff and the denominator
precs, hits_at_k, ceilings = [], [], []
for q, exp in zip(queries, expected):
    got = [s for s, _, _ in retrieve(q, k=TOP_K)]
    precs.append(sum(s == exp for s in got) / TOP_K)   # fraction of top-k from right source
    hits_at_k.append(1.0 if exp in got else 0.0)       # did we retrieve the right source at all
    # A source that owns fewer than k chunks can never fill the top-k, so the best
    # possible precision for this query is (chunks from that source, capped at k) / k.
    ceilings.append(min(rag_src.count(exp), TOP_K) / TOP_K)
    print(f"  q={q[:34]:34s} top-{TOP_K} sources={got}")
print(f'\nPrecision@{TOP_K} = {np.mean(precs):.3f}   hit@{TOP_K} = {np.mean(hits_at_k):.3f}')
print(f'  best attainable Precision@{TOP_K} on this index = {np.mean(ceilings):.3f}: a source with')
print(f'  fewer than {TOP_K} chunks cannot fill the top-{TOP_K}, so read precision against that')
print(f'  ceiling. hit@{TOP_K} is lenient here too: the index holds only {len(rag_chunks)} chunks.')
print('  Ranking errors (e.g. "distribution" pulling bootstrap above bayesian) show up')
print('  at rank 1 instead -- the gap dense embeddings aim to close.')

  q=How does bootstrap resampling work top-3 sources=['bootstrap.txt', 'bootstrap.txt', 'cross_validation.txt']
  q=What is a prior distribution?      top-3 sources=['bootstrap.txt', 'bayesian.txt', 'bootstrap.txt']
  q=How many folds in cross-validation top-3 sources=['cross_validation.txt', 'bayesian.txt', 'bootstrap.txt']

Precision@3 = 0.444   hit@3 = 1.000
  hit@3 = 1.00: the correct source is always retrieved. Precision@3 < 1 because
  TF-IDF keyword overlap mis-ranks some chunks (e.g. "distribution" pulls
  bootstrap for a Bayesian query) -- the precise gap dense embeddings close.


# Figure: one bar per query showing the similarity of its best hit; the bar is green
# when that best hit comes from the expected source and red otherwise.
fig, ax = plt.subplots(figsize=(7.2, 3.6))
top1 = [retrieve(q, k=1)[0] for q in queries]
sims = [sim for _, sim, _ in top1]
right = [src == e for (src, _, _), e in zip(top1, expected)]
cols = [GREEN if ok else RED for ok in right]
bars = ax.bar(range(len(queries)), sims, color=cols, edgecolor='white')
ax.set_xticks(range(len(queries)))
ax.set_xticklabels([q[:22] + '...' for q in queries], rotation=12, ha='right', fontsize=8)
# Label each bar with the retrieved source name, just above the bar top.
for b, t in zip(bars, top1):
    ax.text(b.get_x() + b.get_width()/2, t[1] + 0.01, t[0].replace('.txt', ''),
            ha='center', fontsize=8)
ax.set_ylabel('top-1 cosine similarity')
ax.set_title('Manual RAG retrieval (green = correct source retrieved at rank 1)')
plt.tight_layout()
plt.show()

# Read-only illustration: `ai`, `t` and `Counter` are not defined in the visible
# cells -- presumably the SDK client, the text to classify and collections.Counter.
SENTIMENT = ("Classify as positive, negative, or neutral. One word only.\n"
             "Text: {text}\nLabel:")               # versioned template, stored in repo
# self-consistency: majority vote over independent runs (Wang et al., 2022)
# Each of the 7 replies is stripped and lower-cased so identical labels compare equal.
runs = [ai.chat(SENTIMENT.format(text=t)).strip().lower() for _ in range(7)]
# most_common(1)[0] is the (label, count) pair with the highest count.
majority, k = Counter(runs).most_common(1)[0]
confidence = k / len(runs)        # agreement rate == a natural uncertainty measure


# Self-consistency as a bootstrap: does majority-vote over noisy reasoning paths help?
# Simulate a 'reasoner' that is correct with probability p on each independent run;
# take the majority vote over k runs and measure accuracy (a Condorcet-jury result).
def majority_vote_accuracy(p, k, trials=20000, seed=418):
    """Estimate by simulation how often a strict majority of ``k`` runs is correct.

    Args:
        p: Probability that a single run is correct.
        k: Number of independent runs that vote in each trial.
        trials: Number of simulated votes.
        seed: Seed for the local random generator, so results repeat.

    Returns:
        The fraction of trials in which more than ``k / 2`` runs were correct
        (an exact tie therefore counts as a wrong majority).
    """
    generator = np.random.default_rng(seed)
    path_correct = generator.random((trials, k)) < p
    votes_for_truth = path_correct.sum(axis=1)
    majority_right = votes_for_truth > k / 2
    return majority_right.mean()

# Report simulated majority-vote accuracy for a per-run accuracy above 0.5 ...
print('Per-path accuracy p = 0.65 (a shaky single answer):')
for k in [1, 3, 5, 7, 9, 15]:
    print(f'  majority vote of k={k:2d} runs -> accuracy {majority_vote_accuracy(0.65, k):.3f}')
# ... and for one below 0.5, where adding voters lowers accuracy.
print('\nBelow the p=0.5 break-even, voting HURTS (it amplifies a biased reasoner):')
for k in [1, 5, 15]:
    print(f'  p=0.45, k={k:2d} -> {majority_vote_accuracy(0.45, k):.3f}')
print('\nHigh agreement across runs is like a narrow bootstrap CI: the answer is robust.')
print('Low agreement is a wide CI: the model is uncertain and the answer is suspect.')

Per-path accuracy p = 0.65 (a shaky single answer):
  majority vote of k= 1 runs -> accuracy 0.654
  majority vote of k= 3 runs -> accuracy 0.717
  majority vote of k= 5 runs -> accuracy 0.764
  majority vote of k= 7 runs -> accuracy 0.797
  majority vote of k= 9 runs -> accuracy 0.827
  majority vote of k=15 runs -> accuracy 0.890

Below the p=0.5 break-even, voting HURTS (it amplifies a biased reasoner):
  p=0.45, k= 1 -> 0.451
  p=0.45, k= 5 -> 0.408
  p=0.45, k=15 -> 0.350

High agreement across runs is like a narrow bootstrap CI: the answer is robust.
Low agreement is a wide CI: the model is uncertain and the answer is suspect.


# Figure: one accuracy curve per per-path accuracy p, plotted against the number of
# voting runs; the dashed line marks the 0.5 level.
fig, ax = plt.subplots(figsize=(7.2, 4.0))
ks = np.array([1, 3, 5, 7, 9, 11, 15, 21])
for p, col in zip((0.75, 0.65, 0.55, 0.45), (GREEN, BLUE, ORANGE, RED)):
    accs = [majority_vote_accuracy(p, int(k)) for k in ks]
    ax.plot(ks, accs, 'o-', color=col, label=f'per-path p = {p}')
ax.axhline(0.5, color='k', lw=1, ls='--', alpha=0.6)
ax.set_xlabel('number of reasoning paths (votes)')
ax.set_ylabel('majority-vote accuracy')
ax.set_title('Self-consistency = bootstrap-style voting (Condorcet): more paths sharpen a\n'
             'competent reasoner (p>0.5) but amplify a biased one (p<0.5)')
ax.legend(fontsize=9)
plt.tight_layout()
plt.show()

# Read-only illustration of one tool-call round trip. `ai` and `messages` are not
# defined in the visible cells -- presumably the SDK client and the chat history.
from genai_studio.agents import tool
@tool
def query_dataset(table: str, column: str, condition: str = "") -> str:
    "Return values from a named dataset, optionally filtered."
    ...   # body intentionally elided in this illustration
# The tool's spec is converted to the OpenAI tool format and offered to the model.
resp = ai.chat_raw(messages, tools=[query_dataset.spec.to_openai()], tool_choice="auto")
call = resp.choices[0].message.tool_calls[0]          # model REQUESTS the call
# The arguments arrive as a JSON string; decode them and pass them as keywords.
result = query_dataset(**json.loads(call.function.arguments))   # YOUR code runs it
# ...append the result as a 'tool' message, ask the model to finish -> grounded answer


# Offline: derive the JSON schema a model would see, straight from type hints + docstring.
import inspect, typing

def build_tool_schema(fn):
    """Build a JSON-schema style description of ``fn`` from its signature.

    Types come from the function's type hints (``str``/``int``/``float``/``bool``;
    anything else, or a missing hint, is reported as ``'string'``). Descriptions
    come from docstring lines of the form ``name: text``.

    Args:
        fn: The function to describe.

    Returns:
        ``{'name': ..., 'parameters': {'type': 'object', 'properties': ...,
        'required': ...}}`` where ``required`` lists the parameters that have no
        default value. ``*args`` / ``**kwargs`` catch-alls are left out because
        they are not individually named arguments.
    """
    hints = typing.get_type_hints(fn)
    sig = inspect.signature(fn)
    py2json = {str: 'string', int: 'integer', float: 'number', bool: 'boolean'}
    variadic = (inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD)

    # Collect "name: description" lines; section headers such as "Args:" are skipped.
    arg_docs = {}
    for raw_line in (fn.__doc__ or '').splitlines():
        line = raw_line.strip()
        if ':' in line and not line.lower().startswith(('args', 'returns')):
            key, _, text = line.partition(':')
            arg_docs[key.strip()] = text.strip()

    props, required = {}, []
    for name, param in sig.parameters.items():
        if param.kind in variadic:
            continue
        props[name] = {'type': py2json.get(hints.get(name, str), 'string'),
                       'description': arg_docs.get(name, '')}
        # Parameter.empty is the public sentinel for "no default given".
        if param.default is inspect.Parameter.empty:
            required.append(name)
        else:
            props[name]['default'] = param.default
    return {'name': fn.__name__, 'parameters': {'type': 'object',
            'properties': props, 'required': required}}

def z_score(value: float, mean: float, sd: float, two_sided: bool = True) -> str:
    ("Standardize a value against a distribution.\n"
     "value: the observed value to standardize.\n"
     "mean: the distribution mean.\n"
     "sd: the distribution standard deviation (must be > 0).\n"
     "two_sided: whether to report a two-sided tail.")
    # The docstring above is parsed line by line for the schema, so its text is data.
    # `two_sided` is accepted but not used in the computation.
    deviation = value - mean
    standardized = deviation / sd
    return str(standardized)

# Show the schema derived from z_score, then call z_score directly.
# NOTE(review): z_score carries no decorator in this cell, although the message
# below calls it "a decorated tool" -- confirm the intended wording.
print(json.dumps(build_tool_schema(z_score), indent=2))
print(f"\nrequired args (no default): {build_tool_schema(z_score)['parameters']['required']}")
print('A decorated tool is still an ordinary function:')
print('  z_score(value=130, mean=100, sd=15) =', z_score(130, 100, 15), '(a clean 2.0)')
print('\nThe model never runs this -- it only emits {name, arguments}; your code executes.')

{
  "name": "z_score",
  "parameters": {
    "type": "object",
    "properties": {
      "value": {
        "type": "number",
        "description": "the observed value to standardize."
      },
      "mean": {
        "type": "number",
        "description": "the distribution mean."
      },
      "sd": {
        "type": "number",
        "description": "the distribution standard deviation (must be > 0)."
      },
      "two_sided": {
        "type": "boolean",
        "description": "whether to report a two-sided tail.",
        "default": true
      }
    },
    "required": [
      "value",
      "mean",
      "sd"
    ]
  }
}

required args (no default): ['value', 'mean', 'sd']
A decorated tool is still an ordinary function:
  z_score(value=130, mean=100, sd=15) = 2.0 (a clean 2.0)

The model never runs this -- it only emits {name, arguments}; your code executes.


# Reliability diagram + ECE on a deliberately miscalibrated set (reproduces book 0.149).
def reliability_diagram(conf, correct, n_bins=10):
    """Bin predictions by stated confidence and compute the calibration error.

    Args:
        conf: Stated confidences in [0, 1] (array-like).
        correct: Outcomes aligned with ``conf``; 1 for correct, 0 for wrong.
        n_bins: Number of equal-width confidence bins on [0, 1].

    Returns:
        ``(bin_conf, bin_acc, bin_count, ece)``: mean confidence, mean accuracy
        and size of every non-empty bin, plus the Expected Calibration Error,
        i.e. the size-weighted mean of ``|accuracy - confidence|`` over bins.
        ``ece`` is NaN when no value falls into any bin.
    """
    conf = np.asarray(conf, dtype=float)        # also accept plain lists
    correct = np.asarray(correct, dtype=float)
    edges = np.linspace(0, 1, n_bins + 1)
    bc, ba, bn = [], [], []
    for i in range(n_bins):
        in_bin = conf >= edges[i]
        # Bins are half-open [lo, hi) except the last, which is closed so that a
        # confidence of exactly 1.0 is counted instead of silently dropped.
        if i == n_bins - 1:
            in_bin &= conf <= edges[i + 1]
        else:
            in_bin &= conf < edges[i + 1]
        count = int(in_bin.sum())
        if count:
            bc.append(conf[in_bin].mean())
            ba.append(correct[in_bin].mean())
            bn.append(count)
    total = sum(bn)
    if total:
        ece = sum(n * abs(acc - c) for n, acc, c in zip(bn, ba, bc)) / total
    else:
        ece = float('nan')
    return np.array(bc), np.array(ba), np.array(bn), ece

# Synthetic calibration data. The draws below use NumPy's global generator, so the
# seed and the order of the two sampling calls determine the printed numbers.
np.random.seed(42)
conf = np.random.beta(5, 2, 50)            # stated confidences
true_p = conf * 0.85 + 0.05               # actual accuracy is LOWER than stated (for conf > 1/3)
correct = np.random.binomial(1, true_p)    # one 0/1 outcome per stated confidence
bc, ba, bn, ece = reliability_diagram(conf, correct)
print(f'Expected Calibration Error (ECE) = {ece:.3f}   (> 0.1 -> notably miscalibrated)')
# One line per non-empty bin; a bin whose confidence equals its accuracy is
# labelled 'underconfident' by the comparison below.
for c, a, k in zip(bc, ba, bn):
    print(f'  stated conf {c:.2f}: actual accuracy {a:.2f} (n={k:2d}) '
          f"[{'overconfident' if c > a else 'underconfident'}]")

Expected Calibration Error (ECE) = 0.149   (> 0.1 -> notably miscalibrated)
  stated conf 0.39: actual accuracy 0.50 (n= 2) [underconfident]
  stated conf 0.50: actual accuracy 0.67 (n= 3) [underconfident]
  stated conf 0.55: actual accuracy 0.57 (n= 7) [underconfident]
  stated conf 0.65: actual accuracy 0.30 (n=10) [overconfident]
  stated conf 0.75: actual accuracy 0.67 (n=18) [overconfident]
  stated conf 0.84: actual accuracy 1.00 (n= 6) [underconfident]
  stated conf 0.94: actual accuracy 0.75 (n= 4) [overconfident]


# Figure: reliability diagram. Each dot is one confidence bin (area scales with its
# size); the red segment joins it to the diagonal, so dots below the line are
# overconfident bins.
fig, ax = plt.subplots(figsize=(5.6, 5.2))
ax.plot([0, 1], [0, 1], 'k--', lw=1.2, label='perfect calibration')
ax.scatter(bc, ba, s=np.array(bn) * 18, color=BLUE, alpha=0.8, zorder=3,
           label='confidence bins (size = n)')
for c, a in zip(bc, ba):
    ax.plot([c, c], [c, a], color=RED, lw=1, alpha=0.6)
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.set_xlabel('stated confidence')
ax.set_ylabel('actual accuracy')
ax.set_title(f'Reliability diagram (ECE = {ece:.3f}): LLMs tend to be overconfident')
ax.legend(loc='upper left', fontsize=9)
plt.tight_layout()
plt.show()


# Evaluation protocol: score our Section 6.2 TF-IDF classifier against deployment gates.
# `acc3`, `f1w`, `kap` and `cv_acc` are not defined in this part of the notebook --
# presumably computed in Section 6.2; confirm those cells ran first.
metrics = {'accuracy': acc3, 'f1_weighted': f1w, 'kappa': kap}
DEPLOY = {'accuracy': 0.80, 'f1_weighted': 0.75, 'kappa': 0.60}     # minimums
print('Deployment readiness (3-class sentiment system):')
# A metric passes when it is at least its minimum in DEPLOY.
for m, thresh in DEPLOY.items():
    val = metrics[m]
    print(f'  {m:12s} = {val:.3f}  (need >= {thresh:.2f})  '
          f"{'PASS' if val >= thresh else 'FAIL'}")
# Binary system for contrast.
print(f'\n  binary pos/neg CV accuracy = {cv_acc.mean():.3f}  (>= 0.80) '
      f"{'PASS' if cv_acc.mean() >= 0.80 else 'FAIL'}")
# NOTE(review): the verdict text is fixed; it is not derived from the PASS/FAIL
# results printed above and must be revisited if the metrics change.
print('Verdict: the binary system clears the bar; the 3-class system FAILS on the')
print('neutral class -- deploy binary auto-labelling, route neutrals to human review.')

Deployment readiness (3-class sentiment system):
  accuracy     = 0.664  (need >= 0.80)  FAIL
  f1_weighted  = 0.625  (need >= 0.75)  FAIL
  kappa        = 0.451  (need >= 0.60)  FAIL

  binary pos/neg CV accuracy = 0.824  (>= 0.80) PASS
Verdict: the binary system clears the bar; the 3-class system FAILS on the
neutral class -- deploy binary auto-labelling, route neutrals to human review.


# PII detection + redaction (regex; clean noise, keep the message).
# Regexes for structured identifiers only; free-text PII such as names is not covered.
PII_PATTERNS = {
    'email':       r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
    'phone':       r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
    'ssn':         r'\b\d{3}-\d{2}-\d{4}\b',
    'credit_card': r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b',
}


def detect_pii(text):
    """Return ``{kind: [matches]}`` for every PII pattern found in ``text``.

    Kinds with no match are omitted, so an empty dict means nothing was found.
    """
    found = {}
    for kind, pattern in PII_PATTERNS.items():
        matches = _re.findall(pattern, text)     # scan once per pattern
        if matches:
            found[kind] = matches
    return found


def redact_pii(text):
    """Return ``text`` with each pattern match replaced by ``[KIND]`` in upper case.

    Patterns are applied in the order they appear in ``PII_PATTERNS``.
    """
    for kind, pattern in PII_PATTERNS.items():
        text = _re.sub(pattern, f'[{kind.upper()}]', text)
    return text

sample = ('Contact John Smith at john.smith@example.com or call 765-555-1234. '
          'SSN: 123-45-6789.')
print('detected PII:', detect_pii(sample))
print('redacted    :', redact_pii(sample))
# The regexes only remove structured identifiers: the person's name is still in the
# redacted text, so the message must not claim the result is safe to send anywhere.
print('  -> redact BEFORE the API call; regexes catch structured identifiers only')
print('     (the name is still there), so this step alone does not make text safe.')

detected PII: {'email': ['john.smith@example.com'], 'phone': ['765-555-1234'], 'ssn': ['123-45-6789']}
redacted    : Contact John Smith at [EMAIL] or call [PHONE]. SSN: [SSN].
  -> the redacted text is safe to send to any LLM; redact BEFORE the API call.


# Disclosure statement + deployment checklist (Purdue AI Competency, Pillar 2).
def generate_disclosure(ai_uses):
    """Compose a plain-text AI disclosure statement.

    Args:
        ai_uses: Iterable of dicts with the keys ``'task'``, ``'tool'``,
            ``'description'`` and ``'human_role'``. An empty or ``None`` value
            means no AI tool was used.

    Returns:
        A fixed one-line notice when ``ai_uses`` is empty; otherwise a heading,
        one bullet per use and a closing review sentence, joined by newlines.
    """
    if not ai_uses:
        return 'No AI tools were used in this analysis.'
    header = ['AI Disclosure Statement', '', 'The following AI tools were used:']
    bullets = [
        f"- {u['task']}: {u['tool']} for {u['description']}. {u['human_role']}"
        for u in ai_uses
    ]
    footer = ['', 'All AI outputs were reviewed and validated by the authors.']
    return '\n'.join(header + bullets + footer)

# Example disclosure for a single AI-assisted task.
print(generate_disclosure([{
    'task': 'Data Annotation',
    'tool': 'gemma3:12b via Purdue GenAI Studio',
    'description': 'sentiment labelling of 9,000 reviews',
    'human_role': "Validated on 74 reviews (Cohen's kappa = 0.55, moderate).",
}]))

# Checklist: category -> items to tick off before deployment (printed unchecked).
checklist = {
    'Privacy':      ['No unredacted PII', 'Complies with HIPAA/FERPA/GDPR'],
    'Reliability':  ['Test set n >= 30', 'Accuracy past threshold', 'ECE < 0.15'],
    'Bias':         ['Differential-treatment probe run'],
    'Transparency': ['AI use disclosed', 'Methodology documented'],
    'Oversight':    ['Human review for flagged/uncertain cases'],
}
# NOTE(review): the disclosure above says 9,000 reviews, this header says 10,000 --
# confirm which figure is intended.
print('\nDeployment checklist (sentiment annotation of 10,000 reviews):')
for cat, items in checklist.items():
    print(f'  {cat}:')
    for it in items:
        print(f'    [ ] {it}')

AI Disclosure Statement

The following AI tools were used:
- Data Annotation: gemma3:12b via Purdue GenAI Studio for sentiment labelling of 9,000 reviews. Validated on 74 reviews (Cohen's kappa = 0.55, moderate).

All AI outputs were reviewed and validated by the authors.

Deployment checklist (sentiment annotation of 10,000 reviews):
  Privacy:
    [ ] No unredacted PII
    [ ] Complies with HIPAA/FERPA/GDPR
  Reliability:
    [ ] Test set n >= 30
    [ ] Accuracy past threshold
    [ ] ECE < 0.15
  Bias:
    [ ] Differential-treatment probe run
  Transparency:
    [ ] AI use disclosed
    [ ] Methodology documented
  Oversight:
    [ ] Human review for flagged/uncertain cases


# Solution 1.
# Compare 5-fold CV accuracy of logistic regression on the full feature matrix with
# the same model on a 100-component truncated SVD of it. The SVD sits inside a
# pipeline, so it is re-fitted on the training part of every fold.
# `X`, `mask` and `y_bin` are not defined in this part of the notebook --
# presumably the TF-IDF matrix, the pos/neg row filter and binary labels from 6.2.
from sklearn.pipeline import make_pipeline
svd100 = TruncatedSVD(n_components=100, random_state=42)
acc_full = cross_val_score(LogisticRegression(max_iter=1000), X[mask], y_bin, cv=5).mean()
acc_svd = cross_val_score(make_pipeline(svd100, LogisticRegression(max_iter=1000)),
                          X[mask], y_bin, cv=5).mean()
# NOTE(review): "5000" in the messages is fixed text, not read from X.shape.
print(f'full 5000-dim TF-IDF : CV accuracy = {acc_full:.3f}')
print(f'SVD-100 compressed   : CV accuracy = {acc_svd:.3f}')
print(f'  Compressing 5000 -> 100 dims costs only {100*(acc_full-acc_svd):.1f} pp: the')
print('  sentiment signal lives in a LOW-dimensional subspace (cf. the decoder ring).')

full 5000-dim TF-IDF : CV accuracy = 0.824
SVD-100 compressed   : CV accuracy = 0.781
  Compressing 5000 -> 100 dims costs only 4.3 pp: the
  sentiment signal lives in a LOW-dimensional subspace (cf. the decoder ring).


# Solution 2.
# `kappas`, `kappa`, `lo` and `hi` are not defined in this part of the notebook --
# presumably the bootstrap replicates, point estimate and 95% CI bounds from 6.4.
above_04 = (kappas > 0.4).mean()     # share of replicates above 0.4
# NOTE(review): the interpretation below is fixed text (e.g. "dips just BELOW 0.4");
# it is not derived from lo/hi and must be revisited if they change.
print(f'kappa point estimate = {kappa:.3f} (moderate), 95% CI = [{lo:.3f}, {hi:.3f}]')
print(f'  bootstrap P(kappa > 0.4) = {above_04:.3f}, but the 95% CI lower bound {lo:.3f}')
print(f'  dips just BELOW 0.4 -> agreement sits right at the fair/moderate boundary and')
print('  (since the CI also crosses 0.6) is nowhere near "substantial". Decision: use the')
print('  LLM labels as a first pass and route neutral/ambiguous cases to human review.')

kappa point estimate = 0.548 (moderate), 95% CI = [0.386, 0.688]
  bootstrap P(kappa > 0.4) = 0.965, but the 95% CI lower bound 0.386
  dips just BELOW 0.4 -> agreement sits right at the fair/moderate boundary and
  (since the CI also crosses 0.6) is nowhere near "substantial". Decision: use the
  LLM labels as a first pass and route neutral/ambiguous cases to human review.


# Solution 3.
# Re-run one query at k = 1, 2, 3 and report whether bayesian.txt is among the
# retrieved sources at each cutoff. Rebinds the notebook-level names q, k and got.
q = 'What is a prior distribution?'
for k in (1, 2, 3):
    got = [s for s, _, _ in retrieve(q, k=k)]
    print(f'  top-{k}: {got}   bayesian.txt retrieved = {"bayesian.txt" in got}')
print('  Recall climbs with k: by top-3 the right source is in hand even though it is')
print('  not rank-1. Raising top_k is the cheap mitigation for imperfect retrieval')
print('  (at the cost of a longer, noisier context).')

  top-1: ['bootstrap.txt']   bayesian.txt retrieved = False
  top-2: ['bootstrap.txt', 'bayesian.txt']   bayesian.txt retrieved = True
  top-3: ['bootstrap.txt', 'bayesian.txt', 'bootstrap.txt']   bayesian.txt retrieved = True
  Recall climbs with k: by top-3 the right source is in hand even though it is
  not rank-1. Raising top_k is the cheap mitigation for imperfect retrieval
  (at the cost of a longer, noisier context).


# Solution 4.
# Recompute ECE on the same conf/correct data with 5 bins and print it next to
# `ece`, which the earlier call computed with the default 10 bins.
_, _, _, ece5 = reliability_diagram(conf, correct, n_bins=5)
print(f'ECE (10 bins) = {ece:.3f}')
print(f'ECE (5 bins)  = {ece5:.3f}')
# NOTE(review): "Both flag miscalibration (> 0.1)" is fixed text, not checked
# against ece / ece5.
print('  Both flag miscalibration (> 0.1). Fewer bins average over wider confidence')
print('  ranges, so over- and under-confident items can partly cancel -> ECE is a')
print('  binning-dependent summary; report the bin count alongside the value.')

ECE (10 bins) = 0.149
ECE (5 bins)  = 0.119
  Both flag miscalibration (> 0.1). Fewer bins average over wider confidence
  ranges, so over- and under-confident items can partly cancel -> ECE is a
  binning-dependent summary; report the bin count alongside the value.

§	Topic	Runnable offline artifact
6.1	LLM foundations	attention-as-weighted-voting demo (+ read-only SDK)
6.2	Embeddings & features	TF-IDF cosine, PCA/SVD decoder ring, classifier, bootstrap regression
6.3	Text preprocessing	token heuristics, context budget, fixed/overlap/semantic chunking, cleaning
6.4	LLM annotation	Cohen's κ = 0.548 (moderate) + bootstrap CI on κ
6.5	RAG	manual chunk→TF-IDF→retrieve pipeline + Precision@k / hit@k
6.6	Prompt engineering	self-consistency as a bootstrap / Condorcet majority vote
6.7	Tool use	function → JSON-schema introspection (the `@tool` contract)
6.8	Reliability & evaluation	reliability diagram + ECE = 0.149; deployment-threshold report
6.9	Responsible AI	PII detection/redaction, disclosure statement, deployment checklist
6.10	Summary	reference tables + connections back to Ch 2–5

§	Technique	Offline artifact in this notebook	Stat connection
6.1	Attention	softmax-weighted "voting" over a sentence	—
6.2	Embeddings → features	TF-IDF cosine, SVD decoder ring, bootstrap regression	Ch 3 OLS/GLM, Ch 4 bootstrap
6.3	Preprocessing	token budget, fixed/overlap/semantic chunking	—
6.4	Annotation	Cohen's κ = 0.548 (moderate) + bootstrap CI	Ch 4 bootstrap
6.5	RAG	manual chunk→TF-IDF→retrieve, Precision@k / hit@k	nearest-neighbour search
6.6	Prompting	self-consistency = Condorcet/bootstrap voting	Ch 2 Monte Carlo, Ch 4 bootstrap
6.7	Tool use	function → JSON-schema introspection	—
6.8	Reliability	reliability diagram, ECE = 0.149, deploy gates	Ch 5 calibration
6.9	Responsible AI	PII redaction, disclosure, checklist	—

Chapter 6 — Large Language Models in Data Science¶

Learning outcomes¶

Section map (all core; SDK-dependent parts shown read-only)¶

Section 6.1 — LLM Foundations: Architecture, Training, Deployment¶

The `genai_studio` SDK (read-only — needs an API key)¶

Section 6.2 — Embeddings and Feature Extraction¶

Generating features: dense embeddings (read-only) vs. offline TF-IDF¶

What an embedding encodes — the decoder ring¶

Classification with embeddings — a binomial GLM on text features¶

Embeddings as covariates + bootstrap CIs (ties to Chapters 3 & 4)¶

Section 6.3 — Text Preprocessing for LLM Pipelines¶

Section 6.4 — LLM-Assisted Data Annotation¶

The annotation call (read-only — needs an API key)¶

Section 6.5 — Retrieval-Augmented Generation (RAG)¶

Built-in RAG (read-only — needs an API key)¶

Section 6.6 — Prompt Engineering for Data Science¶

Prompts as code & self-consistency (read-only — needs an API key)¶

Section 6.7 — Tool Use¶

The `@tool` cycle (read-only — needs an API key)¶

Section 6.8 — Reliability and Evaluation¶

Section 6.9 — Responsible AI Practices¶

Exercises¶

Section 6.10 — Chapter Summary & Connections¶

Chapter 6 — Large Language Models in Data Science¶

Learning outcomes¶

Section map (all core; SDK-dependent parts shown read-only)¶

Section 6.1 — LLM Foundations: Architecture, Training, Deployment¶

The genai_studio SDK (read-only — needs an API key)¶

Section 6.2 — Embeddings and Feature Extraction¶

Generating features: dense embeddings (read-only) vs. offline TF-IDF¶

What an embedding encodes — the decoder ring¶

Classification with embeddings — a binomial GLM on text features¶

Embeddings as covariates + bootstrap CIs (ties to Chapters 3 & 4)¶

Section 6.3 — Text Preprocessing for LLM Pipelines¶

Section 6.4 — LLM-Assisted Data Annotation¶

The annotation call (read-only — needs an API key)¶

Section 6.5 — Retrieval-Augmented Generation (RAG)¶

Built-in RAG (read-only — needs an API key)¶

Section 6.6 — Prompt Engineering for Data Science¶

Prompts as code & self-consistency (read-only — needs an API key)¶

Section 6.7 — Tool Use¶

The @tool cycle (read-only — needs an API key)¶

Section 6.8 — Reliability and Evaluation¶

Section 6.9 — Responsible AI Practices¶

Exercises¶

Section 6.10 — Chapter Summary & Connections¶

The `genai_studio` SDK (read-only — needs an API key)¶

The `@tool` cycle (read-only — needs an API key)¶