src/stats.ts at main · danabra.mov/woodshed

danabra.mov / woodshed
fork atom
Create, run, rate, and iterate on your Claude Skills
claude-skills
fork atom
woodshed / src / stats.ts
at main 604 lines 17 kB view raw
wrap content
danabra.mov woodshed 1mo ago
d8ce6966
  1/**
  2 * Statistical functions for benchmark analysis
  3 *
  4 * All functions use sample statistics (Bessel's correction: n-1 denominator)
  5 * for unbiased estimation, which is standard for inferential statistics.
  6 */
  7
  8/**
  9 * Compute mean of an array
 10 */
 11export function mean(arr: number[]): number {
 12  if (arr.length === 0) return 0;
 13  return arr.reduce((sum, x) => sum + x, 0) / arr.length;
 14}
 15
 16/**
 17 * Compute sample standard deviation (Bessel-corrected, n-1 denominator)
 18 * This is the unbiased estimator of population std.
 19 */
 20export function sampleStd(arr: number[]): number {
 21  if (arr.length < 2) return 0;
 22  const m = mean(arr);
 23  const variance =
 24    arr.reduce((sum, x) => sum + (x - m) ** 2, 0) / (arr.length - 1);
 25  return Math.sqrt(variance);
 26}
 27
 28/**
 29 * Compute sample variance (Bessel-corrected)
 30 */
 31export function sampleVariance(arr: number[]): number {
 32  if (arr.length < 2) return 0;
 33  const m = mean(arr);
 34  return arr.reduce((sum, x) => sum + (x - m) ** 2, 0) / (arr.length - 1);
 35}
 36
 37/**
 38 * Welch's t-test for two independent samples with unequal variances
 39 * Returns two-tailed p-value
 40 *
 41 * This is the proper implementation using:
 42 * - Sample variances (n-1 denominator)
 43 * - Welch-Satterthwaite degrees of freedom
 44 * - Student's t-distribution (not normal approximation)
 45 */
 46export function welchTTest(group1: number[], group2: number[]): number {
 47  if (group1.length < 2 || group2.length < 2) return 1;
 48
 49  const n1 = group1.length;
 50  const n2 = group2.length;
 51  const mean1 = mean(group1);
 52  const mean2 = mean(group2);
 53  const var1 = sampleVariance(group1);
 54  const var2 = sampleVariance(group2);
 55
 56  // Standard error of the difference
 57  const se1 = var1 / n1;
 58  const se2 = var2 / n2;
 59  const se = Math.sqrt(se1 + se2);
 60
 61  if (se === 0) return mean1 === mean2 ? 1 : 0;
 62
 63  // t-statistic
 64  const t = Math.abs(mean1 - mean2) / se;
 65
 66  // Welch-Satterthwaite degrees of freedom
 67  const df = (se1 + se2) ** 2 / (se1 ** 2 / (n1 - 1) + se2 ** 2 / (n2 - 1));
 68
 69  // Two-tailed p-value from t-distribution
 70  return tDistributionPValue(t, df);
 71}
 72
 73/**
 74 * Two-tailed p-value from Student's t-distribution
 75 * Uses the regularized incomplete beta function
 76 */
 77function tDistributionPValue(t: number, df: number): number {
 78  // P(|T| > t) = I_{df/(df+t²)}(df/2, 1/2) for two-tailed test
 79  const x = df / (df + t * t);
 80  return regularizedIncompleteBeta(x, df / 2, 0.5);
 81}
 82
 83/**
 84 * Regularized incomplete beta function I_x(a,b)
 85 * Used for computing t-distribution CDF
 86 */
 87function regularizedIncompleteBeta(x: number, a: number, b: number): number {
 88  if (x === 0) return 0;
 89  if (x === 1) return 1;
 90
 91  // Use continued fraction representation
 92  const bt = Math.exp(
 93    logGamma(a + b) -
 94      logGamma(a) -
 95      logGamma(b) +
 96      a * Math.log(x) +
 97      b * Math.log(1 - x),
 98  );
 99
100  // Use symmetry for better convergence
101  if (x < (a + 1) / (a + b + 2)) {
102    return (bt * betaContinuedFraction(x, a, b)) / a;
103  } else {
104    return 1 - (bt * betaContinuedFraction(1 - x, b, a)) / b;
105  }
106}
107
108/**
109 * Log gamma function using Lanczos approximation
110 * Accurate to ~15 decimal places
111 */
112function logGamma(x: number): number {
113  const g = 7;
114  const c = [
115    0.99999999999980993, 676.5203681218851, -1259.1392167224028,
116    771.32342877765313, -176.61502916214059, 12.507343278686905,
117    -0.13857109526572012, 9.9843695780195716e-6, 1.5056327351493116e-7,
118  ];
119
120  if (x < 0.5) {
121    return Math.log(Math.PI / Math.sin(Math.PI * x)) - logGamma(1 - x);
122  }
123
124  x -= 1;
125  let sum = c[0]!;
126  for (let i = 1; i < g + 2; i++) {
127    sum += c[i]! / (x + i);
128  }
129
130  const t = x + g + 0.5;
131  return (
132    0.5 * Math.log(2 * Math.PI) + (x + 0.5) * Math.log(t) - t + Math.log(sum)
133  );
134}
135
136/**
137 * Continued fraction expansion for incomplete beta function
138 * Uses modified Lentz's algorithm for stability
139 */
140function betaContinuedFraction(x: number, a: number, b: number): number {
141  const maxIter = 200;
142  const eps = 1e-14;
143  const tiny = 1e-30;
144
145  const qab = a + b;
146  const qap = a + 1;
147  const qam = a - 1;
148
149  let c = 1;
150  let d = 1 - (qab * x) / qap;
151  if (Math.abs(d) < tiny) d = tiny;
152  d = 1 / d;
153  let h = d;
154
155  for (let m = 1; m <= maxIter; m++) {
156    const m2 = 2 * m;
157
158    // Even step
159    let aa = (m * (b - m) * x) / ((qam + m2) * (a + m2));
160    d = 1 + aa * d;
161    if (Math.abs(d) < tiny) d = tiny;
162    c = 1 + aa / c;
163    if (Math.abs(c) < tiny) c = tiny;
164    d = 1 / d;
165    h *= d * c;
166
167    // Odd step
168    aa = (-(a + m) * (qab + m) * x) / ((a + m2) * (qap + m2));
169    d = 1 + aa * d;
170    if (Math.abs(d) < tiny) d = tiny;
171    c = 1 + aa / c;
172    if (Math.abs(c) < tiny) c = tiny;
173    d = 1 / d;
174    const del = d * c;
175    h *= del;
176
177    if (Math.abs(del - 1) < eps) break;
178  }
179
180  return h;
181}
182
183/**
184 * Summary statistics for a group
185 */
186export interface GroupStats {
187  n: number;
188  mean: number;
189  std: number; // sample std (n-1 denominator)
190  scores: number[];
191}
192
193/**
194 * Compute "probability best" for each combo using Monte Carlo simulation
195 * with empirical Bayes variance shrinkage.
196 *
197 * Each group's variance is shrunk toward the pooled variance across all groups.
198 * This prevents overconfidence when a group shows zero or low variance due to
199 * small sample size (e.g., all 4s doesn't mean the true population can't produce 5s).
200 *
201 * @param groups - Array of group statistics
202 * @param iterations - Number of Monte Carlo iterations (default 2000)
203 * @returns Array of probabilities in same order as input
204 */
205export function computeProbabilityBest(
206  groups: GroupStats[],
207  iterations = 2000,
208): number[] {
209  if (groups.length === 0) return [];
210  if (groups.length === 1) return [1];
211
212  // Need at least 2 samples per group for meaningful estimation
213  const validGroups = groups.filter((g) => g.n >= 2);
214  if (validGroups.length === 0) return groups.map(() => 1 / groups.length);
215
216  // Compute pooled variance across all groups (empirical Bayes prior)
217  const allScores = groups.flatMap((g) => g.scores);
218  const pooledVar = allScores.length >= 2 ? sampleVariance(allScores) : 0;
219
220  // If all scores are identical, we can't distinguish - return equal probabilities
221  if (pooledVar === 0) {
222    return groups.map(() => 1 / groups.length);
223  }
224
225  // Prior strength: how many "pseudo-samples" of pooled variance to blend in
226  // 2 is conventional - enough to prevent zero-variance issues without
227  // overwhelming the observed data as sample size grows
228  const priorStrength = 2;
229
230  // Compute standard error of mean for each group with shrinkage
231  const standardErrors = groups.map((g) => {
232    if (g.n < 2) return Math.sqrt(pooledVar); // fallback for tiny samples
233    const groupVar = g.std * g.std;
234    // Shrink group variance toward pooled: weighted average
235    const effectiveVar =
236      (g.n * groupVar + priorStrength * pooledVar) / (g.n + priorStrength);
237    // Standard error of mean
238    return Math.sqrt(effectiveVar / g.n);
239  });
240
241  // Seed RNG from scores for deterministic results (prevents flicker)
242  const rng = createSeededRng(hashScores(groups));
243
244  const wins = groups.map(() => 0);
245
246  for (let iter = 0; iter < iterations; iter++) {
247    // Sample plausible true means from each group's posterior
248    const sampledMeans = groups.map(
249      (g, i) => g.mean + gaussianRandom(rng) * standardErrors[i]!,
250    );
251
252    // Find winner (highest sampled mean)
253    let maxMean = -Infinity;
254    let winner = 0;
255    for (let i = 0; i < sampledMeans.length; i++) {
256      if (sampledMeans[i]! > maxMean) {
257        maxMean = sampledMeans[i]!;
258        winner = i;
259      }
260    }
261    wins[winner]++;
262  }
263
264  return wins.map((w) => w / iterations);
265}
266
267/**
268 * Seeded PRNG (mulberry32) for deterministic results
269 * Same input data → same probabilities → no flicker
270 */
271function createSeededRng(seed: number): () => number {
272  return () => {
273    seed |= 0;
274    seed = (seed + 0x6d2b79f5) | 0;
275    let t = Math.imul(seed ^ (seed >>> 15), 1 | seed);
276    t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t;
277    return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
278  };
279}
280
281/**
282 * Hash scores to create a deterministic seed
283 */
284function hashScores(groups: GroupStats[]): number {
285  let hash = 0;
286  for (const g of groups) {
287    for (const s of g.scores) {
288      hash = ((hash << 5) - hash + s * 1000) | 0;
289    }
290  }
291  return hash;
292}
293
294/**
295 * Box-Muller transform for generating standard normal random variates
296 */
297function gaussianRandom(rng: () => number): number {
298  let u: number, v: number, s: number;
299  do {
300    u = rng() * 2 - 1;
301    v = rng() * 2 - 1;
302    s = u * u + v * v;
303  } while (s >= 1 || s === 0);
304  return u * Math.sqrt((-2 * Math.log(s)) / s);
305}
306
307/**
308 * Confidence level based on sample size
309 * low: n < 5, medium: 5-14, high: n ≥ 15
310 */
311export type ConfidenceLevel = "low" | "medium" | "high";
312
313export function sampleSizeConfidence(n: number): ConfidenceLevel {
314  if (n < 5) return "low";
315  if (n < 15) return "medium";
316  return "high";
317}
318
319/** Confidence level using minimum sample size across groups */
320export function comparisonConfidence(groups: GroupStats[]): ConfidenceLevel {
321  if (groups.length === 0) return "low";
322  const minN = Math.min(...groups.map((g) => g.n));
323  return sampleSizeConfidence(minN);
324}
325
326/**
327 * Standard normal CDF (Φ function)
328 * Uses Abramowitz & Stegun approximation (7.1.26), accurate to 1.5×10⁻⁷
329 */
330export function normalCDF(x: number): number {
331  const a1 = 0.254829592;
332  const a2 = -0.284496736;
333  const a3 = 1.421413741;
334  const a4 = -1.453152027;
335  const a5 = 1.061405429;
336  const p = 0.3275911;
337
338  const sign = x < 0 ? -1 : 1;
339  x = Math.abs(x) / Math.sqrt(2);
340
341  const t = 1 / (1 + p * x);
342  const y =
343    1 - ((((a5 * t + a4) * t + a3) * t + a2) * t + a1) * t * Math.exp(-x * x);
344
345  return 0.5 * (1 + sign * y);
346}
347
348/**
349 * Probability that group A is better (higher mean) than group B
350 * Uses Bayesian posterior with uninformative prior
351 *
352 * P(A > B) = Φ((μA - μB) / √(SE_A² + SE_B²))
353 * where SE = σ/√n is the standard error of the mean
354 *
355 * Returns value in [0, 1]. Returns 0.5 if insufficient data.
356 */
357export function probabilityABetterThanB(a: GroupStats, b: GroupStats): number {
358  // Need at least 2 samples each for variance estimate
359  if (a.n < 2 || b.n < 2) return 0.5;
360
361  const meanDiff = a.mean - b.mean;
362  const seA = a.std / Math.sqrt(a.n);
363  const seB = b.std / Math.sqrt(b.n);
364  const seDiff = Math.sqrt(seA * seA + seB * seB);
365
366  if (seDiff === 0) {
367    // No variance - return based on mean comparison
368    return meanDiff > 0 ? 1 : meanDiff < 0 ? 0 : 0.5;
369  }
370
371  return normalCDF(meanDiff / seDiff);
372}
373
374/**
375 * Effect size interpretation thresholds (Cohen's conventions)
376 */
377export type EffectMagnitude = "negligible" | "small" | "medium" | "large";
378
379export interface EffectSize {
380  d: number;
381  magnitude: EffectMagnitude;
382}
383
384/**
385 * Cohen's d effect size: (μA - μB) / pooledStd
386 *
387 * Interpretation: |d| < 0.2 negligible, 0.2-0.5 small, 0.5-0.8 medium, ≥0.8 large
388 */
389export function cohensD(a: GroupStats, b: GroupStats): EffectSize {
390  if (a.n < 2 || b.n < 2) return { d: 0, magnitude: "negligible" };
391
392  // Weighted pooled standard deviation (proper formula)
393  const var1 = a.std * a.std;
394  const var2 = b.std * b.std;
395  const pooledVar = ((a.n - 1) * var1 + (b.n - 1) * var2) / (a.n + b.n - 2);
396  const pooledStd = Math.sqrt(pooledVar);
397
398  if (pooledStd === 0) {
399    return {
400      d: a.mean === b.mean ? 0 : Infinity,
401      magnitude: a.mean === b.mean ? "negligible" : "large",
402    };
403  }
404
405  const d = (a.mean - b.mean) / pooledStd;
406  const absD = Math.abs(d);
407
408  let magnitude: EffectMagnitude;
409  if (absD < 0.2) magnitude = "negligible";
410  else if (absD < 0.5) magnitude = "small";
411  else if (absD < 0.8) magnitude = "medium";
412  else magnitude = "large";
413
414  return { d, magnitude };
415}
416
417/**
418 * Estimate runs needed for leader to reach target probability
419 * Uses simulation of how standard error decreases with more samples
420 *
421 * Returns estimated additional runs needed, or Infinity if not achievable
422 */
423export function runsToTargetProbability(
424  leader: GroupStats,
425  others: GroupStats[],
426  targetProb: number = 0.95,
427): number {
428  if (others.length === 0) return 0;
429
430  // Current probability
431  const currentProbs = computeProbabilityBest([leader, ...others]);
432  const currentLeaderProb = currentProbs[0]!;
433
434  if (currentLeaderProb >= targetProb) return 0;
435  if (currentLeaderProb < 0.5) return Infinity; // Not currently winning
436
437  // Find the closest competitor
438  let minProbVsOther = 1;
439  let closestOther: GroupStats | null = null;
440  for (const other of others) {
441    const prob = probabilityABetterThanB(leader, other);
442    if (prob < minProbVsOther) {
443      minProbVsOther = prob;
444      closestOther = other;
445    }
446  }
447
448  if (!closestOther || minProbVsOther >= targetProb) return 0;
449
450  // Simulate adding samples and see when we'd hit target
451  // SE decreases as 1/√n, so variance decreases as 1/n
452  // We need to solve for n where P(leader > closest) = targetProb
453
454  const meanDiff = leader.mean - closestOther.mean;
455  if (meanDiff <= 0) return Infinity;
456
457  const currentN = Math.min(leader.n, closestOther.n);
458  const varLeader = leader.std * leader.std;
459  const varOther = closestOther.std * closestOther.std;
460
461  // Target z-score for desired probability
462  // P(X > 0) = Φ(μ/σ) = targetProb => μ/σ = Φ⁻¹(targetProb)
463  // For 0.95: z ≈ 1.645
464  // For 0.99: z ≈ 2.326
465  const targetZ = inverseCDF(targetProb);
466
467  // We need: meanDiff / √(varLeader/n + varOther/n) = targetZ
468  // Solving for n: n = (varLeader + varOther) * targetZ² / meanDiff²
469  const requiredN = Math.ceil(
470    ((varLeader + varOther) * targetZ * targetZ) / (meanDiff * meanDiff),
471  );
472
473  return Math.max(0, requiredN - currentN);
474}
475
476/**
477 * Inverse standard normal CDF (quantile function)
478 * Uses Acklam's approximation, accurate to ~1.15×10⁻⁹
479 */
480function inverseCDF(p: number): number {
481  if (p <= 0) return -Infinity;
482  if (p >= 1) return Infinity;
483  if (p === 0.5) return 0;
484
485  const a = [
486    -3.969683028665376e1, 2.209460984245205e2, -2.759285104469687e2,
487    1.38357751867269e2, -3.066479806614716e1, 2.506628277459239,
488  ];
489  const b = [
490    -5.447609879822406e1, 1.615858368580409e2, -1.556989798598866e2,
491    6.680131188771972e1, -1.328068155288572e1,
492  ];
493  const c = [
494    -7.784894002430293e-3, -3.223964580411365e-1, -2.400758277161838,
495    -2.549732539343734, 4.374664141464968, 2.938163982698783,
496  ];
497  const d = [
498    7.784695709041462e-3, 3.224671290700398e-1, 2.445134137142996,
499    3.754408661907416,
500  ];
501
502  const pLow = 0.02425;
503  const pHigh = 1 - pLow;
504
505  let q: number, r: number;
506
507  if (p < pLow) {
508    q = Math.sqrt(-2 * Math.log(p));
509    return (
510      (((((c[0]! * q + c[1]!) * q + c[2]!) * q + c[3]!) * q + c[4]!) * q +
511        c[5]!) /
512      ((((d[0]! * q + d[1]!) * q + d[2]!) * q + d[3]!) * q + 1)
513    );
514  } else if (p <= pHigh) {
515    q = p - 0.5;
516    r = q * q;
517    return (
518      ((((((a[0]! * r + a[1]!) * r + a[2]!) * r + a[3]!) * r + a[4]!) * r +
519        a[5]!) *
520        q) /
521      (((((b[0]! * r + b[1]!) * r + b[2]!) * r + b[3]!) * r + b[4]!) * r + 1)
522    );
523  } else {
524    q = Math.sqrt(-2 * Math.log(1 - p));
525    return (
526      -(
527        ((((c[0]! * q + c[1]!) * q + c[2]!) * q + c[3]!) * q + c[4]!) * q +
528        c[5]!
529      ) /
530      ((((d[0]! * q + d[1]!) * q + d[2]!) * q + d[3]!) * q + 1)
531    );
532  }
533}
534
535/** Round run estimates to avoid false precision: 5, 10, 25, 50, 100, or ∞ */
536export function bucketRunEstimate(n: number): number {
537  if (!isFinite(n) || n <= 0) return 0;
538  if (n <= 7) return 5;
539  if (n <= 17) return 10;
540  if (n <= 35) return 25;
541  if (n <= 75) return 50;
542  if (n <= 150) return 100;
543  return Infinity;
544}
545
546/**
547 * Format a bucketed run estimate for display
548 */
549export function formatRunEstimate(n: number): string {
550  const bucketed = bucketRunEstimate(n);
551  if (bucketed === 0) return "";
552  if (!isFinite(bucketed)) return "~100+";
553  return `~${bucketed}`;
554}
555
556/**
557 * Interpret probability for verdict
558 */
559export type VerdictLevel = "winner" | "likely" | "unclear" | "insufficient";
560
561export function verdictFromProbability(prob: number, n: number): VerdictLevel {
562  if (n < 5) return "insufficient";
563  if (prob >= 0.95) return "winner";
564  if (prob >= 0.8) return "likely";
565  return "unclear";
566}
567
568/**
569 * Uncertainty margin for a probability estimate based on sample size
570 * Uses standard error of a proportion: sqrt(p*(1-p)/n)
571 *
572 * Returns a value representing the uncertainty (e.g., 0.05 means ±5%)
573 */
574export function probabilityUncertainty(probability: number, n: number): number {
575  if (n < 2) return 0.5; // Maximum uncertainty with insufficient data
576  // Standard error of a proportion
577  return Math.sqrt((probability * (1 - probability)) / n);
578}
579
580/**
581 * Effect size of leader vs closest competitor
582 * Returns the Cohen's d effect size comparing leader to the closest other group
583 * Returns null if there are no other groups to compare
584 */
585export function leaderEffectSize(
586  leader: GroupStats,
587  others: GroupStats[],
588): EffectSize | null {
589  if (others.length === 0) return null;
590
591  // Find the closest competitor (highest mean among others)
592  let closest: GroupStats | null = null;
593  let highestMean = -Infinity;
594  for (const other of others) {
595    if (other.n >= 2 && other.mean > highestMean) {
596      highestMean = other.mean;
597      closest = other;
598    }
599  }
600
601  if (!closest || leader.n < 2) return null;
602
603  return cohensD(leader, closest);
604}