Create, run, rate, and iterate on your Claude Skills
claude-skills
at main 604 lines 17 kB view raw
1/** 2 * Statistical functions for benchmark analysis 3 * 4 * All functions use sample statistics (Bessel's correction: n-1 denominator) 5 * for unbiased estimation, which is standard for inferential statistics. 6 */ 7 8/** 9 * Compute mean of an array 10 */ 11export function mean(arr: number[]): number { 12 if (arr.length === 0) return 0; 13 return arr.reduce((sum, x) => sum + x, 0) / arr.length; 14} 15 16/** 17 * Compute sample standard deviation (Bessel-corrected, n-1 denominator) 18 * This is the unbiased estimator of population std. 19 */ 20export function sampleStd(arr: number[]): number { 21 if (arr.length < 2) return 0; 22 const m = mean(arr); 23 const variance = 24 arr.reduce((sum, x) => sum + (x - m) ** 2, 0) / (arr.length - 1); 25 return Math.sqrt(variance); 26} 27 28/** 29 * Compute sample variance (Bessel-corrected) 30 */ 31export function sampleVariance(arr: number[]): number { 32 if (arr.length < 2) return 0; 33 const m = mean(arr); 34 return arr.reduce((sum, x) => sum + (x - m) ** 2, 0) / (arr.length - 1); 35} 36 37/** 38 * Welch's t-test for two independent samples with unequal variances 39 * Returns two-tailed p-value 40 * 41 * This is the proper implementation using: 42 * - Sample variances (n-1 denominator) 43 * - Welch-Satterthwaite degrees of freedom 44 * - Student's t-distribution (not normal approximation) 45 */ 46export function welchTTest(group1: number[], group2: number[]): number { 47 if (group1.length < 2 || group2.length < 2) return 1; 48 49 const n1 = group1.length; 50 const n2 = group2.length; 51 const mean1 = mean(group1); 52 const mean2 = mean(group2); 53 const var1 = sampleVariance(group1); 54 const var2 = sampleVariance(group2); 55 56 // Standard error of the difference 57 const se1 = var1 / n1; 58 const se2 = var2 / n2; 59 const se = Math.sqrt(se1 + se2); 60 61 if (se === 0) return mean1 === mean2 ? 1 : 0; 62 63 // t-statistic 64 const t = Math.abs(mean1 - mean2) / se; 65 66 // Welch-Satterthwaite degrees of freedom 67 const df = (se1 + se2) ** 2 / (se1 ** 2 / (n1 - 1) + se2 ** 2 / (n2 - 1)); 68 69 // Two-tailed p-value from t-distribution 70 return tDistributionPValue(t, df); 71} 72 73/** 74 * Two-tailed p-value from Student's t-distribution 75 * Uses the regularized incomplete beta function 76 */ 77function tDistributionPValue(t: number, df: number): number { 78 // P(|T| > t) = I_{df/(df+t²)}(df/2, 1/2) for two-tailed test 79 const x = df / (df + t * t); 80 return regularizedIncompleteBeta(x, df / 2, 0.5); 81} 82 83/** 84 * Regularized incomplete beta function I_x(a,b) 85 * Used for computing t-distribution CDF 86 */ 87function regularizedIncompleteBeta(x: number, a: number, b: number): number { 88 if (x === 0) return 0; 89 if (x === 1) return 1; 90 91 // Use continued fraction representation 92 const bt = Math.exp( 93 logGamma(a + b) - 94 logGamma(a) - 95 logGamma(b) + 96 a * Math.log(x) + 97 b * Math.log(1 - x), 98 ); 99 100 // Use symmetry for better convergence 101 if (x < (a + 1) / (a + b + 2)) { 102 return (bt * betaContinuedFraction(x, a, b)) / a; 103 } else { 104 return 1 - (bt * betaContinuedFraction(1 - x, b, a)) / b; 105 } 106} 107 108/** 109 * Log gamma function using Lanczos approximation 110 * Accurate to ~15 decimal places 111 */ 112function logGamma(x: number): number { 113 const g = 7; 114 const c = [ 115 0.99999999999980993, 676.5203681218851, -1259.1392167224028, 116 771.32342877765313, -176.61502916214059, 12.507343278686905, 117 -0.13857109526572012, 9.9843695780195716e-6, 1.5056327351493116e-7, 118 ]; 119 120 if (x < 0.5) { 121 return Math.log(Math.PI / Math.sin(Math.PI * x)) - logGamma(1 - x); 122 } 123 124 x -= 1; 125 let sum = c[0]!; 126 for (let i = 1; i < g + 2; i++) { 127 sum += c[i]! / (x + i); 128 } 129 130 const t = x + g + 0.5; 131 return ( 132 0.5 * Math.log(2 * Math.PI) + (x + 0.5) * Math.log(t) - t + Math.log(sum) 133 ); 134} 135 136/** 137 * Continued fraction expansion for incomplete beta function 138 * Uses modified Lentz's algorithm for stability 139 */ 140function betaContinuedFraction(x: number, a: number, b: number): number { 141 const maxIter = 200; 142 const eps = 1e-14; 143 const tiny = 1e-30; 144 145 const qab = a + b; 146 const qap = a + 1; 147 const qam = a - 1; 148 149 let c = 1; 150 let d = 1 - (qab * x) / qap; 151 if (Math.abs(d) < tiny) d = tiny; 152 d = 1 / d; 153 let h = d; 154 155 for (let m = 1; m <= maxIter; m++) { 156 const m2 = 2 * m; 157 158 // Even step 159 let aa = (m * (b - m) * x) / ((qam + m2) * (a + m2)); 160 d = 1 + aa * d; 161 if (Math.abs(d) < tiny) d = tiny; 162 c = 1 + aa / c; 163 if (Math.abs(c) < tiny) c = tiny; 164 d = 1 / d; 165 h *= d * c; 166 167 // Odd step 168 aa = (-(a + m) * (qab + m) * x) / ((a + m2) * (qap + m2)); 169 d = 1 + aa * d; 170 if (Math.abs(d) < tiny) d = tiny; 171 c = 1 + aa / c; 172 if (Math.abs(c) < tiny) c = tiny; 173 d = 1 / d; 174 const del = d * c; 175 h *= del; 176 177 if (Math.abs(del - 1) < eps) break; 178 } 179 180 return h; 181} 182 183/** 184 * Summary statistics for a group 185 */ 186export interface GroupStats { 187 n: number; 188 mean: number; 189 std: number; // sample std (n-1 denominator) 190 scores: number[]; 191} 192 193/** 194 * Compute "probability best" for each combo using Monte Carlo simulation 195 * with empirical Bayes variance shrinkage. 196 * 197 * Each group's variance is shrunk toward the pooled variance across all groups. 198 * This prevents overconfidence when a group shows zero or low variance due to 199 * small sample size (e.g., all 4s doesn't mean the true population can't produce 5s). 200 * 201 * @param groups - Array of group statistics 202 * @param iterations - Number of Monte Carlo iterations (default 2000) 203 * @returns Array of probabilities in same order as input 204 */ 205export function computeProbabilityBest( 206 groups: GroupStats[], 207 iterations = 2000, 208): number[] { 209 if (groups.length === 0) return []; 210 if (groups.length === 1) return [1]; 211 212 // Need at least 2 samples per group for meaningful estimation 213 const validGroups = groups.filter((g) => g.n >= 2); 214 if (validGroups.length === 0) return groups.map(() => 1 / groups.length); 215 216 // Compute pooled variance across all groups (empirical Bayes prior) 217 const allScores = groups.flatMap((g) => g.scores); 218 const pooledVar = allScores.length >= 2 ? sampleVariance(allScores) : 0; 219 220 // If all scores are identical, we can't distinguish - return equal probabilities 221 if (pooledVar === 0) { 222 return groups.map(() => 1 / groups.length); 223 } 224 225 // Prior strength: how many "pseudo-samples" of pooled variance to blend in 226 // 2 is conventional - enough to prevent zero-variance issues without 227 // overwhelming the observed data as sample size grows 228 const priorStrength = 2; 229 230 // Compute standard error of mean for each group with shrinkage 231 const standardErrors = groups.map((g) => { 232 if (g.n < 2) return Math.sqrt(pooledVar); // fallback for tiny samples 233 const groupVar = g.std * g.std; 234 // Shrink group variance toward pooled: weighted average 235 const effectiveVar = 236 (g.n * groupVar + priorStrength * pooledVar) / (g.n + priorStrength); 237 // Standard error of mean 238 return Math.sqrt(effectiveVar / g.n); 239 }); 240 241 // Seed RNG from scores for deterministic results (prevents flicker) 242 const rng = createSeededRng(hashScores(groups)); 243 244 const wins = groups.map(() => 0); 245 246 for (let iter = 0; iter < iterations; iter++) { 247 // Sample plausible true means from each group's posterior 248 const sampledMeans = groups.map( 249 (g, i) => g.mean + gaussianRandom(rng) * standardErrors[i]!, 250 ); 251 252 // Find winner (highest sampled mean) 253 let maxMean = -Infinity; 254 let winner = 0; 255 for (let i = 0; i < sampledMeans.length; i++) { 256 if (sampledMeans[i]! > maxMean) { 257 maxMean = sampledMeans[i]!; 258 winner = i; 259 } 260 } 261 wins[winner]++; 262 } 263 264 return wins.map((w) => w / iterations); 265} 266 267/** 268 * Seeded PRNG (mulberry32) for deterministic results 269 * Same input data → same probabilities → no flicker 270 */ 271function createSeededRng(seed: number): () => number { 272 return () => { 273 seed |= 0; 274 seed = (seed + 0x6d2b79f5) | 0; 275 let t = Math.imul(seed ^ (seed >>> 15), 1 | seed); 276 t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t; 277 return ((t ^ (t >>> 14)) >>> 0) / 4294967296; 278 }; 279} 280 281/** 282 * Hash scores to create a deterministic seed 283 */ 284function hashScores(groups: GroupStats[]): number { 285 let hash = 0; 286 for (const g of groups) { 287 for (const s of g.scores) { 288 hash = ((hash << 5) - hash + s * 1000) | 0; 289 } 290 } 291 return hash; 292} 293 294/** 295 * Box-Muller transform for generating standard normal random variates 296 */ 297function gaussianRandom(rng: () => number): number { 298 let u: number, v: number, s: number; 299 do { 300 u = rng() * 2 - 1; 301 v = rng() * 2 - 1; 302 s = u * u + v * v; 303 } while (s >= 1 || s === 0); 304 return u * Math.sqrt((-2 * Math.log(s)) / s); 305} 306 307/** 308 * Confidence level based on sample size 309 * low: n < 5, medium: 5-14, high: n ≥ 15 310 */ 311export type ConfidenceLevel = "low" | "medium" | "high"; 312 313export function sampleSizeConfidence(n: number): ConfidenceLevel { 314 if (n < 5) return "low"; 315 if (n < 15) return "medium"; 316 return "high"; 317} 318 319/** Confidence level using minimum sample size across groups */ 320export function comparisonConfidence(groups: GroupStats[]): ConfidenceLevel { 321 if (groups.length === 0) return "low"; 322 const minN = Math.min(...groups.map((g) => g.n)); 323 return sampleSizeConfidence(minN); 324} 325 326/** 327 * Standard normal CDF (Φ function) 328 * Uses Abramowitz & Stegun approximation (7.1.26), accurate to 1.5×10⁻⁷ 329 */ 330export function normalCDF(x: number): number { 331 const a1 = 0.254829592; 332 const a2 = -0.284496736; 333 const a3 = 1.421413741; 334 const a4 = -1.453152027; 335 const a5 = 1.061405429; 336 const p = 0.3275911; 337 338 const sign = x < 0 ? -1 : 1; 339 x = Math.abs(x) / Math.sqrt(2); 340 341 const t = 1 / (1 + p * x); 342 const y = 343 1 - ((((a5 * t + a4) * t + a3) * t + a2) * t + a1) * t * Math.exp(-x * x); 344 345 return 0.5 * (1 + sign * y); 346} 347 348/** 349 * Probability that group A is better (higher mean) than group B 350 * Uses Bayesian posterior with uninformative prior 351 * 352 * P(A > B) = Φ((μA - μB) / √(SE_A² + SE_B²)) 353 * where SE = σ/√n is the standard error of the mean 354 * 355 * Returns value in [0, 1]. Returns 0.5 if insufficient data. 356 */ 357export function probabilityABetterThanB(a: GroupStats, b: GroupStats): number { 358 // Need at least 2 samples each for variance estimate 359 if (a.n < 2 || b.n < 2) return 0.5; 360 361 const meanDiff = a.mean - b.mean; 362 const seA = a.std / Math.sqrt(a.n); 363 const seB = b.std / Math.sqrt(b.n); 364 const seDiff = Math.sqrt(seA * seA + seB * seB); 365 366 if (seDiff === 0) { 367 // No variance - return based on mean comparison 368 return meanDiff > 0 ? 1 : meanDiff < 0 ? 0 : 0.5; 369 } 370 371 return normalCDF(meanDiff / seDiff); 372} 373 374/** 375 * Effect size interpretation thresholds (Cohen's conventions) 376 */ 377export type EffectMagnitude = "negligible" | "small" | "medium" | "large"; 378 379export interface EffectSize { 380 d: number; 381 magnitude: EffectMagnitude; 382} 383 384/** 385 * Cohen's d effect size: (μA - μB) / pooledStd 386 * 387 * Interpretation: |d| < 0.2 negligible, 0.2-0.5 small, 0.5-0.8 medium, ≥0.8 large 388 */ 389export function cohensD(a: GroupStats, b: GroupStats): EffectSize { 390 if (a.n < 2 || b.n < 2) return { d: 0, magnitude: "negligible" }; 391 392 // Weighted pooled standard deviation (proper formula) 393 const var1 = a.std * a.std; 394 const var2 = b.std * b.std; 395 const pooledVar = ((a.n - 1) * var1 + (b.n - 1) * var2) / (a.n + b.n - 2); 396 const pooledStd = Math.sqrt(pooledVar); 397 398 if (pooledStd === 0) { 399 return { 400 d: a.mean === b.mean ? 0 : Infinity, 401 magnitude: a.mean === b.mean ? "negligible" : "large", 402 }; 403 } 404 405 const d = (a.mean - b.mean) / pooledStd; 406 const absD = Math.abs(d); 407 408 let magnitude: EffectMagnitude; 409 if (absD < 0.2) magnitude = "negligible"; 410 else if (absD < 0.5) magnitude = "small"; 411 else if (absD < 0.8) magnitude = "medium"; 412 else magnitude = "large"; 413 414 return { d, magnitude }; 415} 416 417/** 418 * Estimate runs needed for leader to reach target probability 419 * Uses simulation of how standard error decreases with more samples 420 * 421 * Returns estimated additional runs needed, or Infinity if not achievable 422 */ 423export function runsToTargetProbability( 424 leader: GroupStats, 425 others: GroupStats[], 426 targetProb: number = 0.95, 427): number { 428 if (others.length === 0) return 0; 429 430 // Current probability 431 const currentProbs = computeProbabilityBest([leader, ...others]); 432 const currentLeaderProb = currentProbs[0]!; 433 434 if (currentLeaderProb >= targetProb) return 0; 435 if (currentLeaderProb < 0.5) return Infinity; // Not currently winning 436 437 // Find the closest competitor 438 let minProbVsOther = 1; 439 let closestOther: GroupStats | null = null; 440 for (const other of others) { 441 const prob = probabilityABetterThanB(leader, other); 442 if (prob < minProbVsOther) { 443 minProbVsOther = prob; 444 closestOther = other; 445 } 446 } 447 448 if (!closestOther || minProbVsOther >= targetProb) return 0; 449 450 // Simulate adding samples and see when we'd hit target 451 // SE decreases as 1/√n, so variance decreases as 1/n 452 // We need to solve for n where P(leader > closest) = targetProb 453 454 const meanDiff = leader.mean - closestOther.mean; 455 if (meanDiff <= 0) return Infinity; 456 457 const currentN = Math.min(leader.n, closestOther.n); 458 const varLeader = leader.std * leader.std; 459 const varOther = closestOther.std * closestOther.std; 460 461 // Target z-score for desired probability 462 // P(X > 0) = Φ(μ/σ) = targetProb => μ/σ = Φ⁻¹(targetProb) 463 // For 0.95: z ≈ 1.645 464 // For 0.99: z ≈ 2.326 465 const targetZ = inverseCDF(targetProb); 466 467 // We need: meanDiff / √(varLeader/n + varOther/n) = targetZ 468 // Solving for n: n = (varLeader + varOther) * targetZ² / meanDiff² 469 const requiredN = Math.ceil( 470 ((varLeader + varOther) * targetZ * targetZ) / (meanDiff * meanDiff), 471 ); 472 473 return Math.max(0, requiredN - currentN); 474} 475 476/** 477 * Inverse standard normal CDF (quantile function) 478 * Uses Acklam's approximation, accurate to ~1.15×10⁻⁹ 479 */ 480function inverseCDF(p: number): number { 481 if (p <= 0) return -Infinity; 482 if (p >= 1) return Infinity; 483 if (p === 0.5) return 0; 484 485 const a = [ 486 -3.969683028665376e1, 2.209460984245205e2, -2.759285104469687e2, 487 1.38357751867269e2, -3.066479806614716e1, 2.506628277459239, 488 ]; 489 const b = [ 490 -5.447609879822406e1, 1.615858368580409e2, -1.556989798598866e2, 491 6.680131188771972e1, -1.328068155288572e1, 492 ]; 493 const c = [ 494 -7.784894002430293e-3, -3.223964580411365e-1, -2.400758277161838, 495 -2.549732539343734, 4.374664141464968, 2.938163982698783, 496 ]; 497 const d = [ 498 7.784695709041462e-3, 3.224671290700398e-1, 2.445134137142996, 499 3.754408661907416, 500 ]; 501 502 const pLow = 0.02425; 503 const pHigh = 1 - pLow; 504 505 let q: number, r: number; 506 507 if (p < pLow) { 508 q = Math.sqrt(-2 * Math.log(p)); 509 return ( 510 (((((c[0]! * q + c[1]!) * q + c[2]!) * q + c[3]!) * q + c[4]!) * q + 511 c[5]!) / 512 ((((d[0]! * q + d[1]!) * q + d[2]!) * q + d[3]!) * q + 1) 513 ); 514 } else if (p <= pHigh) { 515 q = p - 0.5; 516 r = q * q; 517 return ( 518 ((((((a[0]! * r + a[1]!) * r + a[2]!) * r + a[3]!) * r + a[4]!) * r + 519 a[5]!) * 520 q) / 521 (((((b[0]! * r + b[1]!) * r + b[2]!) * r + b[3]!) * r + b[4]!) * r + 1) 522 ); 523 } else { 524 q = Math.sqrt(-2 * Math.log(1 - p)); 525 return ( 526 -( 527 ((((c[0]! * q + c[1]!) * q + c[2]!) * q + c[3]!) * q + c[4]!) * q + 528 c[5]! 529 ) / 530 ((((d[0]! * q + d[1]!) * q + d[2]!) * q + d[3]!) * q + 1) 531 ); 532 } 533} 534 535/** Round run estimates to avoid false precision: 5, 10, 25, 50, 100, or ∞ */ 536export function bucketRunEstimate(n: number): number { 537 if (!isFinite(n) || n <= 0) return 0; 538 if (n <= 7) return 5; 539 if (n <= 17) return 10; 540 if (n <= 35) return 25; 541 if (n <= 75) return 50; 542 if (n <= 150) return 100; 543 return Infinity; 544} 545 546/** 547 * Format a bucketed run estimate for display 548 */ 549export function formatRunEstimate(n: number): string { 550 const bucketed = bucketRunEstimate(n); 551 if (bucketed === 0) return ""; 552 if (!isFinite(bucketed)) return "~100+"; 553 return `~${bucketed}`; 554} 555 556/** 557 * Interpret probability for verdict 558 */ 559export type VerdictLevel = "winner" | "likely" | "unclear" | "insufficient"; 560 561export function verdictFromProbability(prob: number, n: number): VerdictLevel { 562 if (n < 5) return "insufficient"; 563 if (prob >= 0.95) return "winner"; 564 if (prob >= 0.8) return "likely"; 565 return "unclear"; 566} 567 568/** 569 * Uncertainty margin for a probability estimate based on sample size 570 * Uses standard error of a proportion: sqrt(p*(1-p)/n) 571 * 572 * Returns a value representing the uncertainty (e.g., 0.05 means ±5%) 573 */ 574export function probabilityUncertainty(probability: number, n: number): number { 575 if (n < 2) return 0.5; // Maximum uncertainty with insufficient data 576 // Standard error of a proportion 577 return Math.sqrt((probability * (1 - probability)) / n); 578} 579 580/** 581 * Effect size of leader vs closest competitor 582 * Returns the Cohen's d effect size comparing leader to the closest other group 583 * Returns null if there are no other groups to compare 584 */ 585export function leaderEffectSize( 586 leader: GroupStats, 587 others: GroupStats[], 588): EffectSize | null { 589 if (others.length === 0) return null; 590 591 // Find the closest competitor (highest mean among others) 592 let closest: GroupStats | null = null; 593 let highestMean = -Infinity; 594 for (const other of others) { 595 if (other.n >= 2 && other.mean > highestMean) { 596 highestMean = other.mean; 597 closest = other; 598 } 599 } 600 601 if (!closest || leader.n < 2) return null; 602 603 return cohensD(leader, closest); 604}