// claude-skills: create, run, rate, and iterate on your Claude Skills
1/**
2 * Statistical functions for benchmark analysis
3 *
4 * All functions use sample statistics (Bessel's correction: n-1 denominator)
5 * for unbiased estimation, which is standard for inferential statistics.
6 */
7
8/**
9 * Compute mean of an array
10 */
11export function mean(arr: number[]): number {
12 if (arr.length === 0) return 0;
13 return arr.reduce((sum, x) => sum + x, 0) / arr.length;
14}
15
16/**
17 * Compute sample standard deviation (Bessel-corrected, n-1 denominator)
18 * This is the unbiased estimator of population std.
19 */
20export function sampleStd(arr: number[]): number {
21 if (arr.length < 2) return 0;
22 const m = mean(arr);
23 const variance =
24 arr.reduce((sum, x) => sum + (x - m) ** 2, 0) / (arr.length - 1);
25 return Math.sqrt(variance);
26}
27
28/**
29 * Compute sample variance (Bessel-corrected)
30 */
31export function sampleVariance(arr: number[]): number {
32 if (arr.length < 2) return 0;
33 const m = mean(arr);
34 return arr.reduce((sum, x) => sum + (x - m) ** 2, 0) / (arr.length - 1);
35}
36
37/**
38 * Welch's t-test for two independent samples with unequal variances
39 * Returns two-tailed p-value
40 *
41 * This is the proper implementation using:
42 * - Sample variances (n-1 denominator)
43 * - Welch-Satterthwaite degrees of freedom
44 * - Student's t-distribution (not normal approximation)
45 */
46export function welchTTest(group1: number[], group2: number[]): number {
47 if (group1.length < 2 || group2.length < 2) return 1;
48
49 const n1 = group1.length;
50 const n2 = group2.length;
51 const mean1 = mean(group1);
52 const mean2 = mean(group2);
53 const var1 = sampleVariance(group1);
54 const var2 = sampleVariance(group2);
55
56 // Standard error of the difference
57 const se1 = var1 / n1;
58 const se2 = var2 / n2;
59 const se = Math.sqrt(se1 + se2);
60
61 if (se === 0) return mean1 === mean2 ? 1 : 0;
62
63 // t-statistic
64 const t = Math.abs(mean1 - mean2) / se;
65
66 // Welch-Satterthwaite degrees of freedom
67 const df = (se1 + se2) ** 2 / (se1 ** 2 / (n1 - 1) + se2 ** 2 / (n2 - 1));
68
69 // Two-tailed p-value from t-distribution
70 return tDistributionPValue(t, df);
71}
72
73/**
74 * Two-tailed p-value from Student's t-distribution
75 * Uses the regularized incomplete beta function
76 */
77function tDistributionPValue(t: number, df: number): number {
78 // P(|T| > t) = I_{df/(df+t²)}(df/2, 1/2) for two-tailed test
79 const x = df / (df + t * t);
80 return regularizedIncompleteBeta(x, df / 2, 0.5);
81}
82
83/**
84 * Regularized incomplete beta function I_x(a,b)
85 * Used for computing t-distribution CDF
86 */
87function regularizedIncompleteBeta(x: number, a: number, b: number): number {
88 if (x === 0) return 0;
89 if (x === 1) return 1;
90
91 // Use continued fraction representation
92 const bt = Math.exp(
93 logGamma(a + b) -
94 logGamma(a) -
95 logGamma(b) +
96 a * Math.log(x) +
97 b * Math.log(1 - x),
98 );
99
100 // Use symmetry for better convergence
101 if (x < (a + 1) / (a + b + 2)) {
102 return (bt * betaContinuedFraction(x, a, b)) / a;
103 } else {
104 return 1 - (bt * betaContinuedFraction(1 - x, b, a)) / b;
105 }
106}
107
108/**
109 * Log gamma function using Lanczos approximation
110 * Accurate to ~15 decimal places
111 */
112function logGamma(x: number): number {
113 const g = 7;
114 const c = [
115 0.99999999999980993, 676.5203681218851, -1259.1392167224028,
116 771.32342877765313, -176.61502916214059, 12.507343278686905,
117 -0.13857109526572012, 9.9843695780195716e-6, 1.5056327351493116e-7,
118 ];
119
120 if (x < 0.5) {
121 return Math.log(Math.PI / Math.sin(Math.PI * x)) - logGamma(1 - x);
122 }
123
124 x -= 1;
125 let sum = c[0]!;
126 for (let i = 1; i < g + 2; i++) {
127 sum += c[i]! / (x + i);
128 }
129
130 const t = x + g + 0.5;
131 return (
132 0.5 * Math.log(2 * Math.PI) + (x + 0.5) * Math.log(t) - t + Math.log(sum)
133 );
134}
135
/**
 * Continued fraction expansion for the incomplete beta function,
 * evaluated with the modified Lentz algorithm for numerical stability.
 *
 * Converges for x < (a+1)/(a+b+2); the caller relies on the symmetry
 * I_x(a,b) = 1 - I_{1-x}(b,a) to stay in that regime.
 */
function betaContinuedFraction(x: number, a: number, b: number): number {
  const maxIter = 200; // generous cap; typically converges in far fewer steps
  const eps = 1e-14; // relative-change convergence threshold
  const tiny = 1e-30; // floor preventing division by zero in the Lentz updates

  const qab = a + b;
  const qap = a + 1;
  const qam = a - 1;

  // Lentz state: h accumulates the fraction's value, c and d hold the
  // forward and backward partial ratios.
  let c = 1;
  let d = 1 - (qab * x) / qap;
  if (Math.abs(d) < tiny) d = tiny;
  d = 1 / d;
  let h = d;

  for (let m = 1; m <= maxIter; m++) {
    const m2 = 2 * m;

    // Even step of the recurrence (coefficient d_{2m}).
    let aa = (m * (b - m) * x) / ((qam + m2) * (a + m2));
    d = 1 + aa * d;
    if (Math.abs(d) < tiny) d = tiny;
    c = 1 + aa / c;
    if (Math.abs(c) < tiny) c = tiny;
    d = 1 / d;
    h *= d * c;

    // Odd step of the recurrence (coefficient d_{2m+1}).
    aa = (-(a + m) * (qab + m) * x) / ((a + m2) * (qap + m2));
    d = 1 + aa * d;
    if (Math.abs(d) < tiny) d = tiny;
    c = 1 + aa / c;
    if (Math.abs(c) < tiny) c = tiny;
    d = 1 / d;
    const del = d * c;
    h *= del;

    // Stop once one full iteration changes the product by less than eps.
    if (Math.abs(del - 1) < eps) break;
  }

  return h;
}
182
/**
 * Summary statistics for one group of benchmark scores.
 * `mean` and `std` are assumed consistent with `scores` — TODO confirm
 * at the call sites that populate this; `std` uses the sample (n-1)
 * convention used throughout this module.
 */
export interface GroupStats {
  n: number; // number of runs in the group
  mean: number; // arithmetic mean of scores
  std: number; // sample std (n-1 denominator)
  scores: number[]; // raw per-run scores backing the summary
}
192
/**
 * Compute "probability best" for each combo using Monte Carlo simulation
 * with empirical Bayes variance shrinkage.
 *
 * Each group's variance is shrunk toward the pooled variance across all groups.
 * This prevents overconfidence when a group shows zero or low variance due to
 * small sample size (e.g., all 4s doesn't mean the true population can't produce 5s).
 *
 * Results are deterministic for a given input: the RNG is seeded from the
 * scores themselves, so re-rendering identical data cannot flicker.
 *
 * @param groups - Array of group statistics
 * @param iterations - Number of Monte Carlo iterations (default 2000)
 * @returns Array of probabilities in same order as input (sums to ~1)
 */
export function computeProbabilityBest(
  groups: GroupStats[],
  iterations = 2000,
): number[] {
  if (groups.length === 0) return [];
  if (groups.length === 1) return [1];

  // Need at least 2 samples per group for meaningful estimation.
  // Note: validGroups is only used for this all-or-nothing check; groups
  // with n < 2 still participate below via the pooled-variance fallback.
  const validGroups = groups.filter((g) => g.n >= 2);
  if (validGroups.length === 0) return groups.map(() => 1 / groups.length);

  // Compute pooled variance across all groups (empirical Bayes prior)
  const allScores = groups.flatMap((g) => g.scores);
  const pooledVar = allScores.length >= 2 ? sampleVariance(allScores) : 0;

  // If all scores are identical, we can't distinguish - return equal probabilities
  if (pooledVar === 0) {
    return groups.map(() => 1 / groups.length);
  }

  // Prior strength: how many "pseudo-samples" of pooled variance to blend in.
  // 2 is conventional - enough to prevent zero-variance issues without
  // overwhelming the observed data as sample size grows.
  const priorStrength = 2;

  // Compute standard error of mean for each group with shrinkage
  const standardErrors = groups.map((g) => {
    if (g.n < 2) return Math.sqrt(pooledVar); // fallback for tiny samples
    const groupVar = g.std * g.std;
    // Shrink group variance toward pooled: weighted average
    const effectiveVar =
      (g.n * groupVar + priorStrength * pooledVar) / (g.n + priorStrength);
    // Standard error of mean
    return Math.sqrt(effectiveVar / g.n);
  });

  // Seed RNG from scores for deterministic results (prevents flicker)
  const rng = createSeededRng(hashScores(groups));

  const wins = groups.map(() => 0);

  for (let iter = 0; iter < iterations; iter++) {
    // Sample plausible true means from each group's posterior
    const sampledMeans = groups.map(
      (g, i) => g.mean + gaussianRandom(rng) * standardErrors[i]!,
    );

    // Find winner (highest sampled mean; ties go to the earlier group)
    let maxMean = -Infinity;
    let winner = 0;
    for (let i = 0; i < sampledMeans.length; i++) {
      if (sampledMeans[i]! > maxMean) {
        maxMean = sampledMeans[i]!;
        winner = i;
      }
    }
    wins[winner]++;
  }

  return wins.map((w) => w / iterations);
}
266
267/**
268 * Seeded PRNG (mulberry32) for deterministic results
269 * Same input data → same probabilities → no flicker
270 */
271function createSeededRng(seed: number): () => number {
272 return () => {
273 seed |= 0;
274 seed = (seed + 0x6d2b79f5) | 0;
275 let t = Math.imul(seed ^ (seed >>> 15), 1 | seed);
276 t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t;
277 return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
278 };
279}
280
281/**
282 * Hash scores to create a deterministic seed
283 */
284function hashScores(groups: GroupStats[]): number {
285 let hash = 0;
286 for (const g of groups) {
287 for (const s of g.scores) {
288 hash = ((hash << 5) - hash + s * 1000) | 0;
289 }
290 }
291 return hash;
292}
293
294/**
295 * Box-Muller transform for generating standard normal random variates
296 */
297function gaussianRandom(rng: () => number): number {
298 let u: number, v: number, s: number;
299 do {
300 u = rng() * 2 - 1;
301 v = rng() * 2 - 1;
302 s = u * u + v * v;
303 } while (s >= 1 || s === 0);
304 return u * Math.sqrt((-2 * Math.log(s)) / s);
305}
306
307/**
308 * Confidence level based on sample size
309 * low: n < 5, medium: 5-14, high: n ≥ 15
310 */
311export type ConfidenceLevel = "low" | "medium" | "high";
312
313export function sampleSizeConfidence(n: number): ConfidenceLevel {
314 if (n < 5) return "low";
315 if (n < 15) return "medium";
316 return "high";
317}
318
319/** Confidence level using minimum sample size across groups */
320export function comparisonConfidence(groups: GroupStats[]): ConfidenceLevel {
321 if (groups.length === 0) return "low";
322 const minN = Math.min(...groups.map((g) => g.n));
323 return sampleSizeConfidence(minN);
324}
325
326/**
327 * Standard normal CDF (Φ function)
328 * Uses Abramowitz & Stegun approximation (7.1.26), accurate to 1.5×10⁻⁷
329 */
330export function normalCDF(x: number): number {
331 const a1 = 0.254829592;
332 const a2 = -0.284496736;
333 const a3 = 1.421413741;
334 const a4 = -1.453152027;
335 const a5 = 1.061405429;
336 const p = 0.3275911;
337
338 const sign = x < 0 ? -1 : 1;
339 x = Math.abs(x) / Math.sqrt(2);
340
341 const t = 1 / (1 + p * x);
342 const y =
343 1 - ((((a5 * t + a4) * t + a3) * t + a2) * t + a1) * t * Math.exp(-x * x);
344
345 return 0.5 * (1 + sign * y);
346}
347
348/**
349 * Probability that group A is better (higher mean) than group B
350 * Uses Bayesian posterior with uninformative prior
351 *
352 * P(A > B) = Φ((μA - μB) / √(SE_A² + SE_B²))
353 * where SE = σ/√n is the standard error of the mean
354 *
355 * Returns value in [0, 1]. Returns 0.5 if insufficient data.
356 */
357export function probabilityABetterThanB(a: GroupStats, b: GroupStats): number {
358 // Need at least 2 samples each for variance estimate
359 if (a.n < 2 || b.n < 2) return 0.5;
360
361 const meanDiff = a.mean - b.mean;
362 const seA = a.std / Math.sqrt(a.n);
363 const seB = b.std / Math.sqrt(b.n);
364 const seDiff = Math.sqrt(seA * seA + seB * seB);
365
366 if (seDiff === 0) {
367 // No variance - return based on mean comparison
368 return meanDiff > 0 ? 1 : meanDiff < 0 ? 0 : 0.5;
369 }
370
371 return normalCDF(meanDiff / seDiff);
372}
373
/**
 * Effect size interpretation thresholds (Cohen's conventions):
 * |d| < 0.2 negligible, 0.2–0.5 small, 0.5–0.8 medium, ≥ 0.8 large.
 */
export type EffectMagnitude = "negligible" | "small" | "medium" | "large";

/** Cohen's d together with its qualitative interpretation. */
export interface EffectSize {
  d: number; // signed standardized mean difference
  magnitude: EffectMagnitude; // bucketed |d| per the thresholds above
}
383
384/**
385 * Cohen's d effect size: (μA - μB) / pooledStd
386 *
387 * Interpretation: |d| < 0.2 negligible, 0.2-0.5 small, 0.5-0.8 medium, ≥0.8 large
388 */
389export function cohensD(a: GroupStats, b: GroupStats): EffectSize {
390 if (a.n < 2 || b.n < 2) return { d: 0, magnitude: "negligible" };
391
392 // Weighted pooled standard deviation (proper formula)
393 const var1 = a.std * a.std;
394 const var2 = b.std * b.std;
395 const pooledVar = ((a.n - 1) * var1 + (b.n - 1) * var2) / (a.n + b.n - 2);
396 const pooledStd = Math.sqrt(pooledVar);
397
398 if (pooledStd === 0) {
399 return {
400 d: a.mean === b.mean ? 0 : Infinity,
401 magnitude: a.mean === b.mean ? "negligible" : "large",
402 };
403 }
404
405 const d = (a.mean - b.mean) / pooledStd;
406 const absD = Math.abs(d);
407
408 let magnitude: EffectMagnitude;
409 if (absD < 0.2) magnitude = "negligible";
410 else if (absD < 0.5) magnitude = "small";
411 else if (absD < 0.8) magnitude = "medium";
412 else magnitude = "large";
413
414 return { d, magnitude };
415}
416
/**
 * Estimate additional runs needed for the current leader to reach the
 * target "probability best".
 *
 * Strategy: find the closest competitor (lowest pairwise win probability)
 * and solve for the sample size at which the standard error of the mean
 * difference shrinks enough that P(leader > competitor) hits the target.
 *
 * @param leader - Stats for the currently leading group
 * @param others - Stats for the remaining groups
 * @param targetProb - Desired probability threshold (default 0.95)
 * @returns Estimated additional runs needed, 0 if already achieved,
 *          or Infinity if not achievable (leader not actually ahead)
 */
export function runsToTargetProbability(
  leader: GroupStats,
  others: GroupStats[],
  targetProb: number = 0.95,
): number {
  if (others.length === 0) return 0;

  // Current probability that the leader is best among all groups
  const currentProbs = computeProbabilityBest([leader, ...others]);
  const currentLeaderProb = currentProbs[0]!;

  if (currentLeaderProb >= targetProb) return 0;
  if (currentLeaderProb < 0.5) return Infinity; // Not currently winning

  // Find the closest competitor (lowest pairwise win probability)
  let minProbVsOther = 1;
  let closestOther: GroupStats | null = null;
  for (const other of others) {
    const prob = probabilityABetterThanB(leader, other);
    if (prob < minProbVsOther) {
      minProbVsOther = prob;
      closestOther = other;
    }
  }

  if (!closestOther || minProbVsOther >= targetProb) return 0;

  // Simulate adding samples and see when we'd hit target:
  // SE decreases as 1/√n, so variance decreases as 1/n.
  // We need to solve for n where P(leader > closest) = targetProb.

  const meanDiff = leader.mean - closestOther.mean;
  // Can't pull ahead of a tie or deficit just by adding samples
  if (meanDiff <= 0) return Infinity;

  const currentN = Math.min(leader.n, closestOther.n);
  const varLeader = leader.std * leader.std;
  const varOther = closestOther.std * closestOther.std;

  // Target z-score for desired probability:
  // P(X > 0) = Φ(μ/σ) = targetProb => μ/σ = Φ⁻¹(targetProb)
  // For 0.95: z ≈ 1.645
  // For 0.99: z ≈ 2.326
  const targetZ = inverseCDF(targetProb);

  // We need: meanDiff / √(varLeader/n + varOther/n) = targetZ
  // Solving for n: n = (varLeader + varOther) * targetZ² / meanDiff²
  const requiredN = Math.ceil(
    ((varLeader + varOther) * targetZ * targetZ) / (meanDiff * meanDiff),
  );

  return Math.max(0, requiredN - currentN);
}
475
476/**
477 * Inverse standard normal CDF (quantile function)
478 * Uses Acklam's approximation, accurate to ~1.15×10⁻⁹
479 */
480function inverseCDF(p: number): number {
481 if (p <= 0) return -Infinity;
482 if (p >= 1) return Infinity;
483 if (p === 0.5) return 0;
484
485 const a = [
486 -3.969683028665376e1, 2.209460984245205e2, -2.759285104469687e2,
487 1.38357751867269e2, -3.066479806614716e1, 2.506628277459239,
488 ];
489 const b = [
490 -5.447609879822406e1, 1.615858368580409e2, -1.556989798598866e2,
491 6.680131188771972e1, -1.328068155288572e1,
492 ];
493 const c = [
494 -7.784894002430293e-3, -3.223964580411365e-1, -2.400758277161838,
495 -2.549732539343734, 4.374664141464968, 2.938163982698783,
496 ];
497 const d = [
498 7.784695709041462e-3, 3.224671290700398e-1, 2.445134137142996,
499 3.754408661907416,
500 ];
501
502 const pLow = 0.02425;
503 const pHigh = 1 - pLow;
504
505 let q: number, r: number;
506
507 if (p < pLow) {
508 q = Math.sqrt(-2 * Math.log(p));
509 return (
510 (((((c[0]! * q + c[1]!) * q + c[2]!) * q + c[3]!) * q + c[4]!) * q +
511 c[5]!) /
512 ((((d[0]! * q + d[1]!) * q + d[2]!) * q + d[3]!) * q + 1)
513 );
514 } else if (p <= pHigh) {
515 q = p - 0.5;
516 r = q * q;
517 return (
518 ((((((a[0]! * r + a[1]!) * r + a[2]!) * r + a[3]!) * r + a[4]!) * r +
519 a[5]!) *
520 q) /
521 (((((b[0]! * r + b[1]!) * r + b[2]!) * r + b[3]!) * r + b[4]!) * r + 1)
522 );
523 } else {
524 q = Math.sqrt(-2 * Math.log(1 - p));
525 return (
526 -(
527 ((((c[0]! * q + c[1]!) * q + c[2]!) * q + c[3]!) * q + c[4]!) * q +
528 c[5]!
529 ) /
530 ((((d[0]! * q + d[1]!) * q + d[2]!) * q + d[3]!) * q + 1)
531 );
532 }
533}
534
535/** Round run estimates to avoid false precision: 5, 10, 25, 50, 100, or ∞ */
536export function bucketRunEstimate(n: number): number {
537 if (!isFinite(n) || n <= 0) return 0;
538 if (n <= 7) return 5;
539 if (n <= 17) return 10;
540 if (n <= 35) return 25;
541 if (n <= 75) return 50;
542 if (n <= 150) return 100;
543 return Infinity;
544}
545
546/**
547 * Format a bucketed run estimate for display
548 */
549export function formatRunEstimate(n: number): string {
550 const bucketed = bucketRunEstimate(n);
551 if (bucketed === 0) return "";
552 if (!isFinite(bucketed)) return "~100+";
553 return `~${bucketed}`;
554}
555
556/**
557 * Interpret probability for verdict
558 */
559export type VerdictLevel = "winner" | "likely" | "unclear" | "insufficient";
560
561export function verdictFromProbability(prob: number, n: number): VerdictLevel {
562 if (n < 5) return "insufficient";
563 if (prob >= 0.95) return "winner";
564 if (prob >= 0.8) return "likely";
565 return "unclear";
566}
567
568/**
569 * Uncertainty margin for a probability estimate based on sample size
570 * Uses standard error of a proportion: sqrt(p*(1-p)/n)
571 *
572 * Returns a value representing the uncertainty (e.g., 0.05 means ±5%)
573 */
574export function probabilityUncertainty(probability: number, n: number): number {
575 if (n < 2) return 0.5; // Maximum uncertainty with insufficient data
576 // Standard error of a proportion
577 return Math.sqrt((probability * (1 - probability)) / n);
578}
579
580/**
581 * Effect size of leader vs closest competitor
582 * Returns the Cohen's d effect size comparing leader to the closest other group
583 * Returns null if there are no other groups to compare
584 */
585export function leaderEffectSize(
586 leader: GroupStats,
587 others: GroupStats[],
588): EffectSize | null {
589 if (others.length === 0) return null;
590
591 // Find the closest competitor (highest mean among others)
592 let closest: GroupStats | null = null;
593 let highestMean = -Infinity;
594 for (const other of others) {
595 if (other.n >= 2 && other.mean > highestMean) {
596 highestMean = other.mean;
597 closest = other;
598 }
599 }
600
601 if (!closest || leader.n < 2) return null;
602
603 return cohensD(leader, closest);
604}