Create, run, rate, and iterate on your Claude Skills
claude-skills
at main 93 lines 2.4 kB view raw
import { mkdir, readdir, readFile, writeFile } from "node:fs/promises";
import { join } from "node:path";
import { mean, sampleStd } from "./stats.js";

/** Outcome of a single run, as serialized to `run-NNN.eval.json` by {@link saveRunResult}. */
export interface RunResult {
  status: "ok" | "fail";
  score?: number;
  reason?: string;
  error?: string;
  durationMs: number;
  costUsd?: number;
}

/** File-name suffix that marks a persisted run result. */
const EVAL_FILE_SUFFIX = ".eval.json";

/**
 * Width the run number is padded to when building sort keys. Wider than the
 * 3 digits used in file names so that runs past 999 still order numerically.
 */
const RUN_NUMBER_SORT_WIDTH = 12;

/**
 * Lists the result file names (not full paths) found directly in `resultsDir`.
 * Rejects if the directory cannot be read; callers decide how to recover.
 */
async function listEvalFiles(resultsDir: string): Promise<string[]> {
  const entries = await readdir(resultsDir);
  return entries.filter((entry) => entry.endsWith(EVAL_FILE_SUFFIX));
}

/**
 * Builds a sort key in which the number after a leading `run-` is zero-padded,
 * so plain string comparison orders `run-101` before `run-1000`. Names that do
 * not start with `run-<digits>` are returned unchanged.
 */
function runOrderKey(fileName: string): string {
  return fileName.replace(
    /^run-(\d+)/,
    (_match, digits: string) =>
      `run-${digits.padStart(RUN_NUMBER_SORT_WIDTH, "0")}`,
  );
}

/**
 * Ensures the results directory for one fixture/combo/hash triple exists and
 * returns its path.
 *
 * Also writes a `.gitignore` into that directory on every call. A failure to
 * write the `.gitignore` is ignored on purpose; a failure to create the
 * directory itself rejects.
 *
 * @returns `resultsBaseDir/fixture/comboName/hash`, joined with `node:path`.
 */
export async function getResultsDir(
  resultsBaseDir: string,
  fixture: string,
  comboName: string,
  hash: string,
): Promise<string> {
  const dir = join(resultsBaseDir, fixture, comboName, hash);
  await mkdir(dir, { recursive: true });

  // Create .gitignore in this hash directory to ignore workspaces and logs
  const gitignorePath = join(dir, ".gitignore");
  const gitignoreContent = `run-*/
*.log
`;
  // Best-effort: the .gitignore is a convenience, so a write error is swallowed.
  await writeFile(gitignorePath, gitignoreContent).catch(() => {});

  return dir;
}

/**
 * Counts the `*.eval.json` files directly inside `resultsDir`.
 *
 * @returns The number of matching entries, or 0 if the directory cannot be read.
 */
export async function countExistingRuns(resultsDir: string): Promise<number> {
  try {
    return (await listEvalFiles(resultsDir)).length;
  } catch {
    return 0;
  }
}

/**
 * Loads every `*.eval.json` file in `resultsDir`, ordered by run number.
 *
 * A file that cannot be read, is not valid JSON, or does not contain a JSON
 * object is skipped, so one truncated or corrupt file does not discard the
 * results of every other run.
 *
 * @returns The parsed results, or an empty array if the directory cannot be
 *   read. The parsed objects are not validated field by field.
 */
export async function loadExistingResults(
  resultsDir: string,
): Promise<RunResult[]> {
  let fileNames: string[];
  try {
    fileNames = await listEvalFiles(resultsDir);
  } catch {
    return [];
  }

  // Compare padded keys by code unit: a total order that matches the default
  // sort for 3-digit run numbers and stays numeric once runs exceed 999.
  const ordered = fileNames
    .map((name) => ({ name, key: runOrderKey(name) }))
    .sort((a, b) => (a.key < b.key ? -1 : a.key > b.key ? 1 : 0));

  const results: RunResult[] = [];
  for (const { name } of ordered) {
    try {
      const content = await readFile(join(resultsDir, name), "utf-8");
      const parsed: unknown = JSON.parse(content);
      if (
        typeof parsed === "object" &&
        parsed !== null &&
        !Array.isArray(parsed)
      ) {
        results.push(parsed as RunResult);
      }
    } catch {
      // Unreadable or malformed file: skip it and keep the remaining results.
    }
  }

  return results;
}

/**
 * Writes `result` as pretty-printed JSON to `run-NNN.eval.json` inside
 * `resultsDir`, where `NNN` is `runNumber` left-padded with zeros to at least
 * three digits. Rejects if the file cannot be written.
 */
export async function saveRunResult(
  resultsDir: string,
  runNumber: number,
  result: RunResult,
): Promise<void> {
  const padded = String(runNumber).padStart(3, "0");
  await writeFile(
    join(resultsDir, `run-${padded}${EVAL_FILE_SUFFIX}`),
    JSON.stringify(result, null, 2),
  );
  // Note: log file is now written via streaming during the run
}

/**
 * Summarizes a set of run results.
 *
 * Only results with status `"ok"` contribute scores, and only scores that are
 * finite numbers are kept. `JSON.stringify` writes NaN and Infinity as `null`,
 * so a result reloaded from disk can carry a `null` score; dropping non-finite
 * values keeps in-memory and reloaded results consistent.
 *
 * @returns Counts of all / ok / non-ok results, the collected scores, and
 *   their `mean` and `sampleStd` as computed by `./stats.js`.
 */
export function calculateStats(results: RunResult[]): {
  runs: number;
  ok: number;
  fail: number;
  mean: number;
  std: number;
  scores: number[];
} {
  const okResults = results.filter((r) => r.status === "ok");
  const scores = okResults
    .map((r) => r.score)
    .filter(
      (score): score is number =>
        typeof score === "number" && Number.isFinite(score),
    );

  return {
    runs: results.length,
    ok: okResults.length,
    fail: results.length - okResults.length,
    mean: mean(scores),
    std: sampleStd(scores),
    scores,
  };
}