Create, run, rate, and iterate on your Claude Skills
claude-skills
1import { mkdir, readdir, readFile, writeFile } from "node:fs/promises";
2import { join } from "node:path";
3import { mean, sampleStd } from "./stats.js";
4
/**
 * Outcome of a single run, in the shape that `saveRunResult` serialises to a
 * `run-NNN.eval.json` file and `loadExistingResults` reads back.
 */
export interface RunResult {
  /** Run outcome; `calculateStats` only aggregates scores of `"ok"` runs. */
  status: "ok" | "fail";
  /** Presumably the run's duration in milliseconds (per the name) — confirm with the producer. */
  durationMs: number;
  /** Numeric score of the run, when one was produced. */
  score?: number;
  /** Presumably a free-text explanation accompanying the score — confirm with the producer. */
  reason?: string;
  /** Presumably an error description for failed runs — confirm with the producer. */
  error?: string;
  /** Presumably the run's cost in US dollars (per the name) — confirm with the producer. */
  costUsd?: number;
}
13
/**
 * Ensures the results directory for one fixture / combo / hash exists and
 * returns its path.
 *
 * The directory is `<resultsBaseDir>/<fixture>/<comboName>/<hash>`; missing
 * parent directories are created. A `.gitignore` that ignores `run-*`
 * directories and `*.log` files is (re)written into it on every call.
 *
 * @param resultsBaseDir - Root directory under which all results live.
 * @param fixture - First path segment below the root.
 * @param comboName - Second path segment below the root.
 * @param hash - Final path segment.
 * @returns The path of the (now existing) directory.
 * @throws If the directory cannot be created.
 */
export async function getResultsDir(
  resultsBaseDir: string,
  fixture: string,
  comboName: string,
  hash: string,
): Promise<string> {
  const targetDir = join(resultsBaseDir, fixture, comboName, hash);
  await mkdir(targetDir, { recursive: true });

  // Keep per-run workspaces and logs out of version control.
  const ignoreFile = join(targetDir, ".gitignore");
  try {
    await writeFile(ignoreFile, "run-*/\n*.log\n");
  } catch {
    // Best effort: a missing .gitignore must not prevent the run.
  }

  return targetDir;
}
32
/**
 * Counts the entries in `resultsDir` whose name ends with `.eval.json`.
 *
 * @param resultsDir - Directory to scan (not recursive).
 * @returns The number of matching entries, or `0` when the directory cannot
 *   be read (for example because it does not exist).
 */
export async function countExistingRuns(resultsDir: string): Promise<number> {
  let names: string[];
  try {
    names = await readdir(resultsDir);
  } catch {
    // An unreadable directory is treated as "no runs yet".
    return 0;
  }

  let total = 0;
  for (const name of names) {
    if (name.endsWith(".eval.json")) {
      total += 1;
    }
  }
  return total;
}
41
/**
 * Loads every `*.eval.json` file in `resultsDir`, in lexicographic file-name
 * order.
 *
 * A file that cannot be read, does not contain valid JSON, or whose JSON is
 * not an object is skipped on its own, so one damaged file (for example a
 * partially written one) does not discard the remaining results.
 *
 * @param resultsDir - Directory to scan (not recursive).
 * @returns The parsed results; an empty array when the directory cannot be
 *   read. Beyond "is a JSON object", the payload is not validated against
 *   `RunResult`.
 */
export async function loadExistingResults(
  resultsDir: string,
): Promise<RunResult[]> {
  let entries: string[];
  try {
    entries = await readdir(resultsDir);
  } catch {
    // A missing or unreadable directory means there are no results yet.
    return [];
  }

  const jsonFiles = entries.filter((e) => e.endsWith(".eval.json")).sort();
  const results: RunResult[] = [];

  for (const file of jsonFiles) {
    try {
      const content = await readFile(join(resultsDir, file), "utf-8");
      const parsed: unknown = JSON.parse(content);
      // `null`, primitives and arrays are valid JSON but cannot be a result.
      if (
        typeof parsed === "object" &&
        parsed !== null &&
        !Array.isArray(parsed)
      ) {
        results.push(parsed as RunResult);
      }
    } catch {
      // Skip just this file; the other results are still usable.
    }
  }

  return results;
}
60
/**
 * Writes one run's result to `<resultsDir>/run-NNN.eval.json` as
 * 2-space-indented JSON, replacing any existing file of that name.
 *
 * @param resultsDir - Directory to write into; it is not created here.
 * @param runNumber - Run index; left-padded with zeros to at least 3 digits
 *   in the file name.
 * @param result - The result to serialise.
 * @throws If the file cannot be written.
 */
export async function saveRunResult(
  resultsDir: string,
  runNumber: number,
  result: RunResult,
): Promise<void> {
  const runLabel = String(runNumber).padStart(3, "0");
  const targetPath = join(resultsDir, `run-${runLabel}.eval.json`);
  const serialized = JSON.stringify(result, null, 2);

  // Only the result is persisted here; the log file is written via streaming
  // during the run.
  await writeFile(targetPath, serialized);
}
73
/**
 * Summarises a set of run results.
 *
 * Only runs with status `"ok"` contribute scores, and only when `score` is
 * actually a number. A missing score is skipped, and so is a `null` score:
 * `JSON.stringify` writes `NaN`/`Infinity` as `null`, so results reloaded
 * from disk can carry `null` despite the declared type.
 *
 * @param results - Results to summarise.
 * @returns `runs` (total), `ok` and `fail` counts (every non-`"ok"` run counts
 *   as a failure), the collected `scores`, and their `mean` and `std` as
 *   computed by `mean` and `sampleStd` from `./stats.js` — how those treat an
 *   empty list is not visible from here.
 */
export function calculateStats(results: RunResult[]): {
  runs: number;
  ok: number;
  fail: number;
  mean: number;
  std: number;
  scores: number[];
} {
  const okResults = results.filter((r) => r.status === "ok");

  // Narrow with a real runtime check instead of asserting `score!`.
  const scores: number[] = [];
  for (const { score } of okResults) {
    if (typeof score === "number") {
      scores.push(score);
    }
  }

  return {
    runs: results.length,
    ok: okResults.length,
    fail: results.length - okResults.length,
    mean: mean(scores),
    std: sampleStd(scores),
    scores,
  };
}