mirror of https://github.com/garrytan/gstack.git
Merge remote-tracking branch 'origin/garrytan/team-supabase-store' into garrytan/dev-mode
This commit is contained in:
commit
c11cb708a5
|
|
@ -11,3 +11,4 @@ bun.lock
|
||||||
.env.local
|
.env.local
|
||||||
.env.*
|
.env.*
|
||||||
!.env.example
|
!.env.example
|
||||||
|
.gstack-sync.json
|
||||||
|
|
|
||||||
23
CHANGELOG.md
23
CHANGELOG.md
|
|
@ -1,5 +1,28 @@
|
||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
## 0.3.10 — 2026-03-15
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- **Team sync via Supabase (optional)** — shared data store for eval results, retro snapshots, QA reports, ship logs, and Greptile triage across team members. All sync operations are non-fatal and non-blocking — skills never wait on network. Offline queue with automatic retry (up to 5 attempts). Zero impact when not configured: without `.gstack-sync.json`, everything works locally as before. See `docs/designs/TEAM_COORDINATION_STORE.md` for architecture and setup.
|
||||||
|
- **Supabase migration SQL** — 4 migration files in `supabase/migrations/` for teams, eval_runs, data tables (retros, QA, ships, Greptile), and eval costs. Row-level security policies ensure team members can only access their own team's data.
|
||||||
|
- **Sync config + auth** — `.gstack-sync.json` for project-level config (Supabase URL, anon key, team slug). `~/.gstack/auth.json` for user-level tokens (keyed by Supabase URL for multi-team support). `GSTACK_SUPABASE_ACCESS_TOKEN` env var for CI/automation. Token refresh built in.
|
||||||
|
- **`gstack sync` CLI** — `status`, `push`, `pull`, `drain`, `login`, `logout` subcommands for managing team sync.
|
||||||
|
- **Universal eval format** — `StandardEvalResult` schema with validation, normalization, and bidirectional legacy conversion. Any language can produce JSON matching this format and push via `gstack eval push`.
|
||||||
|
- **Unified eval CLI** — `gstack eval list|compare|summary|trend|push|cost|cache|watch` consolidating all eval tools into one entry point.
|
||||||
|
- **Per-model cost tracking** — eval results now include `costs[]` with exact per-model token usage (input, output, cache read, cache creation) and API-reported cost. Extracted from `resultLine.modelUsage` in the `claude -p` NDJSON stream. `computeCosts()` prefers exact `cost_usd` over MODEL_PRICING estimates (~4x more accurate with prompt caching).
|
||||||
|
- **LLM judge caching** — SHA-based caching for LLM-as-judge eval calls via `eval-cache.ts`. Cache keyed by `model:prompt`, so unchanged SKILL.md content skips API calls entirely. ~$0.18/run savings. Set `EVAL_CACHE=0` to force re-run.
|
||||||
|
- **Dynamic model selection** — `EVAL_JUDGE_TIER` env var controls which Claude model runs judge evals (haiku/sonnet/opus, default: sonnet). `EVAL_TIER` pins the E2E test model via `--model` flag to `claude -p`.
|
||||||
|
- **`bun run eval:trend`** — per-test pass rate tracking over last N runs. Classifies tests as stable-pass, stable-fail, flaky, improving, or degrading. Sparkline table with `--limit`, `--tier`, `--test` filters.
|
||||||
|
- **Shared utilities** — `lib/util.ts` extracted with `atomicWriteJSON`, `readJSON`, `getGitInfo`, `getRemoteSlug`, `listEvalFiles`, `loadEvalResults`, `formatTimestamp`, and path constants.
|
||||||
|
- 52+ new tests across eval cache, cost, format, tier, trend, sync config, sync client, and LLM judge integration.
|
||||||
|
|
||||||
|
### Changed
|
||||||
|
- `callJudge()` and `judge()` now return `{ result, meta }` with `JudgeMeta` (model, tokens, cached flag). `outcomeJudge()` retains simple return type for E2E callers.
|
||||||
|
- `EvalCollector.finalize()` aggregates per-test `costs[]` into result-level cost breakdown and attempts team sync (non-blocking).
|
||||||
|
- `cli-eval.ts` main block guarded with `import.meta.main` to prevent execution on import.
|
||||||
|
- `eval:summary` now hints to run `eval:trend` when flaky tests are detected.
|
||||||
|
- All 8 LLM eval test sites updated from hard-coded `cost_usd: 0.02` to real API-reported costs.
|
||||||
|
|
||||||
## 0.3.9 — 2026-03-15
|
## 0.3.9 — 2026-03-15
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,7 @@ bun run dev:skill # watch mode: auto-regen + validate on change
|
||||||
bun run eval:list # list all eval runs from ~/.gstack-dev/evals/
|
bun run eval:list # list all eval runs from ~/.gstack-dev/evals/
|
||||||
bun run eval:compare # compare two eval runs (auto-picks most recent)
|
bun run eval:compare # compare two eval runs (auto-picks most recent)
|
||||||
bun run eval:summary # aggregate stats across all eval runs
|
bun run eval:summary # aggregate stats across all eval runs
|
||||||
|
bun run eval:trend # per-test pass rate trends (flaky detection)
|
||||||
```
|
```
|
||||||
|
|
||||||
`test:evals` requires `ANTHROPIC_API_KEY`. E2E tests stream progress in real-time
|
`test:evals` requires `ANTHROPIC_API_KEY`. E2E tests stream progress in real-time
|
||||||
|
|
|
||||||
|
|
@ -169,6 +169,8 @@ When E2E tests run, they produce machine-readable artifacts in `~/.gstack-dev/`:
|
||||||
bun run eval:list # list all eval runs
|
bun run eval:list # list all eval runs
|
||||||
bun run eval:compare # compare two runs (auto-picks most recent)
|
bun run eval:compare # compare two runs (auto-picks most recent)
|
||||||
bun run eval:summary # aggregate stats across all runs
|
bun run eval:summary # aggregate stats across all runs
|
||||||
|
bun run eval:trend # per-test pass rate over last N runs (flaky detection)
|
||||||
|
bun run eval:cache stats # check LLM judge cache hit rate
|
||||||
```
|
```
|
||||||
|
|
||||||
Artifacts are never cleaned up — they accumulate in `~/.gstack-dev/` for post-mortem debugging and trend analysis.
|
Artifacts are never cleaned up — they accumulate in `~/.gstack-dev/` for post-mortem debugging and trend analysis.
|
||||||
|
|
@ -187,7 +189,8 @@ Each dimension is scored 1-5. Threshold: every dimension must score **≥ 4**. T
|
||||||
# Needs ANTHROPIC_API_KEY in .env — included in bun run test:evals
|
# Needs ANTHROPIC_API_KEY in .env — included in bun run test:evals
|
||||||
```
|
```
|
||||||
|
|
||||||
- Uses `claude-sonnet-4-6` for scoring stability
|
- Model defaults to `claude-sonnet-4-6`; override with `EVAL_JUDGE_TIER=haiku|opus`
|
||||||
|
- Results are SHA-cached — unchanged SKILL.md content skips API calls ($0 on repeat runs). Set `EVAL_CACHE=0` to force re-run.
|
||||||
- Tests live in `test/skill-llm-eval.test.ts`
|
- Tests live in `test/skill-llm-eval.test.ts`
|
||||||
- Calls the Anthropic API directly (not `claude -p`), so it works from anywhere including inside Claude Code
|
- Calls the Anthropic API directly (not `claude -p`), so it works from anywhere including inside Claude Code
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -629,6 +629,12 @@ bun run eval:watch # live dashboard during E2E runs
|
||||||
|
|
||||||
E2E tests stream real-time progress, write machine-readable diagnostics, and persist partial results that survive kills. See CONTRIBUTING.md for the full eval infrastructure.
|
E2E tests stream real-time progress, write machine-readable diagnostics, and persist partial results that survive kills. See CONTRIBUTING.md for the full eval infrastructure.
|
||||||
|
|
||||||
|
### Team sync (optional)
|
||||||
|
|
||||||
|
For teams, gstack can sync eval results, retro snapshots, QA reports, and ship logs to a shared Supabase store. Without this, everything works locally as before — sync is purely additive.
|
||||||
|
|
||||||
|
To set up: copy `.gstack-sync.json.example` to `.gstack-sync.json`, create a Supabase project, run the migrations in `supabase/migrations/`, and fill in your credentials. See `docs/designs/TEAM_COORDINATION_STORE.md` for the full guide.
|
||||||
|
|
||||||
## License
|
## License
|
||||||
|
|
||||||
MIT
|
MIT
|
||||||
|
|
|
||||||
4
TODOS.md
4
TODOS.md
|
|
@ -231,7 +231,7 @@
|
||||||
|
|
||||||
**Why:** Spot quality trends — is the app getting better or worse?
|
**Why:** Spot quality trends — is the app getting better or worse?
|
||||||
|
|
||||||
**Context:** QA already writes structured reports. This adds cross-run comparison.
|
**Context:** `eval:trend` now tracks test-level pass rates (eval infrastructure). QA-run-level trending (health scores over time across QA report files) is a separate feature that could reuse the `computeTrends` pattern from `lib/cli-eval.ts`.
|
||||||
|
|
||||||
**Effort:** S
|
**Effort:** S
|
||||||
**Priority:** P2
|
**Priority:** P2
|
||||||
|
|
@ -335,6 +335,8 @@
|
||||||
|
|
||||||
**Why:** Reduce E2E test cost and flakiness.
|
**Why:** Reduce E2E test cost and flakiness.
|
||||||
|
|
||||||
|
**Status:** Model pinning shipped (session-runner.ts passes `--model` from `EVAL_TIER` env). Retry:2 still TODO.
|
||||||
|
|
||||||
**Effort:** XS
|
**Effort:** XS
|
||||||
**Priority:** P2
|
**Priority:** P2
|
||||||
|
|
||||||
|
|
|
||||||
192
lib/cli-eval.ts
192
lib/cli-eval.ts
|
|
@ -258,6 +258,7 @@ async function cmdSummary(args: string[]): Promise<void> {
|
||||||
if (flakyTests.length > 0) {
|
if (flakyTests.length > 0) {
|
||||||
console.log(` Flaky tests (${flakyTests.length}):`);
|
console.log(` Flaky tests (${flakyTests.length}):`);
|
||||||
for (const name of flakyTests) console.log(` - ${name}`);
|
for (const name of flakyTests) console.log(` - ${name}`);
|
||||||
|
console.log(` Run 'bun run eval:trend' for detailed time series.`);
|
||||||
console.log('─'.repeat(60));
|
console.log('─'.repeat(60));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -429,6 +430,191 @@ async function cmdWatch(): Promise<void> {
|
||||||
process.exit(exitCode);
|
process.exit(exitCode);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// --- Trend tracking ---
|
||||||
|
|
||||||
|
/**
 * Pass/fail history of a single test within one tier, together with the
 * stability metrics that `computeTrends` derives from that history.
 */
export interface TestTrend {
  /** Test name as recorded in the eval run. */
  name: string;
  /** Tier of the eval runs this series was collected from. */
  tier: string;
  /** One entry per run that contained the test, ordered oldest first. */
  results: { timestamp: string; passed: boolean }[];
  /** Fraction of entries in `results` that passed (0 to 1). */
  passRate: number;
  /** Run of identical outcomes ending at the newest entry in `results`. */
  streak: { type: 'pass' | 'fail'; count: number };
  /** Number of pass/fail transitions between consecutive entries in `results`. */
  flipCount: number;
  /** Overall classification of the series. */
  status: 'stable-pass' | 'stable-fail' | 'flaky' | 'improving' | 'degrading';
}
|
||||||
|
|
||||||
|
/** Sort rank per status: classifications that need attention come first. */
const TREND_STATUS_RANK: Record<TestTrend['status'], number> = {
  flaky: 0,
  degrading: 1,
  improving: 2,
  'stable-fail': 3,
  'stable-pass': 4,
};

/**
 * Classify one chronological (oldest-first) pass/fail series.
 *
 * @param points - Outcomes of a single test, oldest first.
 * @param passRate - Fraction of `points` that passed (0 to 1).
 * @param flipCount - Number of pass/fail transitions between consecutive points.
 */
function classifyTrend(
  points: ReadonlyArray<{ passed: boolean }>,
  passRate: number,
  flipCount: number,
): TestTrend['status'] {
  const recent = points.slice(-3);
  const earlier = points.slice(0, -3);
  const recentAllPass = recent.length >= 3 && recent.every(p => p.passed);
  const recentHasFail = recent.some(p => !p.passed);
  const earlierHadFailures = earlier.some(p => !p.passed);
  const earlierAllPass = earlier.length > 0 && earlier.every(p => p.passed);

  // A clear recent trend outranks the raw pass rate, so check it first.
  if (recentAllPass && earlierHadFailures) return 'improving';
  if (recentHasFail && earlierAllPass) return 'degrading';
  if (flipCount >= 3 || (passRate > 0.3 && passRate < 0.7)) return 'flaky';
  if (passRate >= 0.9 && flipCount <= 1) return 'stable-pass';
  if (passRate <= 0.1 && flipCount <= 1) return 'stable-fail';
  // Ambiguous middle ground: fall back to the majority outcome.
  return passRate >= 0.5 ? 'stable-pass' : 'stable-fail';
}

/**
 * Compute per-test pass rate trends from eval results.
 * Pure function — no I/O.
 *
 * @param results - Eval runs ordered newest first; they are reversed
 *   internally so every returned series is chronological (oldest first).
 * @param filterTier - When set, only runs whose `tier` equals this value are used.
 * @param filterTest - When set, only tests whose `name` equals this value are used.
 * @returns One trend per distinct (tier, test name) pair, sorted by status
 *   (flaky, degrading, improving, stable-fail, stable-pass), then by
 *   `flipCount` descending, then by name.
 */
export function computeTrends(
  results: EvalResult[],
  filterTier?: string,
  filterTest?: string,
): TestTrend[] {
  // Tier and name are stored on the series itself rather than parsed back out
  // of a "tier:name" string, so values containing ':' can neither be mis-split
  // nor collide with another (tier, name) pair.
  const seriesByKey = new Map<
    string,
    { tier: string; name: string; points: TestTrend['results'] }
  >();

  for (const run of [...results].reverse()) {
    if (filterTier && run.tier !== filterTier) continue;
    for (const t of run.tests) {
      if (filterTest && t.name !== filterTest) continue;
      const key = JSON.stringify([run.tier, t.name]);
      let series = seriesByKey.get(key);
      if (!series) {
        series = { tier: run.tier, name: t.name, points: [] };
        seriesByKey.set(key, series);
      }
      series.points.push({ timestamp: run.timestamp, passed: t.passed });
    }
  }

  const trends: TestTrend[] = [];

  for (const { tier, name, points } of seriesByKey.values()) {
    // A series is only created together with its first point, so it is never empty.
    const passCount = points.filter(p => p.passed).length;
    const passRate = passCount / points.length;

    // Streak: walk backward from the newest point while the outcome is unchanged.
    const newestPassed = points[points.length - 1].passed;
    const streakType: 'pass' | 'fail' = newestPassed ? 'pass' : 'fail';
    let streakCount = 0;
    for (let i = points.length - 1; i >= 0 && points[i].passed === newestPassed; i--) {
      streakCount++;
    }

    // Flip count: transitions between pass and fail.
    let flipCount = 0;
    for (let i = 1; i < points.length; i++) {
      if (points[i].passed !== points[i - 1].passed) flipCount++;
    }

    trends.push({
      name,
      tier,
      results: points,
      passRate,
      streak: { type: streakType, count: streakCount },
      flipCount,
      status: classifyTrend(points, passRate, flipCount),
    });
  }

  // Sort: status rank first, then flipCount desc, then name.
  trends.sort((a, b) => {
    const rankDiff = TREND_STATUS_RANK[a.status] - TREND_STATUS_RANK[b.status];
    if (rankDiff !== 0) return rankDiff;
    if (a.flipCount !== b.flipCount) return b.flipCount - a.flipCount;
    return a.name.localeCompare(b.name);
  });

  return trends;
}
|
||||||
|
|
||||||
|
/**
 * `trend` subcommand: print a per-test pass/fail history table for recent eval runs.
 *
 * Recognised flags:
 *   --limit N   number of runs to load (default 10; must be a positive integer)
 *   --tier X    only include runs of this tier
 *   --test X    only include the test with this name
 *
 * Each row shows the test name, its pass rate, a check/cross sparkline of the
 * most recent results, the current streak and the status; a summary footer
 * counts flaky and degrading tests.
 */
async function cmdTrend(args: string[]): Promise<void> {
  const NAME_COL_WIDTH = 36;
  const NAME_MAX_LEN = 34;

  let limit = 10;
  let filterTier: string | undefined;
  let filterTest: string | undefined;

  for (let i = 0; i < args.length; i++) {
    if (args[i] === '--limit' && args[i + 1]) {
      const raw = args[++i];
      const parsed = parseInt(raw, 10);
      // A NaN or non-positive limit would corrupt the column math below
      // (Math.min, slice, padEnd), so keep the current value instead.
      if (Number.isInteger(parsed) && parsed > 0) limit = parsed;
      else console.error(`Ignoring invalid --limit value '${raw}'; using ${limit}.`);
    }
    else if (args[i] === '--tier' && args[i + 1]) { filterTier = args[++i]; }
    else if (args[i] === '--test' && args[i + 1]) { filterTest = args[++i]; }
  }

  const results = loadEvalResults<EvalResult>(undefined, limit);
  if (results.length === 0) {
    console.log('No eval runs yet. Run: EVALS=1 bun run test:evals');
    return;
  }

  const trends = computeTrends(results, filterTier, filterTest);

  if (trends.length === 0) {
    console.log('No test data matching filters.');
    return;
  }

  // Determine how many result columns to show
  const maxResults = Math.min(limit, Math.max(...trends.map(t => t.results.length)));

  // The history column must fit both the sparkline and its "Last N" header;
  // for small N the header text is the wider of the two.
  const historyHeader = `Last ${maxResults}`;
  const historyWidth = Math.max(maxResults + 3, historyHeader.length + 2);

  console.log('');
  console.log(`Test Trends (last ${results.length} runs)`);
  console.log('═'.repeat(80));
  console.log(
    ' ' +
    'Test Name'.padEnd(NAME_COL_WIDTH) +
    'Rate'.padEnd(7) +
    historyHeader.padEnd(historyWidth) +
    'Streak'.padEnd(8) +
    'Status'
  );
  console.log('─'.repeat(80));

  let flakyCount = 0;
  let degradingCount = 0;

  for (const t of trends) {
    if (t.status === 'flaky') flakyCount++;
    if (t.status === 'degrading') degradingCount++;

    const fullName = `${t.tier}:${t.name}`;
    // Truncate long names, then pad, so truncated rows keep the same column width.
    const shownName = fullName.length > NAME_MAX_LEN
      ? fullName.slice(0, NAME_MAX_LEN - 3) + '...'
      : fullName;
    const displayName = shownName.padEnd(NAME_COL_WIDTH);
    const rate = `${Math.round(t.passRate * 100)}%`.padEnd(7);

    // Build sparkline of last N results
    const sparkline = t.results
      .slice(-maxResults)
      .map(r => r.passed ? '\u2713' : '\u2717')
      .join('');

    const streak = `${t.streak.count}${t.streak.type === 'pass' ? '\u2713' : '\u2717'}`.padEnd(8);

    // Color status (typed as string because the color helpers wrap the text)
    let statusStr: string = t.status;
    if (isTTY) {
      if (t.status === 'flaky' || t.status === 'degrading') statusStr = red(t.status);
      else if (t.status === 'stable-pass' || t.status === 'improving') statusStr = green(t.status);
      else statusStr = dim(t.status);
    }

    console.log(` ${displayName}${rate}${sparkline.padEnd(historyWidth)}${streak}${statusStr}`);
  }

  console.log('─'.repeat(80));
  const parts: string[] = [`${trends.length} tests tracked`];
  if (flakyCount > 0) parts.push(`${flakyCount} flaky`);
  if (degradingCount > 0) parts.push(`${degradingCount} degrading`);
  console.log(` ${parts.join(' | ')}`);
  console.log('');
}
|
||||||
|
|
||||||
function printUsage(): void {
|
function printUsage(): void {
|
||||||
console.log(`
|
console.log(`
|
||||||
gstack eval — eval management CLI
|
gstack eval — eval management CLI
|
||||||
|
|
@ -441,13 +627,15 @@ Commands:
|
||||||
summary [--limit N] Aggregate stats across all runs
|
summary [--limit N] Aggregate stats across all runs
|
||||||
push <file> Validate + save + sync an eval result
|
push <file> Validate + save + sync an eval result
|
||||||
cost <file> Show per-model cost breakdown
|
cost <file> Show per-model cost breakdown
|
||||||
|
trend [--limit N] [--tier X] [--test X] Per-test pass rate trends
|
||||||
cache read|write|stats|clear|verify Manage eval cache
|
cache read|write|stats|clear|verify Manage eval cache
|
||||||
watch Live E2E test dashboard
|
watch Live E2E test dashboard
|
||||||
`);
|
`);
|
||||||
}
|
}
|
||||||
|
|
||||||
// --- Main ---
|
// --- Main (only when run directly, not imported) ---
|
||||||
|
|
||||||
|
if (import.meta.main) {
|
||||||
const command = process.argv[2];
|
const command = process.argv[2];
|
||||||
const cmdArgs = process.argv.slice(3);
|
const cmdArgs = process.argv.slice(3);
|
||||||
|
|
||||||
|
|
@ -457,6 +645,7 @@ switch (command) {
|
||||||
case 'summary': cmdSummary(cmdArgs); break;
|
case 'summary': cmdSummary(cmdArgs); break;
|
||||||
case 'push': cmdPush(cmdArgs); break;
|
case 'push': cmdPush(cmdArgs); break;
|
||||||
case 'cost': cmdCost(cmdArgs); break;
|
case 'cost': cmdCost(cmdArgs); break;
|
||||||
|
case 'trend': cmdTrend(cmdArgs); break;
|
||||||
case 'cache': cmdCache(cmdArgs); break;
|
case 'cache': cmdCache(cmdArgs); break;
|
||||||
case 'watch': cmdWatch(); break;
|
case 'watch': cmdWatch(); break;
|
||||||
case '--help': case '-h': case 'help': case undefined:
|
case '--help': case '-h': case 'help': case undefined:
|
||||||
|
|
@ -467,3 +656,4 @@ switch (command) {
|
||||||
printUsage();
|
printUsage();
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -55,6 +55,9 @@ function getPricing(model: string): { input: number; output: number } {
|
||||||
export function computeCosts(costs: CostEntry[]): CostDashboard {
|
export function computeCosts(costs: CostEntry[]): CostDashboard {
|
||||||
const byModel = new Map<string, CostSummary>();
|
const byModel = new Map<string, CostSummary>();
|
||||||
|
|
||||||
|
// Track exact cost_usd sums per model (from API-provided costs)
|
||||||
|
const exactCosts = new Map<string, number>();
|
||||||
|
|
||||||
for (const entry of costs) {
|
for (const entry of costs) {
|
||||||
const existing = byModel.get(entry.model);
|
const existing = byModel.get(entry.model);
|
||||||
if (existing) {
|
if (existing) {
|
||||||
|
|
@ -70,9 +73,12 @@ export function computeCosts(costs: CostEntry[]): CostDashboard {
|
||||||
estimated_cost_usd: 0,
|
estimated_cost_usd: 0,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
if (entry.cost_usd !== undefined) {
|
||||||
|
exactCosts.set(entry.model, (exactCosts.get(entry.model) || 0) + entry.cost_usd);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Calculate costs
|
// Calculate costs — prefer exact cost_usd (accounts for cache discounts)
|
||||||
let total = 0;
|
let total = 0;
|
||||||
let atFast = 0;
|
let atFast = 0;
|
||||||
let atFull = 0;
|
let atFull = 0;
|
||||||
|
|
@ -80,13 +86,18 @@ export function computeCosts(costs: CostEntry[]): CostDashboard {
|
||||||
const fullPricing = MODEL_PRICING['claude-opus-4-6'] || FALLBACK_PRICING;
|
const fullPricing = MODEL_PRICING['claude-opus-4-6'] || FALLBACK_PRICING;
|
||||||
|
|
||||||
for (const summary of byModel.values()) {
|
for (const summary of byModel.values()) {
|
||||||
|
const exact = exactCosts.get(summary.model);
|
||||||
|
if (exact !== undefined) {
|
||||||
|
summary.estimated_cost_usd = exact;
|
||||||
|
} else {
|
||||||
const pricing = getPricing(summary.model);
|
const pricing = getPricing(summary.model);
|
||||||
summary.estimated_cost_usd =
|
summary.estimated_cost_usd =
|
||||||
(summary.input_tokens / 1_000_000) * pricing.input +
|
(summary.input_tokens / 1_000_000) * pricing.input +
|
||||||
(summary.output_tokens / 1_000_000) * pricing.output;
|
(summary.output_tokens / 1_000_000) * pricing.output;
|
||||||
|
}
|
||||||
total += summary.estimated_cost_usd;
|
total += summary.estimated_cost_usd;
|
||||||
|
|
||||||
// What-if at fast/full tiers
|
// What-if at fast/full tiers (always from token counts)
|
||||||
atFast +=
|
atFast +=
|
||||||
(summary.input_tokens / 1_000_000) * fastPricing.input +
|
(summary.input_tokens / 1_000_000) * fastPricing.input +
|
||||||
(summary.output_tokens / 1_000_000) * fastPricing.output;
|
(summary.output_tokens / 1_000_000) * fastPricing.output;
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,10 @@ export interface CostEntry {
|
||||||
calls: number;
|
calls: number;
|
||||||
input_tokens: number;
|
input_tokens: number;
|
||||||
output_tokens: number;
|
output_tokens: number;
|
||||||
|
cache_read_input_tokens?: number;
|
||||||
|
cache_creation_input_tokens?: number;
|
||||||
|
/** Exact cost from API when available (accounts for cache discounts). */
|
||||||
|
cost_usd?: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface FailureEntry {
|
export interface FailureEntry {
|
||||||
|
|
|
||||||
|
|
@ -21,6 +21,7 @@
|
||||||
"eval:list": "bun run lib/cli-eval.ts list",
|
"eval:list": "bun run lib/cli-eval.ts list",
|
||||||
"eval:compare": "bun run lib/cli-eval.ts compare",
|
"eval:compare": "bun run lib/cli-eval.ts compare",
|
||||||
"eval:summary": "bun run lib/cli-eval.ts summary",
|
"eval:summary": "bun run lib/cli-eval.ts summary",
|
||||||
|
"eval:trend": "bun run lib/cli-eval.ts trend",
|
||||||
"eval:watch": "bun run lib/cli-eval.ts watch"
|
"eval:watch": "bun run lib/cli-eval.ts watch"
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
|
|
|
||||||
|
|
@ -128,6 +128,74 @@ describe('EvalCollector', () => {
|
||||||
expect(data.tests).toHaveLength(0);
|
expect(data.tests).toHaveLength(0);
|
||||||
expect(data.tier).toBe('llm-judge');
|
expect(data.tier).toBe('llm-judge');
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test('finalize aggregates per-test costs into result-level costs[]', async () => {
|
||||||
|
const collector = new EvalCollector('e2e', tmpDir);
|
||||||
|
collector.addTest(makeEntry({
|
||||||
|
name: 'test-a',
|
||||||
|
costs: [{ model: 'claude-sonnet-4-6', calls: 1, input_tokens: 100, output_tokens: 50, cost_usd: 0.01 }],
|
||||||
|
}));
|
||||||
|
collector.addTest(makeEntry({
|
||||||
|
name: 'test-b',
|
||||||
|
costs: [{ model: 'claude-sonnet-4-6', calls: 1, input_tokens: 200, output_tokens: 100, cost_usd: 0.02 }],
|
||||||
|
}));
|
||||||
|
collector.addTest(makeEntry({
|
||||||
|
name: 'test-c',
|
||||||
|
costs: [{ model: 'claude-haiku-4-5', calls: 1, input_tokens: 50, output_tokens: 25, cost_usd: 0.005 }],
|
||||||
|
}));
|
||||||
|
|
||||||
|
const filepath = await collector.finalize();
|
||||||
|
const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8'));
|
||||||
|
|
||||||
|
expect(data.costs).toBeDefined();
|
||||||
|
expect(data.costs).toHaveLength(2); // two models
|
||||||
|
const sonnet = data.costs!.find(c => c.model === 'claude-sonnet-4-6');
|
||||||
|
const haiku = data.costs!.find(c => c.model === 'claude-haiku-4-5');
|
||||||
|
expect(sonnet).toBeDefined();
|
||||||
|
expect(sonnet!.calls).toBe(2);
|
||||||
|
expect(sonnet!.input_tokens).toBe(300);
|
||||||
|
expect(sonnet!.output_tokens).toBe(150);
|
||||||
|
expect(sonnet!.cost_usd).toBeCloseTo(0.03);
|
||||||
|
expect(haiku).toBeDefined();
|
||||||
|
expect(haiku!.calls).toBe(1);
|
||||||
|
expect(haiku!.cost_usd).toBeCloseTo(0.005);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('finalize omits costs when no tests have cost data', async () => {
|
||||||
|
const collector = new EvalCollector('e2e', tmpDir);
|
||||||
|
collector.addTest(makeEntry({ name: 'no-costs' }));
|
||||||
|
const filepath = await collector.finalize();
|
||||||
|
const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8'));
|
||||||
|
expect(data.costs).toBeUndefined();
|
||||||
|
});
|
||||||
|
|
||||||
|
test('finalize aggregates cache token fields', async () => {
|
||||||
|
const collector = new EvalCollector('e2e', tmpDir);
|
||||||
|
collector.addTest(makeEntry({
|
||||||
|
name: 'test-a',
|
||||||
|
costs: [{
|
||||||
|
model: 'claude-sonnet-4-6', calls: 1,
|
||||||
|
input_tokens: 10, output_tokens: 50,
|
||||||
|
cache_read_input_tokens: 5000, cache_creation_input_tokens: 1000,
|
||||||
|
cost_usd: 0.01,
|
||||||
|
}],
|
||||||
|
}));
|
||||||
|
collector.addTest(makeEntry({
|
||||||
|
name: 'test-b',
|
||||||
|
costs: [{
|
||||||
|
model: 'claude-sonnet-4-6', calls: 1,
|
||||||
|
input_tokens: 20, output_tokens: 100,
|
||||||
|
cache_read_input_tokens: 8000, cache_creation_input_tokens: 500,
|
||||||
|
cost_usd: 0.02,
|
||||||
|
}],
|
||||||
|
}));
|
||||||
|
|
||||||
|
const filepath = await collector.finalize();
|
||||||
|
const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8'));
|
||||||
|
const sonnet = data.costs!.find(c => c.model === 'claude-sonnet-4-6')!;
|
||||||
|
expect(sonnet.cache_read_input_tokens).toBe(13000);
|
||||||
|
expect(sonnet.cache_creation_input_tokens).toBe(1500);
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
// --- extractToolSummary tests ---
|
// --- extractToolSummary tests ---
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,7 @@ import * as path from 'path';
|
||||||
import * as os from 'os';
|
import * as os from 'os';
|
||||||
import { spawnSync } from 'child_process';
|
import { spawnSync } from 'child_process';
|
||||||
import { getGitInfo as getGitInfoShared, getVersion as getVersionShared } from '../../lib/util';
|
import { getGitInfo as getGitInfoShared, getVersion as getVersionShared } from '../../lib/util';
|
||||||
|
import type { CostEntry } from '../../lib/eval-format';
|
||||||
|
|
||||||
const SCHEMA_VERSION = 1;
|
const SCHEMA_VERSION = 1;
|
||||||
const DEFAULT_EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');
|
const DEFAULT_EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');
|
||||||
|
|
@ -50,6 +51,9 @@ export interface EvalTestEntry {
|
||||||
detected_bugs?: string[];
|
detected_bugs?: string[];
|
||||||
missed_bugs?: string[];
|
missed_bugs?: string[];
|
||||||
|
|
||||||
|
// Per-model cost breakdown
|
||||||
|
costs?: CostEntry[];
|
||||||
|
|
||||||
error?: string;
|
error?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -67,6 +71,7 @@ export interface EvalResult {
|
||||||
total_cost_usd: number;
|
total_cost_usd: number;
|
||||||
total_duration_ms: number;
|
total_duration_ms: number;
|
||||||
tests: EvalTestEntry[];
|
tests: EvalTestEntry[];
|
||||||
|
costs?: CostEntry[]; // aggregate per-model cost breakdown
|
||||||
_partial?: boolean; // true for incremental saves, absent in final
|
_partial?: boolean; // true for incremental saves, absent in final
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -414,6 +419,25 @@ export class EvalCollector {
|
||||||
const totalDuration = this.tests.reduce((s, t) => s + t.duration_ms, 0);
|
const totalDuration = this.tests.reduce((s, t) => s + t.duration_ms, 0);
|
||||||
const passed = this.tests.filter(t => t.passed).length;
|
const passed = this.tests.filter(t => t.passed).length;
|
||||||
|
|
||||||
|
// Aggregate per-model costs across all tests
|
||||||
|
const costMap = new Map<string, CostEntry>();
|
||||||
|
for (const t of this.tests) {
|
||||||
|
for (const c of t.costs || []) {
|
||||||
|
const existing = costMap.get(c.model);
|
||||||
|
if (existing) {
|
||||||
|
existing.calls += c.calls;
|
||||||
|
existing.input_tokens += c.input_tokens;
|
||||||
|
existing.output_tokens += c.output_tokens;
|
||||||
|
existing.cache_read_input_tokens = (existing.cache_read_input_tokens || 0) + (c.cache_read_input_tokens || 0);
|
||||||
|
existing.cache_creation_input_tokens = (existing.cache_creation_input_tokens || 0) + (c.cache_creation_input_tokens || 0);
|
||||||
|
if (c.cost_usd !== undefined) existing.cost_usd = (existing.cost_usd || 0) + c.cost_usd;
|
||||||
|
} else {
|
||||||
|
costMap.set(c.model, { ...c });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const costs = costMap.size > 0 ? [...costMap.values()] : undefined;
|
||||||
|
|
||||||
const result: EvalResult = {
|
const result: EvalResult = {
|
||||||
schema_version: SCHEMA_VERSION,
|
schema_version: SCHEMA_VERSION,
|
||||||
version,
|
version,
|
||||||
|
|
@ -428,6 +452,7 @@ export class EvalCollector {
|
||||||
total_cost_usd: Math.round(totalCost * 100) / 100,
|
total_cost_usd: Math.round(totalCost * 100) / 100,
|
||||||
total_duration_ms: totalDuration,
|
total_duration_ms: totalDuration,
|
||||||
tests: this.tests,
|
tests: this.tests,
|
||||||
|
costs,
|
||||||
};
|
};
|
||||||
|
|
||||||
// Write eval file
|
// Write eval file
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,117 @@
|
||||||
|
/**
|
||||||
|
* Tests for LLM judge cache + tier integration.
|
||||||
|
* Mocks Anthropic client to avoid API calls.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { describe, test, expect, beforeEach, afterEach, mock } from 'bun:test';
|
||||||
|
import * as fs from 'fs';
|
||||||
|
import * as path from 'path';
|
||||||
|
import * as os from 'os';
|
||||||
|
|
||||||
|
let tmpCacheDir: string;
|
||||||
|
const origEnv: Record<string, string | undefined> = {};
|
||||||
|
|
||||||
|
beforeEach(() => {
|
||||||
|
tmpCacheDir = fs.mkdtempSync(path.join(os.tmpdir(), 'llm-judge-test-'));
|
||||||
|
// Point cache to temp dir and clear tier env vars
|
||||||
|
origEnv.GSTACK_STATE_DIR = process.env.GSTACK_STATE_DIR;
|
||||||
|
origEnv.EVAL_JUDGE_TIER = process.env.EVAL_JUDGE_TIER;
|
||||||
|
origEnv.EVAL_TIER = process.env.EVAL_TIER;
|
||||||
|
origEnv.EVAL_CACHE = process.env.EVAL_CACHE;
|
||||||
|
process.env.GSTACK_STATE_DIR = tmpCacheDir;
|
||||||
|
delete process.env.EVAL_JUDGE_TIER;
|
||||||
|
delete process.env.EVAL_TIER;
|
||||||
|
delete process.env.EVAL_CACHE;
|
||||||
|
});
|
||||||
|
|
||||||
|
afterEach(() => {
|
||||||
|
// Restore env
|
||||||
|
for (const [key, val] of Object.entries(origEnv)) {
|
||||||
|
if (val === undefined) delete process.env[key];
|
||||||
|
else process.env[key] = val;
|
||||||
|
}
|
||||||
|
try { fs.rmSync(tmpCacheDir, { recursive: true, force: true }); } catch {}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Cache keys are computed by a plain function call, so these tests need no mocking.
describe('cache key computation', () => {
  /** Compute the cache key for a `model:prompt` payload with no source files. */
  const keyFor = async (payload: string) => {
    const cache = await import('../../lib/eval-cache');
    return cache.computeCacheKey([], payload);
  };

  test('computeCacheKey produces consistent hashes for same input', async () => {
    const first = await keyFor('claude-sonnet-4-6:test prompt');
    const second = await keyFor('claude-sonnet-4-6:test prompt');
    expect(first).toBe(second);
    expect(first).toHaveLength(16);
  });

  test('cache key differs when model changes', async () => {
    const sonnetKey = await keyFor('claude-sonnet-4-6:test prompt');
    const haikuKey = await keyFor('claude-haiku-4-5:test prompt');
    expect(sonnetKey).not.toBe(haikuKey);
  });

  test('cache key differs when prompt changes', async () => {
    const keyA = await keyFor('claude-sonnet-4-6:prompt A');
    const keyB = await keyFor('claude-sonnet-4-6:prompt B');
    expect(keyA).not.toBe(keyB);
  });
});
|
||||||
|
|
||||||
|
// Exercise cacheRead/cacheWrite against the 'llm-judge' suite namespace.
describe('cache read/write for llm-judge suite', () => {
  const SUITE = 'llm-judge';
  const loadCache = () => import('../../lib/eval-cache');

  test('cacheRead returns null on miss', async () => {
    const cache = await loadCache();
    const miss = cache.cacheRead(SUITE, 'nonexistent');
    expect(miss).toBeNull();
  });

  test('cacheWrite + cacheRead round-trip', async () => {
    const cache = await loadCache();
    const scores = { clarity: 5, completeness: 4, actionability: 5, reasoning: 'test' };
    cache.cacheWrite(SUITE, 'test-key', scores, { model: 'claude-sonnet-4-6' });
    expect(cache.cacheRead(SUITE, 'test-key')).toEqual(scores);
  });

  test('EVAL_CACHE=0 bypasses cache read', async () => {
    const cache = await loadCache();
    // Write while the cache is enabled, then disable it: the stored entry must not be returned.
    cache.cacheWrite(SUITE, 'bypass-key', { test: true });
    process.env.EVAL_CACHE = '0';
    const afterDisable = cache.cacheRead(SUITE, 'bypass-key');
    expect(afterDisable).toBeNull();
  });
});
|
||||||
|
|
||||||
|
// EVAL_JUDGE_TIER → tier name → model id, as resolved by lib/eval-tier.
describe('tier resolution for judge', () => {
  /** Resolve the current judge tier and the model it maps to. */
  const resolveNow = async () => {
    const tiers = await import('../../lib/eval-tier');
    const tier = tiers.resolveJudgeTier();
    return { tier, model: tiers.tierToModel(tiers.resolveJudgeTier()) };
  };

  test('defaults to standard (sonnet) when no env set', async () => {
    const { tier, model } = await resolveNow();
    expect(tier).toBe('standard');
    expect(model).toBe('claude-sonnet-4-6');
  });

  test('EVAL_JUDGE_TIER=haiku selects fast tier', async () => {
    process.env.EVAL_JUDGE_TIER = 'haiku';
    // NOTE(review): repeated import() of the same specifier yields the same module
    // instance, so this is not a "fresh" import — the test presumably relies on
    // resolveJudgeTier() reading EVAL_JUDGE_TIER at call time; confirm in lib/eval-tier.
    const { tier, model } = await resolveNow();
    expect(tier).toBe('fast');
    expect(model).toBe('claude-haiku-4-5');
  });

  test('EVAL_JUDGE_TIER=opus selects full tier', async () => {
    process.env.EVAL_JUDGE_TIER = 'opus';
    const { tier, model } = await resolveNow();
    expect(tier).toBe('full');
    expect(model).toBe('claude-opus-4-6');
  });
});
|
||||||
|
|
||||||
|
// JudgeMeta is a type-only export and leaves no runtime trace, so this suite
// checks the runtime function exports of the llm-judge module instead.
describe('JudgeMeta interface', () => {
  test('exported from llm-judge module', async () => {
    const { callJudge, judge, outcomeJudge } = await import('./llm-judge');
    // Each public helper must be present and callable.
    expect(typeof callJudge).toBe('function');
    expect(typeof judge).toBe('function');
    expect(typeof outcomeJudge).toBe('function');
  });
});
|
||||||
|
|
@ -1,13 +1,19 @@
|
||||||
/**
|
/**
|
||||||
* Shared LLM-as-judge helpers for eval and E2E tests.
|
* Shared LLM-as-judge helpers for eval and E2E tests.
|
||||||
*
|
*
|
||||||
* Provides callJudge (generic JSON-from-LLM), judge (doc quality scorer),
|
* Provides callJudge (generic JSON-from-LLM with cache + tier support),
|
||||||
* and outcomeJudge (planted-bug detection scorer).
|
* judge (doc quality scorer), and outcomeJudge (planted-bug detection scorer).
|
||||||
*
|
*
|
||||||
* Requires: ANTHROPIC_API_KEY env var
|
* Requires: ANTHROPIC_API_KEY env var (skipped on cache hit)
|
||||||
|
*
|
||||||
|
* Env vars:
|
||||||
|
* EVAL_JUDGE_TIER — model tier for judge calls (fast/standard/full, default: standard)
|
||||||
|
* EVAL_CACHE=0 — bypass cache, always re-run
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import Anthropic from '@anthropic-ai/sdk';
|
import Anthropic from '@anthropic-ai/sdk';
|
||||||
|
import { computeCacheKey, cacheRead, cacheWrite } from '../../lib/eval-cache';
|
||||||
|
import { resolveJudgeTier, tierToModel } from '../../lib/eval-tier';
|
||||||
|
|
||||||
export interface JudgeScore {
|
export interface JudgeScore {
|
||||||
clarity: number; // 1-5
|
clarity: number; // 1-5
|
||||||
|
|
@ -25,15 +31,35 @@ export interface OutcomeJudgeResult {
|
||||||
reasoning: string;
|
reasoning: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Bookkeeping returned by callJudge alongside the parsed judge result.
 */
export interface JudgeMeta {
  /** Model id the judge call was resolved to (from tierToModel(resolveJudgeTier())). */
  model: string;
  /** Input tokens reported in the API response's usage; 0 when missing or on a cache hit. */
  input_tokens: number;
  /** Output tokens reported in the API response's usage; 0 when missing or on a cache hit. */
  output_tokens: number;
  /** True when the result was served from the eval cache rather than a fresh API call. */
  cached: boolean;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Call claude-sonnet-4-6 with a prompt, extract JSON response.
|
* Call the judge model with a prompt, extract JSON response.
|
||||||
|
* Uses eval-cache for SHA-based caching and eval-tier for model selection.
|
||||||
* Retries once on 429 rate limit errors.
|
* Retries once on 429 rate limit errors.
|
||||||
*/
|
*/
|
||||||
export async function callJudge<T>(prompt: string): Promise<T> {
|
export async function callJudge<T>(prompt: string): Promise<{ result: T; meta: JudgeMeta }> {
|
||||||
|
const model = tierToModel(resolveJudgeTier());
|
||||||
|
|
||||||
|
// Check cache (keyed by model + prompt content)
|
||||||
|
const cacheKey = computeCacheKey([], `${model}:${prompt}`);
|
||||||
|
const cached = cacheRead('llm-judge', cacheKey);
|
||||||
|
if (cached !== null) {
|
||||||
|
return {
|
||||||
|
result: cached as T,
|
||||||
|
meta: { model, input_tokens: 0, output_tokens: 0, cached: true },
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
const client = new Anthropic();
|
const client = new Anthropic();
|
||||||
|
|
||||||
const makeRequest = () => client.messages.create({
|
const makeRequest = () => client.messages.create({
|
||||||
model: 'claude-sonnet-4-6',
|
model,
|
||||||
max_tokens: 1024,
|
max_tokens: 1024,
|
||||||
messages: [{ role: 'user', content: prompt }],
|
messages: [{ role: 'user', content: prompt }],
|
||||||
});
|
});
|
||||||
|
|
@ -53,13 +79,25 @@ export async function callJudge<T>(prompt: string): Promise<T> {
|
||||||
const text = response.content[0].type === 'text' ? response.content[0].text : '';
|
const text = response.content[0].type === 'text' ? response.content[0].text : '';
|
||||||
const jsonMatch = text.match(/\{[\s\S]*\}/);
|
const jsonMatch = text.match(/\{[\s\S]*\}/);
|
||||||
if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`);
|
if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`);
|
||||||
return JSON.parse(jsonMatch[0]) as T;
|
const result = JSON.parse(jsonMatch[0]) as T;
|
||||||
|
|
||||||
|
// Write to cache
|
||||||
|
cacheWrite('llm-judge', cacheKey, result, { model });
|
||||||
|
|
||||||
|
const meta: JudgeMeta = {
|
||||||
|
model,
|
||||||
|
input_tokens: (response.usage as any)?.input_tokens || 0,
|
||||||
|
output_tokens: (response.usage as any)?.output_tokens || 0,
|
||||||
|
cached: false,
|
||||||
|
};
|
||||||
|
|
||||||
|
return { result, meta };
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Score documentation quality on clarity/completeness/actionability (1-5).
|
* Score documentation quality on clarity/completeness/actionability (1-5).
|
||||||
*/
|
*/
|
||||||
export async function judge(section: string, content: string): Promise<JudgeScore> {
|
export async function judge(section: string, content: string): Promise<{ result: JudgeScore; meta: JudgeMeta }> {
|
||||||
return callJudge<JudgeScore>(`You are evaluating documentation quality for an AI coding agent's CLI tool reference.
|
return callJudge<JudgeScore>(`You are evaluating documentation quality for an AI coding agent's CLI tool reference.
|
||||||
|
|
||||||
The agent reads this documentation to learn how to use a headless browser CLI. It needs to:
|
The agent reads this documentation to learn how to use a headless browser CLI. It needs to:
|
||||||
|
|
@ -92,12 +130,14 @@ ${content}`);
|
||||||
/**
|
/**
|
||||||
* Evaluate a QA report against planted-bug ground truth.
|
* Evaluate a QA report against planted-bug ground truth.
|
||||||
* Returns detection metrics for the planted bugs.
|
* Returns detection metrics for the planted bugs.
|
||||||
|
* Note: outcomeJudge returns just the result (not meta) for backward compat
|
||||||
|
* with E2E test callers. Cache still works internally.
|
||||||
*/
|
*/
|
||||||
export async function outcomeJudge(
|
export async function outcomeJudge(
|
||||||
groundTruth: any,
|
groundTruth: any,
|
||||||
report: string,
|
report: string,
|
||||||
): Promise<OutcomeJudgeResult> {
|
): Promise<OutcomeJudgeResult> {
|
||||||
return callJudge<OutcomeJudgeResult>(`You are evaluating a QA testing report against known ground truth bugs.
|
const { result } = await callJudge<OutcomeJudgeResult>(`You are evaluating a QA testing report against known ground truth bugs.
|
||||||
|
|
||||||
GROUND TRUTH (${groundTruth.total_bugs} planted bugs):
|
GROUND TRUTH (${groundTruth.total_bugs} planted bugs):
|
||||||
${JSON.stringify(groundTruth.bugs, null, 2)}
|
${JSON.stringify(groundTruth.bugs, null, 2)}
|
||||||
|
|
@ -127,4 +167,5 @@ Rules:
|
||||||
- detection_rate = length of detected array
|
- detection_rate = length of detected array
|
||||||
- evidence_quality (1-5): Do detected bugs have screenshots, repro steps, or specific element references?
|
- evidence_quality (1-5): Do detected bugs have screenshots, repro steps, or specific element references?
|
||||||
5 = excellent evidence for every bug, 1 = no evidence at all`);
|
5 = excellent evidence for every bug, 1 = no evidence at all`);
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -93,4 +93,36 @@ describe('parseNDJSON', () => {
|
||||||
expect(parsed.turnCount).toBe(2);
|
expect(parsed.turnCount).toBe(2);
|
||||||
expect(parsed.toolCalls).toHaveLength(0);
|
expect(parsed.toolCalls).toHaveLength(0);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test('resultLine preserves modelUsage for cost extraction', () => {
  // Per-model usage block as it appears (camelCase) on the final "result" event.
  const sonnetUsage = {
    inputTokens: 8, outputTokens: 802,
    cacheReadInputTokens: 88133, cacheCreationInputTokens: 9223,
    costUSD: 0.07308,
  };
  const resultEvent = {
    type: 'result', subtype: 'success', total_cost_usd: 0.07,
    num_turns: 1, result: 'Done.',
    usage: { input_tokens: 8, output_tokens: 802 },
    modelUsage: { 'claude-sonnet-4-6': sonnetUsage },
  };

  const { resultLine } = parseNDJSON([
    '{"type":"assistant","message":{"model":"claude-sonnet-4-6","content":[{"type":"text","text":"ok"}]}}',
    JSON.stringify(resultEvent),
  ]);

  // The parsed result line must carry modelUsage through untouched.
  expect(resultLine).not.toBeNull();
  expect(resultLine.modelUsage).toBeDefined();
  const perModel = resultLine.modelUsage['claude-sonnet-4-6'];
  expect(perModel.inputTokens).toBe(8);
  expect(perModel.outputTokens).toBe(802);
  expect(perModel.cacheReadInputTokens).toBe(88133);
  expect(perModel.costUSD).toBeCloseTo(0.07308);
});
|
||||||
|
|
||||||
|
test('resultLine without modelUsage has undefined modelUsage', () => {
  // The shared fixture's result line carries no modelUsage field.
  const { resultLine } = parseNDJSON(FIXTURE_LINES);
  expect(resultLine?.modelUsage).toBeUndefined();
});
|
||||||
});
|
});
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,8 @@ import * as fs from 'fs';
|
||||||
import * as path from 'path';
|
import * as path from 'path';
|
||||||
import * as os from 'os';
|
import * as os from 'os';
|
||||||
import { atomicWriteSync, sanitizeForFilename, GSTACK_DEV_DIR } from '../../lib/util';
|
import { atomicWriteSync, sanitizeForFilename, GSTACK_DEV_DIR } from '../../lib/util';
|
||||||
|
import type { CostEntry } from '../../lib/eval-format';
|
||||||
|
import { resolveTier, tierToModel } from '../../lib/eval-tier';
|
||||||
|
|
||||||
const HEARTBEAT_PATH = path.join(GSTACK_DEV_DIR, 'e2e-live.json');
|
const HEARTBEAT_PATH = path.join(GSTACK_DEV_DIR, 'e2e-live.json');
|
||||||
|
|
||||||
|
|
@ -34,6 +36,7 @@ export interface SkillTestResult {
|
||||||
output: string;
|
output: string;
|
||||||
costEstimate: CostEstimate;
|
costEstimate: CostEstimate;
|
||||||
transcript: any[];
|
transcript: any[];
|
||||||
|
costs: CostEntry[];
|
||||||
}
|
}
|
||||||
|
|
||||||
const BROWSE_ERROR_PATTERNS = [
|
const BROWSE_ERROR_PATTERNS = [
|
||||||
|
|
@ -135,8 +138,11 @@ export async function runSkillTest(options: {
|
||||||
|
|
||||||
// Spawn claude -p with streaming NDJSON output. Prompt piped via stdin to
|
// Spawn claude -p with streaming NDJSON output. Prompt piped via stdin to
|
||||||
// avoid shell escaping issues. --verbose is required for stream-json mode.
|
// avoid shell escaping issues. --verbose is required for stream-json mode.
|
||||||
|
// Model pinned via EVAL_TIER env var (default: sonnet).
|
||||||
|
const evalModel = tierToModel(resolveTier());
|
||||||
const args = [
|
const args = [
|
||||||
'-p',
|
'-p',
|
||||||
|
'--model', evalModel,
|
||||||
'--output-format', 'stream-json',
|
'--output-format', 'stream-json',
|
||||||
'--verbose',
|
'--verbose',
|
||||||
'--dangerously-skip-permissions',
|
'--dangerously-skip-permissions',
|
||||||
|
|
@ -323,5 +329,21 @@ export async function runSkillTest(options: {
|
||||||
turnsUsed,
|
turnsUsed,
|
||||||
};
|
};
|
||||||
|
|
||||||
return { toolCalls, browseErrors, exitReason, duration, output: resultLine?.result || '', costEstimate, transcript };
|
// Extract per-model costs from resultLine.modelUsage (camelCase → snake_case)
|
||||||
|
const costs: CostEntry[] = [];
|
||||||
|
if (resultLine?.modelUsage) {
|
||||||
|
for (const [model, usage] of Object.entries(resultLine.modelUsage as Record<string, any>)) {
|
||||||
|
costs.push({
|
||||||
|
model,
|
||||||
|
calls: 1,
|
||||||
|
input_tokens: usage.inputTokens || 0,
|
||||||
|
output_tokens: usage.outputTokens || 0,
|
||||||
|
cache_read_input_tokens: usage.cacheReadInputTokens || 0,
|
||||||
|
cache_creation_input_tokens: usage.cacheCreationInputTokens || 0,
|
||||||
|
cost_usd: usage.costUSD,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return { toolCalls, browseErrors, exitReason, duration, output: resultLine?.result || '', costEstimate, transcript, costs };
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,193 @@
|
||||||
|
/**
|
||||||
|
* Tests for computeTrends() — per-test pass rate trend tracking.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { describe, test, expect } from 'bun:test';
|
||||||
|
import { computeTrends } from '../lib/cli-eval';
|
||||||
|
import type { EvalResult } from './helpers/eval-store';
|
||||||
|
|
||||||
|
/**
 * Build a minimal EvalResult fixture.
 *
 * Only the timestamp, tier (default 'e2e') and per-test pass/fail flags vary;
 * every other field is a fixed placeholder. The pass/fail totals are derived
 * from the supplied tests, and each test entry inherits the run's tier.
 */
function makeRun(opts: {
  timestamp: string;
  tier?: 'e2e' | 'llm-judge';
  tests: Array<{ name: string; passed: boolean }>;
}): EvalResult {
  const { timestamp, tests } = opts;
  const tier: 'e2e' | 'llm-judge' = opts.tier || 'e2e';
  const passCount = tests.reduce((count, t) => (t.passed ? count + 1 : count), 0);

  return {
    schema_version: 1,
    version: '0.3.3',
    branch: 'main',
    git_sha: 'abc',
    timestamp,
    hostname: 'test',
    tier,
    total_tests: tests.length,
    passed: passCount,
    failed: tests.length - passCount,
    total_cost_usd: 0,
    total_duration_ms: 0,
    tests: tests.map(({ name, passed }) => ({
      name,
      suite: 'test',
      tier,
      passed,
      duration_ms: 0,
      cost_usd: 0,
    })),
  };
}
|
||||||
|
|
||||||
|
describe('computeTrends', () => {
  type Outcome = { name: string; passed: boolean };

  /** ISO timestamp for the fixture's day N (day 0 = 2026-03-10). */
  const stamp = (day: number) => `2026-03-${String(10 + day).padStart(2, '0')}T00:00:00Z`;

  /**
   * Turn a per-day list of test outcomes (oldest day first) into runs ordered
   * newest-first, which is the order the original fixtures describe as the
   * loadEvalResults order.
   */
  const newestFirst = (perDay: Outcome[][]): EvalResult[] =>
    perDay.map((tests, day) => makeRun({ timestamp: stamp(day), tests })).reverse();

  /** History of a single test, one run per day, given oldest-first outcomes. */
  const historyOf = (name: string, outcomes: boolean[]): EvalResult[] =>
    newestFirst(outcomes.map((passed) => [{ name, passed }]));

  /** Oldest-first outcomes for `count` days, decided by a per-day predicate. */
  const days = (count: number, passedOn: (day: number) => boolean): boolean[] =>
    Array.from({ length: count }, (_, day) => passedOn(day));

  test('classifies stable-pass test correctly', () => {
    // Ten consecutive passing runs.
    const trends = computeTrends(historyOf('always-pass', days(10, () => true)));
    expect(trends).toHaveLength(1);
    const [trend] = trends;
    expect(trend.status).toBe('stable-pass');
    expect(trend.passRate).toBe(1);
    expect(trend.streak).toEqual({ type: 'pass', count: 10 });
    expect(trend.flipCount).toBe(0);
  });

  test('classifies stable-fail test correctly', () => {
    const [trend] = computeTrends(historyOf('always-fail', days(10, () => false)));
    expect(trend.status).toBe('stable-fail');
    expect(trend.passRate).toBe(0);
    expect(trend.streak).toEqual({ type: 'fail', count: 10 });
  });

  test('classifies flaky test correctly — alternating pass/fail', () => {
    const [trend] = computeTrends(historyOf('flaky', days(10, (day) => day % 2 === 0)));
    expect(trend.status).toBe('flaky');
    expect(trend.flipCount).toBe(9);
    expect(trend.passRate).toBe(0.5);
  });

  test('classifies improving test correctly', () => {
    // Oldest five runs fail, newest five pass.
    const [trend] = computeTrends(historyOf('improving', days(10, (day) => day >= 5)));
    expect(trend.status).toBe('improving');
    expect(trend.streak).toEqual({ type: 'pass', count: 5 });
  });

  test('classifies degrading test correctly', () => {
    // Oldest seven runs pass, newest three fail.
    const [trend] = computeTrends(historyOf('degrading', days(10, (day) => day < 7)));
    expect(trend.status).toBe('degrading');
    expect(trend.streak).toEqual({ type: 'fail', count: 3 });
  });

  test('computes streak correctly with mixed ending', () => {
    // Oldest → newest: pass, pass, fail, pass, pass, pass.
    const [trend] = computeTrends(historyOf('test', [true, true, false, true, true, true]));
    expect(trend.streak).toEqual({ type: 'pass', count: 3 });
  });

  test('computes flipCount correctly', () => {
    // Oldest → newest: pass, fail, pass, pass, fail, pass → 4 flips.
    const [trend] = computeTrends(historyOf('test', [true, false, true, true, false, true]));
    expect(trend.flipCount).toBe(4);
  });

  test('handles single run', () => {
    const onlyRun = makeRun({
      timestamp: '2026-03-15T00:00:00Z',
      tests: [{ name: 'single', passed: true }],
    });

    const trends = computeTrends([onlyRun]);
    expect(trends).toHaveLength(1);
    const [trend] = trends;
    expect(trend.passRate).toBe(1);
    expect(trend.streak).toEqual({ type: 'pass', count: 1 });
    expect(trend.flipCount).toBe(0);
    expect(trend.status).toBe('stable-pass');
  });

  test('handles single failing run', () => {
    const onlyRun = makeRun({
      timestamp: '2026-03-15T00:00:00Z',
      tests: [{ name: 'single-fail', passed: false }],
    });

    const [trend] = computeTrends([onlyRun]);
    expect(trend.status).toBe('stable-fail');
  });

  test('filters by tier', () => {
    // One run per tier, same timestamp; each filter must see only its own tier's test.
    const mixedTiers = [
      makeRun({ timestamp: '2026-03-15T00:00:00Z', tier: 'e2e', tests: [{ name: 'e2e-test', passed: true }] }),
      makeRun({ timestamp: '2026-03-15T00:00:00Z', tier: 'llm-judge', tests: [{ name: 'judge-test', passed: true }] }),
    ];

    const e2eTrends = computeTrends(mixedTiers, 'e2e');
    expect(e2eTrends).toHaveLength(1);
    expect(e2eTrends[0].name).toBe('e2e-test');

    const judgeTrends = computeTrends(mixedTiers, 'llm-judge');
    expect(judgeTrends).toHaveLength(1);
    expect(judgeTrends[0].name).toBe('judge-test');
  });

  test('filters by test name', () => {
    const runs = newestFirst(Array.from({ length: 3 }, () => [
      { name: 'test-a', passed: true },
      { name: 'test-b', passed: false },
    ]));

    const onlyA = computeTrends(runs, undefined, 'test-a');
    expect(onlyA).toHaveLength(1);
    expect(onlyA[0].name).toBe('test-a');
    expect(onlyA[0].passRate).toBe(1);
  });

  test('sorts flaky tests first', () => {
    // test-a alternates pass/fail (flaky); test-b always passes (stable-pass).
    const runs = newestFirst(Array.from({ length: 6 }, (_, day) => [
      { name: 'test-a', passed: day % 2 === 0 },
      { name: 'test-b', passed: true },
    ]));

    const [first, second] = computeTrends(runs);
    expect(first.name).toBe('test-a');
    expect(first.status).toBe('flaky');
    expect(second.name).toBe('test-b');
    expect(second.status).toBe('stable-pass');
  });
});
|
||||||
|
|
@ -41,6 +41,7 @@ function recordE2E(name: string, suite: string, result: SkillTestResult, extra?:
|
||||||
exit_reason: result.exitReason,
|
exit_reason: result.exitReason,
|
||||||
timeout_at_turn: result.exitReason === 'timeout' ? result.costEstimate.turnsUsed : undefined,
|
timeout_at_turn: result.exitReason === 'timeout' ? result.costEstimate.turnsUsed : undefined,
|
||||||
last_tool_call: lastTool,
|
last_tool_call: lastTool,
|
||||||
|
costs: result.costs,
|
||||||
...extra,
|
...extra,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -7,16 +7,18 @@
|
||||||
* Requires: ANTHROPIC_API_KEY env var (or EVALS=1 with key already set)
|
* Requires: ANTHROPIC_API_KEY env var (or EVALS=1 with key already set)
|
||||||
* Run: EVALS=1 bun run test:eval
|
* Run: EVALS=1 bun run test:eval
|
||||||
*
|
*
|
||||||
* Cost: ~$0.05-0.15 per run (sonnet)
|
* Cost: ~$0.05-0.15 per run (sonnet), $0 on cache hit
|
||||||
|
* Cache: SHA-based via eval-cache. Set EVAL_CACHE=0 to force re-run.
|
||||||
|
* Model: Set EVAL_JUDGE_TIER=haiku|sonnet|opus to override (default: sonnet).
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import { describe, test, expect, afterAll } from 'bun:test';
|
import { describe, test, expect, afterAll } from 'bun:test';
|
||||||
import Anthropic from '@anthropic-ai/sdk';
|
|
||||||
import * as fs from 'fs';
|
import * as fs from 'fs';
|
||||||
import * as path from 'path';
|
import * as path from 'path';
|
||||||
import { callJudge, judge } from './helpers/llm-judge';
|
import { callJudge, judge } from './helpers/llm-judge';
|
||||||
import type { JudgeScore } from './helpers/llm-judge';
|
import type { JudgeMeta } from './helpers/llm-judge';
|
||||||
import { EvalCollector } from './helpers/eval-store';
|
import { EvalCollector } from './helpers/eval-store';
|
||||||
|
import { MODEL_PRICING } from '../lib/eval-cost';
|
||||||
|
|
||||||
const ROOT = path.resolve(import.meta.dir, '..');
|
const ROOT = path.resolve(import.meta.dir, '..');
|
||||||
// Run when EVALS=1 is set (requires ANTHROPIC_API_KEY in env)
|
// Run when EVALS=1 is set (requires ANTHROPIC_API_KEY in env)
|
||||||
|
|
@ -26,6 +28,22 @@ const describeEval = evalsEnabled ? describe : describe.skip;
|
||||||
// Eval result collector
|
// Eval result collector
|
||||||
const evalCollector = evalsEnabled ? new EvalCollector('llm-judge') : null;
|
const evalCollector = evalsEnabled ? new EvalCollector('llm-judge') : null;
|
||||||
|
|
||||||
|
/**
 * Dollar cost of a single judge call, derived from its token counts.
 *
 * A cache hit costs nothing. Otherwise the per-million-token rates are looked
 * up in MODEL_PRICING by model id; when the model is not listed, rates of
 * 3.0 (input) and 15.0 (output) per million tokens are used.
 */
function judgeCost(meta: JudgeMeta): number {
  if (meta.cached) {
    return 0;
  }
  const rates = MODEL_PRICING[meta.model] || { input: 3.0, output: 15.0 };
  const inputUsd = (meta.input_tokens / 1_000_000) * rates.input;
  const outputUsd = (meta.output_tokens / 1_000_000) * rates.output;
  return inputUsd + outputUsd;
}
|
||||||
|
|
||||||
|
/**
 * Convert judge metadata into the per-model cost entries recorded with a test.
 *
 * Returns a single entry (one call, with the reported token counts) for a
 * fresh judge call, and an empty list when the result came from the cache.
 */
function judgeCosts(meta: JudgeMeta) {
  if (meta.cached) {
    return [];
  }
  const { model, input_tokens, output_tokens } = meta;
  return [{ model, calls: 1, input_tokens, output_tokens }];
}
|
||||||
|
|
||||||
describeEval('LLM-as-judge quality evals', () => {
|
describeEval('LLM-as-judge quality evals', () => {
|
||||||
test('command reference table scores >= 4 on all dimensions', async () => {
|
test('command reference table scores >= 4 on all dimensions', async () => {
|
||||||
const t0 = Date.now();
|
const t0 = Date.now();
|
||||||
|
|
@ -34,8 +52,8 @@ describeEval('LLM-as-judge quality evals', () => {
|
||||||
const end = content.indexOf('## Tips');
|
const end = content.indexOf('## Tips');
|
||||||
const section = content.slice(start, end);
|
const section = content.slice(start, end);
|
||||||
|
|
||||||
const scores = await judge('command reference table', section);
|
const { result: scores, meta } = await judge('command reference table', section);
|
||||||
console.log('Command reference scores:', JSON.stringify(scores, null, 2));
|
console.log('Command reference scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : '');
|
||||||
|
|
||||||
evalCollector?.addTest({
|
evalCollector?.addTest({
|
||||||
name: 'command reference table',
|
name: 'command reference table',
|
||||||
|
|
@ -43,9 +61,10 @@ describeEval('LLM-as-judge quality evals', () => {
|
||||||
tier: 'llm-judge',
|
tier: 'llm-judge',
|
||||||
passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
|
passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
|
||||||
duration_ms: Date.now() - t0,
|
duration_ms: Date.now() - t0,
|
||||||
cost_usd: 0.02,
|
cost_usd: judgeCost(meta),
|
||||||
judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
|
judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
|
||||||
judge_reasoning: scores.reasoning,
|
judge_reasoning: scores.reasoning,
|
||||||
|
costs: judgeCosts(meta),
|
||||||
});
|
});
|
||||||
|
|
||||||
expect(scores.clarity).toBeGreaterThanOrEqual(4);
|
expect(scores.clarity).toBeGreaterThanOrEqual(4);
|
||||||
|
|
@ -60,8 +79,8 @@ describeEval('LLM-as-judge quality evals', () => {
|
||||||
const end = content.indexOf('## Command Reference');
|
const end = content.indexOf('## Command Reference');
|
||||||
const section = content.slice(start, end);
|
const section = content.slice(start, end);
|
||||||
|
|
||||||
const scores = await judge('snapshot flags reference', section);
|
const { result: scores, meta } = await judge('snapshot flags reference', section);
|
||||||
console.log('Snapshot flags scores:', JSON.stringify(scores, null, 2));
|
console.log('Snapshot flags scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : '');
|
||||||
|
|
||||||
evalCollector?.addTest({
|
evalCollector?.addTest({
|
||||||
name: 'snapshot flags reference',
|
name: 'snapshot flags reference',
|
||||||
|
|
@ -69,9 +88,10 @@ describeEval('LLM-as-judge quality evals', () => {
|
||||||
tier: 'llm-judge',
|
tier: 'llm-judge',
|
||||||
passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
|
passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
|
||||||
duration_ms: Date.now() - t0,
|
duration_ms: Date.now() - t0,
|
||||||
cost_usd: 0.02,
|
cost_usd: judgeCost(meta),
|
||||||
judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
|
judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
|
||||||
judge_reasoning: scores.reasoning,
|
judge_reasoning: scores.reasoning,
|
||||||
|
costs: judgeCosts(meta),
|
||||||
});
|
});
|
||||||
|
|
||||||
expect(scores.clarity).toBeGreaterThanOrEqual(4);
|
expect(scores.clarity).toBeGreaterThanOrEqual(4);
|
||||||
|
|
@ -85,8 +105,8 @@ describeEval('LLM-as-judge quality evals', () => {
|
||||||
const start = content.indexOf('## Snapshot Flags');
|
const start = content.indexOf('## Snapshot Flags');
|
||||||
const section = content.slice(start);
|
const section = content.slice(start);
|
||||||
|
|
||||||
const scores = await judge('browse skill reference (flags + commands)', section);
|
const { result: scores, meta } = await judge('browse skill reference (flags + commands)', section);
|
||||||
console.log('Browse SKILL.md scores:', JSON.stringify(scores, null, 2));
|
console.log('Browse SKILL.md scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : '');
|
||||||
|
|
||||||
evalCollector?.addTest({
|
evalCollector?.addTest({
|
||||||
name: 'browse/SKILL.md reference',
|
name: 'browse/SKILL.md reference',
|
||||||
|
|
@ -94,9 +114,10 @@ describeEval('LLM-as-judge quality evals', () => {
|
||||||
tier: 'llm-judge',
|
tier: 'llm-judge',
|
||||||
passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
|
passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
|
||||||
duration_ms: Date.now() - t0,
|
duration_ms: Date.now() - t0,
|
||||||
cost_usd: 0.02,
|
cost_usd: judgeCost(meta),
|
||||||
judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
|
judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
|
||||||
judge_reasoning: scores.reasoning,
|
judge_reasoning: scores.reasoning,
|
||||||
|
costs: judgeCosts(meta),
|
||||||
});
|
});
|
||||||
|
|
||||||
expect(scores.clarity).toBeGreaterThanOrEqual(4);
|
expect(scores.clarity).toBeGreaterThanOrEqual(4);
|
||||||
|
|
@ -111,8 +132,8 @@ describeEval('LLM-as-judge quality evals', () => {
|
||||||
const setupEnd = content.indexOf('## IMPORTANT');
|
const setupEnd = content.indexOf('## IMPORTANT');
|
||||||
const section = content.slice(setupStart, setupEnd);
|
const section = content.slice(setupStart, setupEnd);
|
||||||
|
|
||||||
const scores = await judge('setup/binary discovery instructions', section);
|
const { result: scores, meta } = await judge('setup/binary discovery instructions', section);
|
||||||
console.log('Setup block scores:', JSON.stringify(scores, null, 2));
|
console.log('Setup block scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : '');
|
||||||
|
|
||||||
evalCollector?.addTest({
|
evalCollector?.addTest({
|
||||||
name: 'setup block',
|
name: 'setup block',
|
||||||
|
|
@ -120,9 +141,10 @@ describeEval('LLM-as-judge quality evals', () => {
|
||||||
tier: 'llm-judge',
|
tier: 'llm-judge',
|
||||||
passed: scores.actionability >= 3 && scores.clarity >= 3,
|
passed: scores.actionability >= 3 && scores.clarity >= 3,
|
||||||
duration_ms: Date.now() - t0,
|
duration_ms: Date.now() - t0,
|
||||||
cost_usd: 0.02,
|
cost_usd: judgeCost(meta),
|
||||||
judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
|
judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
|
||||||
judge_reasoning: scores.reasoning,
|
judge_reasoning: scores.reasoning,
|
||||||
|
costs: judgeCosts(meta),
|
||||||
});
|
});
|
||||||
|
|
||||||
// Setup block is intentionally minimal (binary discovery only).
|
// Setup block is intentionally minimal (binary discovery only).
|
||||||
|
|
@ -171,13 +193,7 @@ describeEval('LLM-as-judge quality evals', () => {
|
||||||
| \`is <prop> <sel>\` | State check (visible/hidden/enabled/disabled/checked/editable/focused) |
|
| \`is <prop> <sel>\` | State check (visible/hidden/enabled/disabled/checked/editable/focused) |
|
||||||
| \`console [--clear\\|--errors]\` | Console messages (--errors filters to error/warning) |`;
|
| \`console [--clear\\|--errors]\` | Console messages (--errors filters to error/warning) |`;
|
||||||
|
|
||||||
const client = new Anthropic();
|
const { result, meta } = await callJudge<{ winner: string; reasoning: string; a_score: number; b_score: number }>(`You are comparing two versions of CLI documentation for an AI coding agent.
|
||||||
const response = await client.messages.create({
|
|
||||||
model: 'claude-sonnet-4-6',
|
|
||||||
max_tokens: 1024,
|
|
||||||
messages: [{
|
|
||||||
role: 'user',
|
|
||||||
content: `You are comparing two versions of CLI documentation for an AI coding agent.
|
|
||||||
|
|
||||||
VERSION A (baseline — hand-maintained):
|
VERSION A (baseline — hand-maintained):
|
||||||
${baseline}
|
${baseline}
|
||||||
|
|
@ -193,15 +209,9 @@ Which version is better for an AI agent trying to use these commands? Consider:
|
||||||
Respond with ONLY valid JSON:
|
Respond with ONLY valid JSON:
|
||||||
{"winner": "A" or "B" or "tie", "reasoning": "brief explanation", "a_score": N, "b_score": N}
|
{"winner": "A" or "B" or "tie", "reasoning": "brief explanation", "a_score": N, "b_score": N}
|
||||||
|
|
||||||
Scores are 1-5 overall quality.`,
|
Scores are 1-5 overall quality.`);
|
||||||
}],
|
|
||||||
});
|
|
||||||
|
|
||||||
const text = response.content[0].type === 'text' ? response.content[0].text : '';
|
console.log('Regression comparison:', JSON.stringify(result, null, 2), meta.cached ? '(cached)' : '');
|
||||||
const jsonMatch = text.match(/\{[\s\S]*\}/);
|
|
||||||
if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`);
|
|
||||||
const result = JSON.parse(jsonMatch[0]);
|
|
||||||
console.log('Regression comparison:', JSON.stringify(result, null, 2));
|
|
||||||
|
|
||||||
evalCollector?.addTest({
|
evalCollector?.addTest({
|
||||||
name: 'regression vs baseline',
|
name: 'regression vs baseline',
|
||||||
|
|
@ -209,9 +219,10 @@ Scores are 1-5 overall quality.`,
|
||||||
tier: 'llm-judge',
|
tier: 'llm-judge',
|
||||||
passed: result.b_score >= result.a_score,
|
passed: result.b_score >= result.a_score,
|
||||||
duration_ms: Date.now() - t0,
|
duration_ms: Date.now() - t0,
|
||||||
cost_usd: 0.02,
|
cost_usd: judgeCost(meta),
|
||||||
judge_scores: { a_score: result.a_score, b_score: result.b_score },
|
judge_scores: { a_score: result.a_score, b_score: result.b_score },
|
||||||
judge_reasoning: result.reasoning,
|
judge_reasoning: result.reasoning,
|
||||||
|
costs: judgeCosts(meta),
|
||||||
});
|
});
|
||||||
|
|
||||||
expect(result.b_score).toBeGreaterThanOrEqual(result.a_score);
|
expect(result.b_score).toBeGreaterThanOrEqual(result.a_score);
|
||||||
|
|
@ -229,7 +240,7 @@ describeEval('QA skill quality evals', () => {
|
||||||
const end = qaContent.indexOf('## Health Score Rubric');
|
const end = qaContent.indexOf('## Health Score Rubric');
|
||||||
const section = qaContent.slice(start, end);
|
const section = qaContent.slice(start, end);
|
||||||
|
|
||||||
const scores = await callJudge<JudgeScore>(`You are evaluating the quality of a QA testing workflow document for an AI coding agent.
|
const { result: scores, meta } = await callJudge<{ clarity: number; completeness: number; actionability: number; reasoning: string }>(`You are evaluating the quality of a QA testing workflow document for an AI coding agent.
|
||||||
|
|
||||||
The agent reads this document to learn how to systematically QA test a web application. The workflow references
|
The agent reads this document to learn how to systematically QA test a web application. The workflow references
|
||||||
a headless browser CLI ($B commands) that is documented separately — do NOT penalize for missing CLI definitions.
|
a headless browser CLI ($B commands) that is documented separately — do NOT penalize for missing CLI definitions.
|
||||||
|
|
@ -246,7 +257,7 @@ Respond with ONLY valid JSON:
|
||||||
Here is the QA workflow to evaluate:
|
Here is the QA workflow to evaluate:
|
||||||
|
|
||||||
${section}`);
|
${section}`);
|
||||||
console.log('QA workflow scores:', JSON.stringify(scores, null, 2));
|
console.log('QA workflow scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : '');
|
||||||
|
|
||||||
evalCollector?.addTest({
|
evalCollector?.addTest({
|
||||||
name: 'qa/SKILL.md workflow',
|
name: 'qa/SKILL.md workflow',
|
||||||
|
|
@ -254,9 +265,10 @@ ${section}`);
|
||||||
tier: 'llm-judge',
|
tier: 'llm-judge',
|
||||||
passed: scores.clarity >= 4 && scores.completeness >= 3 && scores.actionability >= 4,
|
passed: scores.clarity >= 4 && scores.completeness >= 3 && scores.actionability >= 4,
|
||||||
duration_ms: Date.now() - t0,
|
duration_ms: Date.now() - t0,
|
||||||
cost_usd: 0.02,
|
cost_usd: judgeCost(meta),
|
||||||
judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
|
judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
|
||||||
judge_reasoning: scores.reasoning,
|
judge_reasoning: scores.reasoning,
|
||||||
|
costs: judgeCosts(meta),
|
||||||
});
|
});
|
||||||
|
|
||||||
expect(scores.clarity).toBeGreaterThanOrEqual(4);
|
expect(scores.clarity).toBeGreaterThanOrEqual(4);
|
||||||
|
|
@ -271,7 +283,7 @@ ${section}`);
|
||||||
const start = qaContent.indexOf('## Health Score Rubric');
|
const start = qaContent.indexOf('## Health Score Rubric');
|
||||||
const section = qaContent.slice(start);
|
const section = qaContent.slice(start);
|
||||||
|
|
||||||
const scores = await callJudge<JudgeScore>(`You are evaluating a health score rubric that an AI agent must follow to compute a numeric QA score.
|
const { result: scores, meta } = await callJudge<{ clarity: number; completeness: number; actionability: number; reasoning: string }>(`You are evaluating a health score rubric that an AI agent must follow to compute a numeric QA score.
|
||||||
|
|
||||||
The agent uses this rubric after QA testing a website. It needs to:
|
The agent uses this rubric after QA testing a website. It needs to:
|
||||||
1. Understand each scoring category and what counts as a deduction
|
1. Understand each scoring category and what counts as a deduction
|
||||||
|
|
@ -289,7 +301,7 @@ Respond with ONLY valid JSON:
|
||||||
Here is the rubric to evaluate:
|
Here is the rubric to evaluate:
|
||||||
|
|
||||||
${section}`);
|
${section}`);
|
||||||
console.log('QA health rubric scores:', JSON.stringify(scores, null, 2));
|
console.log('QA health rubric scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : '');
|
||||||
|
|
||||||
evalCollector?.addTest({
|
evalCollector?.addTest({
|
||||||
name: 'qa/SKILL.md health rubric',
|
name: 'qa/SKILL.md health rubric',
|
||||||
|
|
@ -297,9 +309,10 @@ ${section}`);
|
||||||
tier: 'llm-judge',
|
tier: 'llm-judge',
|
||||||
passed: scores.clarity >= 4 && scores.completeness >= 3 && scores.actionability >= 4,
|
passed: scores.clarity >= 4 && scores.completeness >= 3 && scores.actionability >= 4,
|
||||||
duration_ms: Date.now() - t0,
|
duration_ms: Date.now() - t0,
|
||||||
cost_usd: 0.02,
|
cost_usd: judgeCost(meta),
|
||||||
judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
|
judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
|
||||||
judge_reasoning: scores.reasoning,
|
judge_reasoning: scores.reasoning,
|
||||||
|
costs: judgeCosts(meta),
|
||||||
});
|
});
|
||||||
|
|
||||||
expect(scores.clarity).toBeGreaterThanOrEqual(4);
|
expect(scores.clarity).toBeGreaterThanOrEqual(4);
|
||||||
|
|
@ -332,7 +345,7 @@ describeEval('Cross-skill consistency evals', () => {
|
||||||
extractGrepLines(retroContent, 'retro/SKILL.md'),
|
extractGrepLines(retroContent, 'retro/SKILL.md'),
|
||||||
].join('\n\n');
|
].join('\n\n');
|
||||||
|
|
||||||
const result = await callJudge<{ consistent: boolean; issues: string[]; score: number; reasoning: string }>(`You are evaluating whether multiple skill configuration files implement the same data architecture consistently.
|
const { result, meta } = await callJudge<{ consistent: boolean; issues: string[]; score: number; reasoning: string }>(`You are evaluating whether multiple skill configuration files implement the same data architecture consistently.
|
||||||
|
|
||||||
INTENDED ARCHITECTURE:
|
INTENDED ARCHITECTURE:
|
||||||
- greptile-history has TWO paths: per-project (~/.gstack/projects/{slug}/greptile-history.md) and global (~/.gstack/greptile-history.md)
|
- greptile-history has TWO paths: per-project (~/.gstack/projects/{slug}/greptile-history.md) and global (~/.gstack/greptile-history.md)
|
||||||
|
|
@ -355,7 +368,7 @@ Evaluate consistency. Respond with ONLY valid JSON:
|
||||||
|
|
||||||
score (1-5): 5 = perfectly consistent, 1 = contradictory`);
|
score (1-5): 5 = perfectly consistent, 1 = contradictory`);
|
||||||
|
|
||||||
console.log('Cross-skill consistency:', JSON.stringify(result, null, 2));
|
console.log('Cross-skill consistency:', JSON.stringify(result, null, 2), meta.cached ? '(cached)' : '');
|
||||||
|
|
||||||
evalCollector?.addTest({
|
evalCollector?.addTest({
|
||||||
name: 'cross-skill greptile consistency',
|
name: 'cross-skill greptile consistency',
|
||||||
|
|
@ -363,9 +376,10 @@ score (1-5): 5 = perfectly consistent, 1 = contradictory`);
|
||||||
tier: 'llm-judge',
|
tier: 'llm-judge',
|
||||||
passed: result.consistent && result.score >= 4,
|
passed: result.consistent && result.score >= 4,
|
||||||
duration_ms: Date.now() - t0,
|
duration_ms: Date.now() - t0,
|
||||||
cost_usd: 0.02,
|
cost_usd: judgeCost(meta),
|
||||||
judge_scores: { consistency_score: result.score },
|
judge_scores: { consistency_score: result.score },
|
||||||
judge_reasoning: result.reasoning,
|
judge_reasoning: result.reasoning,
|
||||||
|
costs: judgeCosts(meta),
|
||||||
});
|
});
|
||||||
|
|
||||||
expect(result.consistent).toBe(true);
|
expect(result.consistent).toBe(true);
|
||||||
|
|
@ -392,7 +406,7 @@ describeEval('Baseline score pinning', () => {
|
||||||
const cmdStart = skillContent.indexOf('## Command Reference');
|
const cmdStart = skillContent.indexOf('## Command Reference');
|
||||||
const cmdEnd = skillContent.indexOf('## Tips');
|
const cmdEnd = skillContent.indexOf('## Tips');
|
||||||
const cmdSection = skillContent.slice(cmdStart, cmdEnd);
|
const cmdSection = skillContent.slice(cmdStart, cmdEnd);
|
||||||
const cmdScores = await judge('command reference table', cmdSection);
|
const { result: cmdScores, meta } = await judge('command reference table', cmdSection);
|
||||||
|
|
||||||
for (const dim of ['clarity', 'completeness', 'actionability'] as const) {
|
for (const dim of ['clarity', 'completeness', 'actionability'] as const) {
|
||||||
if (cmdScores[dim] < baselines.command_reference[dim]) {
|
if (cmdScores[dim] < baselines.command_reference[dim]) {
|
||||||
|
|
@ -417,9 +431,10 @@ describeEval('Baseline score pinning', () => {
|
||||||
tier: 'llm-judge',
|
tier: 'llm-judge',
|
||||||
passed,
|
passed,
|
||||||
duration_ms: Date.now() - t0,
|
duration_ms: Date.now() - t0,
|
||||||
cost_usd: 0.02,
|
cost_usd: judgeCost(meta),
|
||||||
judge_scores: { clarity: cmdScores.clarity, completeness: cmdScores.completeness, actionability: cmdScores.actionability },
|
judge_scores: { clarity: cmdScores.clarity, completeness: cmdScores.completeness, actionability: cmdScores.actionability },
|
||||||
judge_reasoning: passed ? 'All scores at or above baseline' : regressions.join('; '),
|
judge_reasoning: passed ? 'All scores at or above baseline' : regressions.join('; '),
|
||||||
|
costs: judgeCosts(meta),
|
||||||
});
|
});
|
||||||
|
|
||||||
if (!passed) {
|
if (!passed) {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue