gstack/test/skill-llm-eval-spec.test.ts

48 lines
1.9 KiB
TypeScript

/**
* /spec LLM-judge eval (periodic, paid).
*
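* Intended assertion: when /spec runs against a fixture vague request, the
* agent produces a spec body that scores >= 8/10 against an LLM judge using
* the contributor's 14 Quality Standards as the rubric.
*
* Status: placeholder. The test below currently only checks that
* spec/SKILL.md.tmpl exists; the LLM-judge run itself is not implemented yet.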
*
* Cost: ~$0.15/run. Periodic — runs weekly via cron or on demand via
* `EVALS=1 EVALS_TIER=periodic bun run test:evals`.
*
* TODO (v1.1): expand fixture set to cover bug / feature / refactor / audit
* framings + project-level prompts (no concrete file mapping, exercises the
* Phase 3 fallback path).
*/
import { describe, test, expect } from 'bun:test';
import * as fs from 'fs';
import * as path from 'path';
// Paid evals are opt-in: any non-empty EVALS value switches the suite on.
const evalsEnabled = Boolean(process.env.EVALS);
// With evals off the suite is still declared, but through describe.skip.
const describeEval = evalsEnabled ? describe : describe.skip;
// Parent of the directory that contains this test file.
const ROOT = path.resolve(import.meta.dir, '..');
// Periodic-tier suite for the /spec LLM-judge eval. It is declared through
// describeEval, so it only executes when EVALS is set in the environment.
describeEval('/spec LLM-judge eval (periodic)', () => {
  // Five-minute per-test budget, in milliseconds.
  const timeoutMs = 300_000;

  test(
    'spec body scores >= 8/10 against 14-standard rubric on fixture request',
    async () => {
      // Precondition: the skill template the eval depends on must be present.
      const templatePath = path.join(ROOT, 'spec', 'SKILL.md.tmpl');
      const templateExists = fs.existsSync(templatePath);
      expect(templateExists).toBe(true);

      // The full LLM-judge run is deferred to a follow-up. For now this file
      // only registers the periodic-tier surface, so that the diff-based
      // selector picks it up whenever spec/ changes. Deterministic invariants
      // belong to the gate tier; the LLM judge measures authored-spec quality,
      // which is non-deterministic by nature.
      //
      // Planned v1.1 flow:
      //   1. Load the fixture prompt from test/fixtures/spec/vague-bug.md.
      //   2. Spawn `claude -p` with /spec loaded, send the prompt, and
      //      role-play the five Phase 1 answers taken from
      //      test/fixtures/spec/vague-bug-answers.json.
      //   3. Capture the final spec body.
      //   4. Hand it to a Claude judge whose prompt encodes the 14 Quality
      //      Standards from spec/SKILL.md.tmpl.
      //   5. Assert that the numeric score is >= 8.

      // Placeholder assertion standing in for step 5 until the flow exists.
      expect(true).toBe(true);
    },
    timeoutMs,
  );
});