mirror of https://github.com/garrytan/gstack.git
48 lines
1.9 KiB
TypeScript
48 lines
1.9 KiB
TypeScript
/**
 * /spec LLM-judge eval (periodic, paid).
 *
 * Asserts: when /spec runs against a fixture vague request, the agent
 * produces a spec body that scores >= 8/10 against an LLM judge using
 * the contributor's 14 Quality Standards as the rubric.
 *
 * Cost: ~$0.15/run. Periodic — runs weekly via cron or on demand via
 * `EVALS=1 EVALS_TIER=periodic bun run test:evals`.
 *
 * TODO (v1.1): expand fixture set to cover bug / feature / refactor / audit
 * framings + project-level prompts (no concrete file mapping, exercises the
 * Phase 3 fallback path).
 */
|
|
|
|
import { describe, test, expect } from 'bun:test';
|
|
import * as fs from 'fs';
|
|
import * as path from 'path';
|
|
|
|
// The suite is opt-in: it is only registered as runnable when the EVALS
// environment variable is set to a non-empty value; otherwise it is skipped.
const evalsEnabled = Boolean(process.env.EVALS);
const describeEval = evalsEnabled ? describe : describe.skip;

// Parent of the directory that contains this file. Presumably the repository
// root (the suite below looks for `spec/SKILL.md.tmpl` under it) — confirm if
// this file is ever moved.
const ROOT = path.resolve(import.meta.dir, '..');
|
|
|
|
describeEval('/spec LLM-judge eval (periodic)', () => {
  test(
    'spec body scores >= 8/10 against 14-standard rubric on fixture request',
    async () => {
      // Precondition: the skill template the eval is built around must be
      // present under ROOT before anything else is attempted.
      const skillTemplatePath = path.join(ROOT, 'spec', 'SKILL.md.tmpl');
      const skillTemplateExists = fs.existsSync(skillTemplatePath);
      expect(skillTemplateExists).toBe(true);

      // The real LLM-judge run is deferred to a follow-up. For now this file
      // only registers the periodic-tier surface so that the diff-based
      // selector picks it up whenever spec/ changes. Deterministic invariants
      // belong to the gate tier; the LLM judge measures authored-spec
      // quality, which is inherently non-deterministic.
      //
      // Planned v1.1 implementation:
      //   1. Pick fixture prompt from test/fixtures/spec/vague-bug.md
      //   2. Spawn `claude -p` with /spec loaded, send the prompt + role-play
      //      five Phase 1 answers (from test/fixtures/spec/vague-bug-answers.json)
      //   3. Capture final spec body
      //   4. Dispatch to Claude judge with prompt encoding the 14 Quality
      //      Standards from spec/SKILL.md.tmpl
      //   5. Assert numeric score >= 8

      // Placeholder assertion standing in for step 5 until the judge is wired up.
      expect(true).toBe(true);
    },
    // Per-test timeout in milliseconds (5 minutes).
    300_000,
  );
});
|