mirror of https://github.com/garrytan/gstack.git
92 lines
3.7 KiB
TypeScript
92 lines
3.7 KiB
TypeScript
/**
|
|
* AskUserQuestion format-compliance gate (gate, paid, SDK capture).
|
|
*
|
|
* Asserts: /plan-ceo-review's first AskUserQuestion (Step 0F mode selection) is a
|
|
* compliant decision brief — all 7 mandated format elements present, with a
|
|
* substantive recommendation.
|
|
*
|
|
* Why SDK capture, not real-PTY (changed v1.59+): the prior version launched an
|
|
* interactive `claude` PTY and grepped the rendered TUI after stripAnsi. But
|
|
* plan-mode AUQs render as an interactive cursor picker whose cursor-positioning
|
|
* escapes stripAnsi CANNOT faithfully flatten — verified directly: the picker
|
|
* renders fine for a human (cursorSeen=45) but the flattened text drops `ELI10:`
|
|
* and `(recommended)` and `parseNumberedOptions` returns 0. So the old test was
|
|
* grading a lossy projection of the TUI, not the question's actual format, and
|
|
* failed by construction in this environment.
|
|
*
|
|
* This version drives the skill via the SDK $OUT_FILE capture path (the agent
|
|
* writes the verbatim AskUserQuestion it would have shown to a file — clean text,
|
|
* zero rendering loss) and grades that. Same property tested (does the question
|
|
* carry every format element), reliably, environment-independent. The rendering
|
|
* layer is identical across skills/content, so it is not where format regressions
|
|
* hide; the model's composed question is. Shares the engine with the periodic
|
|
* A/B and matrix evals (test/helpers/auq-sdk-capture.ts).
|
|
*/
|
|
import { describe, test, expect } from 'bun:test';
|
|
import * as fs from 'node:fs';
|
|
import {
|
|
setupPlanCeoDir,
|
|
captureModeSelectionAuq,
|
|
scoreAuqFormat,
|
|
gradeAuqRecommendation,
|
|
carvedSkill,
|
|
} from './helpers/auq-sdk-capture';
|
|
|
|
// This suite only executes when the tier is exactly 'gate' and EVALS is set to
// a non-empty value; in every other case it is registered via describe.skip.
const shouldRun = process.env.EVALS_TIER === 'gate' && Boolean(process.env.EVALS);
const describeE2E = !shouldRun ? describe.skip : describe;

// Identifier handed to the capture helper; 'local' is used when EVALS_RUN_ID
// is not provided.
const runId = 'auq-format-gate-' + (process.env.EVALS_RUN_ID ?? 'local');
|
|
|
|
/**
 * Gate: the first AskUserQuestion (AUQ) captured from /plan-ceo-review must be
 * a compliant decision brief.
 *
 * Steps, in order:
 *   1. Stage a plan directory from the carved skill and capture the AUQ text;
 *      the directory is removed whether or not capture succeeds.
 *   2. Fail if nothing was captured.
 *   3. Fail if scoreAuqFormat reports any missing format element.
 *   4. Require the "options differ in kind" note and reject a numeric
 *      completeness score.
 *   5. Require a recommendation that is present and graded with substance >= 4.
 *
 * Failures in steps 3 and 5 include the captured AUQ text in the error message
 * so the offending question can be read straight from the test output.
 */
describeE2E('AskUserQuestion format compliance (gate)', () => {
  test(
    "/plan-ceo-review's first AskUserQuestion is a compliant decision brief (7/7 + substance)",
    async () => {
      const carved = carvedSkill();
      const dir = setupPlanCeoDir({
        skillMd: carved.skillMd,
        sectionsFrom: carved.sectionsFrom,
        tmpPrefix: 'auq-format-gate-',
      });

      let text = '';
      try {
        text = await captureModeSelectionAuq({ planDir: dir, testName: 'auq-format-gate', runId });
      } finally {
        // Clean up the staged directory even when capture throws.
        fs.rmSync(dir, { recursive: true, force: true });
      }

      if (!text.trim()) {
        throw new Error('No AskUserQuestion captured — the skill never reached its mode-selection question.');
      }

      // All 7 mandated decision-brief elements (ELI10, Recommendation, Pros/cons,
      // ✅, ❌, Net, (recommended)).
      const fmt = scoreAuqFormat(text);
      if (fmt.missing.length > 0) {
        throw new Error(
          `AskUserQuestion missing ${fmt.missing.length} mandated format element(s): ` +
            `${fmt.missing.join(', ')}\n--- captured AUQ ---\n${text}`,
        );
      }

      // Mode selection is kind-differentiated → the kind-note must be present and
      // a numeric completeness score must be absent.
      expect(text).toMatch(/options differ in kind/i);
      // Previously only the kind-note was checked, so a question carrying both
      // the note and a numeric score slipped through.
      // NOTE(review): assumes the score is rendered as "Completeness: N/10" —
      // confirm against the skill's format spec. The pattern requires digits, so
      // a prose mention such as "no completeness score" does not trip it.
      expect(text).not.toMatch(/completeness:\s*\d{1,2}\s*\/\s*10/i);

      // Recommendation must be substantive, not boilerplate.
      const g = await gradeAuqRecommendation(text);
      // eslint-disable-next-line no-console
      console.log(
        `[auq-format-gate] format=${fmt.present}/${fmt.total} substance=${g.substance} ` +
          `recPresent=${g.present} literalBecause=${g.hadLiteralBecause}`,
      );
      expect(g.present).toBe(true);
      if (g.substance < 4) {
        throw new Error(
          `Recommendation substance ${g.substance} < 4 (boilerplate/weak):\n--- captured AUQ ---\n${text}`,
        );
      }
    },
    // Per-test timeout in milliseconds (5 minutes).
    300_000,
  );
});
|