gstack/test/skill-e2e-ask-user-question...

92 lines
3.7 KiB
TypeScript

/**
* AskUserQuestion format-compliance gate (gate, paid, SDK capture).
*
* Asserts: /plan-ceo-review's first AskUserQuestion (Step 0F mode selection) is a
* compliant decision brief — all 7 mandated format elements present, with a
* substantive recommendation.
*
* Why SDK capture, not real-PTY (changed v1.59+): the prior version launched an
* interactive `claude` PTY and grepped the rendered TUI after stripAnsi. But
* plan-mode AUQs render as an interactive cursor picker whose cursor-positioning
* escapes stripAnsi CANNOT faithfully flatten — verified directly: the picker
* renders fine for a human (cursorSeen=45) but the flattened text drops `ELI10:`
* and `(recommended)` and `parseNumberedOptions` returns 0. So the old test was
* grading a lossy projection of the TUI, not the question's actual format, and
* failed by construction in this environment.
*
* This version drives the skill via the SDK $OUT_FILE capture path (the agent
* writes the verbatim AskUserQuestion it would have shown to a file — clean text,
* zero rendering loss) and grades that. Same property tested (does the question
* carry every format element), reliably, environment-independent. The rendering
* layer is identical across skills/content, so it is not where format regressions
* hide; the model's composed question is. Shares the engine with the periodic
* A/B and matrix evals (test/helpers/auq-sdk-capture.ts).
*/
import { describe, test, expect } from 'bun:test';
import * as fs from 'node:fs';
import {
setupPlanCeoDir,
captureModeSelectionAuq,
scoreAuqFormat,
gradeAuqRecommendation,
carvedSkill,
} from './helpers/auq-sdk-capture';
// Paid gate-tier eval: the suite is enabled only when EVALS is set to a
// non-empty value and EVALS_TIER is exactly 'gate'.
const { EVALS, EVALS_TIER, EVALS_RUN_ID } = process.env;
const shouldRun = EVALS_TIER === 'gate' && Boolean(EVALS);
// Register the suite normally when enabled; otherwise register it as skipped.
const describeE2E = !shouldRun ? describe.skip : describe;
// Run identifier passed to the capture helper; 'local' when EVALS_RUN_ID is unset.
const runId = `auq-format-gate-${EVALS_RUN_ID ?? 'local'}`;
/**
 * Gate suite: captures the AskUserQuestion returned by captureModeSelectionAuq
 * for a carved /plan-ceo-review skill directory and grades the captured text.
 *
 * Checks, in order:
 *   1. a non-blank question was captured;
 *   2. scoreAuqFormat reports no missing format elements;
 *   3. the "options differ in kind" note is present and no numeric
 *      completeness score appears;
 *   4. gradeAuqRecommendation reports a recommendation that is present and
 *      has substance >= 4.
 *
 * Failures that benefit from context throw an Error that embeds the captured
 * question, so the gate log shows exactly what was graded.
 */
describeE2E('AskUserQuestion format compliance (gate)', () => {
  test(
    "/plan-ceo-review's first AskUserQuestion is a compliant decision brief (7/7 + substance)",
    async () => {
      const carved = carvedSkill();
      const dir = setupPlanCeoDir({
        skillMd: carved.skillMd,
        sectionsFrom: carved.sectionsFrom,
        tmpPrefix: 'auq-format-gate-',
      });

      let text = '';
      try {
        text = await captureModeSelectionAuq({ planDir: dir, testName: 'auq-format-gate', runId });
      } finally {
        // Always remove the scratch directory, even when the capture throws.
        fs.rmSync(dir, { recursive: true, force: true });
      }

      if (!text.trim()) {
        throw new Error('No AskUserQuestion captured — the skill never reached its mode-selection question.');
      }

      // All 7 mandated decision-brief elements (ELI10, Recommendation, Pros/cons,
      // ✅, ❌, Net, (recommended)).
      const fmt = scoreAuqFormat(text);
      if (fmt.missing.length > 0) {
        throw new Error(
          `AskUserQuestion missing ${fmt.missing.length} mandated format element(s): ` +
            `${fmt.missing.join(', ')}\n--- captured AUQ ---\n${text}`,
        );
      }

      // Mode selection is kind-differentiated → the kind-note must be present and
      // a numeric completeness score must be absent.
      expect(text).toMatch(/options differ in kind/i);
      // Previously only the kind-note half of the rule above was enforced; a
      // question carrying both the note and a numeric score passed the gate.
      // NOTE(review): the score is assumed to render as `Completeness: N/10` —
      // confirm against the skill's decision-brief format.
      const numericCompleteness = /completeness:\s*\d{1,2}\s*\/\s*10/i;
      if (numericCompleteness.test(text)) {
        throw new Error(
          'Kind-differentiated AskUserQuestion must not carry a numeric completeness score:' +
            `\n--- captured AUQ ---\n${text}`,
        );
      }

      // Recommendation must be substantive, not boilerplate.
      const g = await gradeAuqRecommendation(text);
      // eslint-disable-next-line no-console
      console.log(
        `[auq-format-gate] format=${fmt.present}/${fmt.total} substance=${g.substance} ` +
          `recPresent=${g.present} literalBecause=${g.hadLiteralBecause}`,
      );
      expect(g.present).toBe(true);
      if (g.substance < 4) {
        throw new Error(
          `Recommendation substance ${g.substance} < 4 (boilerplate/weak):\n--- captured AUQ ---\n${text}`,
        );
      }
    },
    // Per-test timeout in milliseconds (5 minutes).
    300_000,
  );
});