From 0690066c3fe7dc0e819bed1d33263114c8b2f584 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Tue, 2 Jun 2026 21:42:32 -0700 Subject: [PATCH] test(auq): grade format-compliance gate from SDK capture, not the TUI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The real-PTY version grepped the stripAnsi'd interactive AUQ picker. Verified directly that this cannot work: plan-mode AUQs render as a cursor picker whose cursor-positioning escapes stripAnsi can't flatten — the picker renders fine for a human (cursorSeen=45) but the flattened text drops ELI10:/(recommended) and parseNumberedOptions returns 0. The test was grading a lossy projection and failed by construction. Rewritten to drive /plan-ceo-review via the SDK $OUT_FILE capture (the agent writes the verbatim question it would have shown — clean text, no rendering loss) and grade 7/7 format + kind-note + recommendation substance >=4. Same property, reliable, environment-independent; shares the engine with the periodic A/B and matrix evals. Result: 7/7 format, substance 5. Touchfiles key renamed ask-user-question-format-pty -> auq-format-gate (no longer a PTY test). Co-Authored-By: Claude Opus 4.8 (1M context) --- test/helpers/touchfiles.ts | 4 +- ...sk-user-question-format-compliance.test.ts | 262 +++++------------- test/touchfiles.test.ts | 2 +- 3 files changed, 72 insertions(+), 196 deletions(-) diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index 4eb9d9e4c..4ca264274 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -116,7 +116,7 @@ export const E2E_TOUCHFILES: Record = { // Real-PTY E2E batch (#6 new tests on the harness). // Each one tests behavior the SDK harness can't observe (rendered TTY, // numbered-option lists, multi-phase ordering, idempotency state echo). - 'ask-user-question-format-pty': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], + 'auq-format-gate': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/auq-sdk-capture.ts', 'test/helpers/session-runner.ts', 'test/helpers/llm-judge.ts'], 'plan-ceo-mode-routing': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], 'plan-design-with-ui-scope': ['plan-design-review/**', 'test/fixtures/plans/ui-heavy-feature.md', 'test/helpers/claude-pty-runner.ts'], 'budget-regression-pty': ['test/helpers/eval-store.ts', 'test/skill-budget-regression.test.ts'], @@ -505,7 +505,7 @@ export const E2E_TIERS: Record = { // Real-PTY E2E batch — tier classification: // gate: cheap, deterministic, run on every PR // periodic: long-running or expensive (>$3/run), run weekly - 'ask-user-question-format-pty': 'gate', // ~$0.50/run, single skill probe + 'auq-format-gate': 'gate', // ~$0.50/run, SDK capture, single skill probe 'plan-ceo-mode-routing': 'periodic', // ~$3/run, deep navigation through 8-12 prior AskUserQuestions 'plan-design-with-ui-scope': 'gate', // ~$0.80/run 'budget-regression-pty': 'gate', // free, library-only assertion diff --git a/test/skill-e2e-ask-user-question-format-compliance.test.ts b/test/skill-e2e-ask-user-question-format-compliance.test.ts index 00c453f05..787350fb5 100644 --- a/test/skill-e2e-ask-user-question-format-compliance.test.ts +++ b/test/skill-e2e-ask-user-question-format-compliance.test.ts @@ -1,215 +1,91 @@ /** - * AskUserQuestion format-compliance smoke (gate, paid, real-PTY). + * AskUserQuestion format-compliance gate (gate, paid, SDK capture). * - * Asserts: when /plan-ceo-review fires its first AskUserQuestion in plan - * mode, the rendered TTY output contains every element the preamble - * format spec mandates (scripts/resolvers/preamble/generate-ask-user-format.ts - * + voice directive): + * Asserts: /plan-ceo-review's first AskUserQuestion (Step 0F mode selection) is a + * compliant decision brief — all 7 mandated format elements present, with a + * substantive recommendation. * - * 1. ELI10 prose paragraph - * 2. "Recommendation:" line - * 3. Pros/Cons header - * 4. ✅ pro bullet AND ❌ con bullet - * 5. "Net:" closer line - * 6. "(recommended)" label on one option + * Why SDK capture, not real-PTY (changed v1.59+): the prior version launched an + * interactive `claude` PTY and grepped the rendered TUI after stripAnsi. But + * plan-mode AUQs render as an interactive cursor picker whose cursor-positioning + * escapes stripAnsi CANNOT faithfully flatten — verified directly: the picker + * renders fine for a human (cursorSeen=45) but the flattened text drops `ELI10:` + * and `(recommended)` and `parseNumberedOptions` returns 0. So the old test was + * grading a lossy projection of the TUI, not the question's actual format, and + * failed by construction in this environment. * - * Why real-PTY: the existing skill-e2e-plan-format tests cover what the - * AGENT writes via the SDK (capture-to-file harness). This test covers - * what the USER actually sees in the terminal — different bug class - * (e.g., AskUserQuestion tool truncates long prose, conductor renderer mangles - * bullets, model collapses sections under token pressure). Two layers - * of defense for a format-discipline regression that previously ate ~6 - * weeks of compliance drift before it was noticed. - * - * Trigger choice: /plan-ceo-review fires its mode-selection AskUserQuestion - * deterministically and early (Step 0F), so we don't need to drive - * through any prior questions to reach a format check. - * - * See test/helpers/claude-pty-runner.ts for runner internals. + * This version drives the skill via the SDK $OUT_FILE capture path (the agent + * writes the verbatim AskUserQuestion it would have shown to a file — clean text, + * zero rendering loss) and grades that. Same property tested (does the question + * carry every format element), reliably, environment-independent. The rendering + * layer is identical across skills/content, so it is not where format regressions + * hide; the model's composed question is. Shares the engine with the periodic + * A/B and matrix evals (test/helpers/auq-sdk-capture.ts). */ - import { describe, test, expect } from 'bun:test'; +import * as fs from 'node:fs'; import { - launchClaudePty, - isNumberedOptionListVisible, - isPermissionDialogVisible, - parseNumberedOptions, -} from './helpers/claude-pty-runner'; -import { FORCING_FLOOR_CEO } from './fixtures/forcing-finding-seeds'; + setupPlanCeoDir, + captureModeSelectionAuq, + scoreAuqFormat, + gradeAuqRecommendation, + carvedSkill, +} from './helpers/auq-sdk-capture'; const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; const describeE2E = shouldRun ? describe : describe.skip; - -// Format predicates. Permissive on whitespace and capitalization. -// Tightening these is V2 if real drift is observed. -const ELI10_RE = /ELI10\s*:/i; -const RECOMMEND_RE = /Recommendation\s*:/i; -const PROS_CONS_RE = /Pros\s*\/\s*cons\s*:/i; -const PRO_BULLET_RE = /✅/; -const CON_BULLET_RE = /❌/; -const NET_LINE_RE = /^[\s|]*Net\s*:/im; -const RECOMMENDED_LBL = /\(recommended\)/i; - -interface FormatGap { - field: string; - re: RegExp; -} - -function findFormatGaps(visible: string): FormatGap[] { - const checks: FormatGap[] = [ - { field: 'ELI10:', re: ELI10_RE }, - { field: 'Recommendation:', re: RECOMMEND_RE }, - { field: 'Pros / cons:', re: PROS_CONS_RE }, - { field: '✅ pro bullet', re: PRO_BULLET_RE }, - { field: '❌ con bullet', re: CON_BULLET_RE }, - { field: 'Net:', re: NET_LINE_RE }, - { field: '(recommended) label', re: RECOMMENDED_LBL }, - ]; - return checks.filter(c => !c.re.test(visible)); -} +const runId = `auq-format-gate-${process.env.EVALS_RUN_ID ?? 'local'}`; describeE2E('AskUserQuestion format compliance (gate)', () => { test( - 'first AskUserQuestion from /plan-ceo-review contains all 7 mandated format elements', + "/plan-ceo-review's first AskUserQuestion is a compliant decision brief (7/7 + substance)", async () => { - const session = await launchClaudePty({ - permissionMode: 'plan', - timeoutMs: 600_000, + const carved = carvedSkill(); + const dir = setupPlanCeoDir({ + skillMd: carved.skillMd, + sectionsFrom: carved.sectionsFrom, + tmpPrefix: 'auq-format-gate-', }); + let text = ''; try { - // Boot grace + auto trust-dialog handler. - await Bun.sleep(8000); - const since = session.mark(); - session.send('/plan-ceo-review\r'); - // Deterministic trigger: hand the skill a concrete plan to review as a - // follow-up. Without it, a bare /plan-ceo-review against a repo whose - // work is already implemented makes the model improvise an off-script - // "what should I review?" scope question that skips the decision-brief - // format — a flaky non-failure that this test's timeout used to hit. - // The forcing plan anchors the skill to its real Step 0 → mode-selection - // AUQ, which is the compliant question we want to format-check. - await Bun.sleep(3000); - session.send(`${FORCING_FLOOR_CEO}\r`); - - // Wait for a SKILL AskUserQuestion. Strategy: poll the visible buffer until it - // contains both a numbered-option list AND the format markers we - // expect (ELI10 + Recommendation). When both are present, it IS a - // real format-compliant AskUserQuestion — not a permission dialog or trust - // prompt. - // - // While polling, auto-grant any permission dialogs we see in the - // recent tail (preamble side-effects: touch on a sensitive file, - // etc) so the agent isn't blocked. - // - // Budget bumped 300s → 540s in v1.32: /plan-ceo-review's preamble runs - // multiple bash blocks (gbrain sync probe, telemetry, learnings search, - // dashboard read) before reaching its mode-selection AskUserQuestion in - // Step 0F. On substantive branches (or under contention from concurrent - // tests running at max-concurrency 15), 300s sometimes wasn't enough - // for the model to drain Step 0 work before emitting the first AUQ. - // 540s sits below the suite-level 360s/9min timeout headroom and - // tracks the same magnitude the plan-design-with-ui test uses. - const budgetMs = 540_000; - const start = Date.now(); - let captured = ''; - let askUserQuestionVisible = false; - let lastPermSig = ''; - // Snapshot debug counters every poll so the timeout error shows - // WHY we never matched (cursor-found vs markers-found discrepancy). - let debugCursorSeen = 0; - let debugMarkersSeen = 0; - let debugBothSeen = 0; - - while (Date.now() - start < budgetMs) { - await Bun.sleep(2000); - if (session.exited()) { - throw new Error( - `claude exited (code=${session.exitCode()}) before AskUserQuestion rendered.\n` + - `Last visible:\n${session.visibleSince(since).slice(-2000)}`, - ); - } - const visible = session.visibleSince(since); - // Marker check: anywhere in the post-slash region. Since `since` - // is set right after sending /plan-ceo-review, there's no stale - // AskUserQuestion above this line — the only AskUserQuestion that can produce these - // markers is the current one. - const hasEli10 = /ELI10\s*:/i.test(visible); - const hasRecommend = /Recommendation\s*:/i.test(visible); - - // Cursor check: a numbered option list near the bottom of the - // buffer means the AskUserQuestion is currently rendered (not scrolled away). - const cursorTail = visible.slice(-4000); - const hasCursor = isNumberedOptionListVisible(cursorTail) && - parseNumberedOptions(cursorTail).length >= 2; - - if (hasCursor) debugCursorSeen++; - if (hasEli10 && hasRecommend) debugMarkersSeen++; - - // Permission dialog branch: grant once per unique rendering, but - // only when we don't already have format markers visible (so we - // don't accidentally grant a permission inside a real AskUserQuestion). - if ( - hasCursor && - !(hasEli10 && hasRecommend) && - isPermissionDialogVisible(cursorTail) - ) { - const sig = visible.slice(-500); - if (sig !== lastPermSig) { - lastPermSig = sig; - session.send('1\r'); - await Bun.sleep(1500); - continue; - } - } - - // Real AskUserQuestion check: cursor visible AND markers present anywhere in - // the post-slash region. - if (hasCursor && hasEli10 && hasRecommend) { - debugBothSeen++; - captured = visible; - askUserQuestionVisible = true; - break; - } - } - if (!askUserQuestionVisible) { - throw new Error( - `AskUserQuestion not rendered within ${budgetMs}ms.\n` + - `Debug counts: cursorSeen=${debugCursorSeen} markersSeen=${debugMarkersSeen} bothSeen=${debugBothSeen}\n` + - `Last visible (4KB):\n${session.visibleSince(since).slice(-4000)}`, - ); - } - const gaps = findFormatGaps(captured); - if (gaps.length > 0) { - // Surface the captured text last 3KB on failure for debugging. - const tail = captured.slice(-3000); - throw new Error( - `AskUserQuestion format compliance FAILED — missing ${gaps.length} mandated field(s):\n` + - gaps.map(g => ` - ${g.field} (regex: ${g.re.source})`).join('\n') + - `\n--- captured (last 3KB) ---\n${tail}`, - ); - } - - // Sanity: the parsed option list contains at least 2 options and - // one of them carries the (recommended) marker. - const opts = parseNumberedOptions(captured); - expect(opts.length).toBeGreaterThanOrEqual(2); - const hasRecommended = opts.some(o => /\(recommended\)/i.test(o.label)); - if (!hasRecommended) { - // It's also acceptable for the (recommended) marker to live in - // prose above the box (some renderers wrap labels). The text-level - // RECOMMENDED_LBL check above already covers that case. - // Surface a friendlier message if the box itself missed it. - // (This is non-fatal because findFormatGaps already passed.) - // eslint-disable-next-line no-console - console.warn( - '(recommended) label appears in prose but not on a parsed option label — acceptable but watch for drift', - ); - } + text = await captureModeSelectionAuq({ planDir: dir, testName: 'auq-format-gate', runId }); } finally { - await session.close(); + fs.rmSync(dir, { recursive: true, force: true }); + } + + if (!text.trim()) { + throw new Error('No AskUserQuestion captured — the skill never reached its mode-selection question.'); + } + + // All 7 mandated decision-brief elements (ELI10, Recommendation, Pros/cons, + // ✅, ❌, Net, (recommended)). + const fmt = scoreAuqFormat(text); + if (fmt.missing.length > 0) { + throw new Error( + `AskUserQuestion missing ${fmt.missing.length} mandated format element(s): ` + + `${fmt.missing.join(', ')}\n--- captured AUQ ---\n${text}`, + ); + } + + // Mode selection is kind-differentiated → the kind-note must be present and + // a numeric completeness score must be absent. + expect(text).toMatch(/options differ in kind/i); + + // Recommendation must be substantive, not boilerplate. + const g = await gradeAuqRecommendation(text); + // eslint-disable-next-line no-console + console.log( + `[auq-format-gate] format=${fmt.present}/${fmt.total} substance=${g.substance} ` + + `recPresent=${g.present} literalBecause=${g.hadLiteralBecause}`, + ); + expect(g.present).toBe(true); + if (g.substance < 4) { + throw new Error( + `Recommendation substance ${g.substance} < 4 (boilerplate/weak):\n--- captured AUQ ---\n${text}`, + ); } }, - 660_000, + 300_000, ); }); diff --git a/test/touchfiles.test.ts b/test/touchfiles.test.ts index 218220ec8..f7ced6971 100644 --- a/test/touchfiles.test.ts +++ b/test/touchfiles.test.ts @@ -94,7 +94,7 @@ describe('selectTests', () => { expect(result.selected).toContain('plan-review-prosons-hardstop-neg'); expect(result.selected).toContain('plan-review-prosons-neutral-neg'); // v1.13.x real-PTY E2E batch entries that also depend on plan-ceo-review/** - expect(result.selected).toContain('ask-user-question-format-pty'); + expect(result.selected).toContain('auq-format-gate'); expect(result.selected).toContain('plan-ceo-mode-routing'); expect(result.selected).toContain('autoplan-chain-pty'); // Per-finding count + review-report-at-bottom (v1.21.x)