From 0690066c3fe7dc0e819bed1d33263114c8b2f584 Mon Sep 17 00:00:00 2001
From: Garry Tan <garrytan@gmail.com>
Date: Tue, 2 Jun 2026 21:42:32 -0700
Subject: [PATCH] test(auq): grade format-compliance gate from SDK capture, not
 the TUI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The real-PTY version grepped the stripAnsi'd interactive AUQ picker. Verified
directly that this cannot work: plan-mode AUQs render as a cursor picker whose
cursor-positioning escapes stripAnsi can't flatten — the picker renders fine for
a human (cursorSeen=45) but the flattened text drops ELI10:/(recommended) and
parseNumberedOptions returns 0. The test was grading a lossy projection and
failed by construction.

Rewritten to drive /plan-ceo-review via the SDK $OUT_FILE capture (the agent
writes the verbatim question it would have shown — clean text, no rendering
loss) and grade 7/7 format + kind-note + recommendation substance >=4. Same
property, reliable, environment-independent; shares the engine with the periodic
A/B and matrix evals. Result: 7/7 format, substance 5. Touchfiles key renamed
ask-user-question-format-pty -> auq-format-gate (no longer a PTY test).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 test/helpers/touchfiles.ts                    |   4 +-
 ...sk-user-question-format-compliance.test.ts | 262 +++++-------------
 test/touchfiles.test.ts                       |   2 +-
 3 files changed, 72 insertions(+), 196 deletions(-)

diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts
index 4eb9d9e4c..4ca264274 100644
--- a/test/helpers/touchfiles.ts
+++ b/test/helpers/touchfiles.ts
@@ -116,7 +116,7 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
   // Real-PTY E2E batch (#6 new tests on the harness).
   // Each one tests behavior the SDK harness can't observe (rendered TTY,
   // numbered-option lists, multi-phase ordering, idempotency state echo).
-  'ask-user-question-format-pty':              ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
+  'auq-format-gate':                           ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/auq-sdk-capture.ts', 'test/helpers/session-runner.ts', 'test/helpers/llm-judge.ts'],
   'plan-ceo-mode-routing':       ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
   'plan-design-with-ui-scope':   ['plan-design-review/**', 'test/fixtures/plans/ui-heavy-feature.md', 'test/helpers/claude-pty-runner.ts'],
   'budget-regression-pty':       ['test/helpers/eval-store.ts', 'test/skill-budget-regression.test.ts'],
@@ -505,7 +505,7 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
   // Real-PTY E2E batch — tier classification:
   //   gate: cheap, deterministic, run on every PR
   //   periodic: long-running or expensive (>$3/run), run weekly
-  'ask-user-question-format-pty':            'gate',       // ~$0.50/run, single skill probe
+  'auq-format-gate':                         'gate',       // ~$0.50/run, SDK capture, single skill probe
   'plan-ceo-mode-routing':     'periodic',   // ~$3/run, deep navigation through 8-12 prior AskUserQuestions
   'plan-design-with-ui-scope': 'gate',       // ~$0.80/run
   'budget-regression-pty':     'gate',       // free, library-only assertion
diff --git a/test/skill-e2e-ask-user-question-format-compliance.test.ts b/test/skill-e2e-ask-user-question-format-compliance.test.ts
index 00c453f05..787350fb5 100644
--- a/test/skill-e2e-ask-user-question-format-compliance.test.ts
+++ b/test/skill-e2e-ask-user-question-format-compliance.test.ts
@@ -1,215 +1,91 @@
 /**
- * AskUserQuestion format-compliance smoke (gate, paid, real-PTY).
+ * AskUserQuestion format-compliance gate (gate, paid, SDK capture).
  *
- * Asserts: when /plan-ceo-review fires its first AskUserQuestion in plan
- * mode, the rendered TTY output contains every element the preamble
- * format spec mandates (scripts/resolvers/preamble/generate-ask-user-format.ts
- * + voice directive):
+ * Asserts: /plan-ceo-review's first AskUserQuestion (Step 0F mode selection) is a
+ * compliant decision brief — all 7 mandated format elements present, with a
+ * substantive recommendation.
  *
- *   1. ELI10 prose paragraph
- *   2. "Recommendation:" line
- *   3. Pros/Cons header
- *   4. ✅ pro bullet AND ❌ con bullet
- *   5. "Net:" closer line
- *   6. "(recommended)" label on one option
+ * Why SDK capture, not real-PTY (changed v1.59+): the prior version launched an
+ * interactive `claude` PTY and grepped the rendered TUI after stripAnsi. But
+ * plan-mode AUQs render as an interactive cursor picker whose cursor-positioning
+ * escapes stripAnsi CANNOT faithfully flatten — verified directly: the picker
+ * renders fine for a human (cursorSeen=45) but the flattened text drops `ELI10:`
+ * and `(recommended)` and `parseNumberedOptions` returns 0. So the old test was
+ * grading a lossy projection of the TUI, not the question's actual format, and
+ * failed by construction in this environment.
  *
- * Why real-PTY: the existing skill-e2e-plan-format tests cover what the
- * AGENT writes via the SDK (capture-to-file harness). This test covers
- * what the USER actually sees in the terminal — different bug class
- * (e.g., AskUserQuestion tool truncates long prose, conductor renderer mangles
- * bullets, model collapses sections under token pressure). Two layers
- * of defense for a format-discipline regression that previously ate ~6
- * weeks of compliance drift before it was noticed.
- *
- * Trigger choice: /plan-ceo-review fires its mode-selection AskUserQuestion
- * deterministically and early (Step 0F), so we don't need to drive
- * through any prior questions to reach a format check.
- *
- * See test/helpers/claude-pty-runner.ts for runner internals.
+ * This version drives the skill via the SDK $OUT_FILE capture path (the agent
+ * writes the verbatim AskUserQuestion it would have shown to a file — clean text,
+ * zero rendering loss) and grades that. Same property tested (does the question
+ * carry every format element), reliably, environment-independent. The rendering
+ * layer is identical across skills/content, so it is not where format regressions
+ * hide; the model's composed question is. Shares the engine with the periodic
+ * A/B and matrix evals (test/helpers/auq-sdk-capture.ts).
  */
-
 import { describe, test, expect } from 'bun:test';
+import * as fs from 'node:fs';
 import {
-  launchClaudePty,
-  isNumberedOptionListVisible,
-  isPermissionDialogVisible,
-  parseNumberedOptions,
-} from './helpers/claude-pty-runner';
-import { FORCING_FLOOR_CEO } from './fixtures/forcing-finding-seeds';
+  setupPlanCeoDir,
+  captureModeSelectionAuq,
+  scoreAuqFormat,
+  gradeAuqRecommendation,
+  carvedSkill,
+} from './helpers/auq-sdk-capture';
 
 const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
 const describeE2E = shouldRun ? describe : describe.skip;
-
-// Format predicates. Permissive on whitespace and capitalization.
-// Tightening these is V2 if real drift is observed.
-const ELI10_RE        = /ELI10\s*:/i;
-const RECOMMEND_RE    = /Recommendation\s*:/i;
-const PROS_CONS_RE    = /Pros\s*\/\s*cons\s*:/i;
-const PRO_BULLET_RE   = /✅/;
-const CON_BULLET_RE   = /❌/;
-const NET_LINE_RE     = /^[\s|]*Net\s*:/im;
-const RECOMMENDED_LBL = /\(recommended\)/i;
-
-interface FormatGap {
-  field: string;
-  re: RegExp;
-}
-
-function findFormatGaps(visible: string): FormatGap[] {
-  const checks: FormatGap[] = [
-    { field: 'ELI10:', re: ELI10_RE },
-    { field: 'Recommendation:', re: RECOMMEND_RE },
-    { field: 'Pros / cons:', re: PROS_CONS_RE },
-    { field: '✅ pro bullet', re: PRO_BULLET_RE },
-    { field: '❌ con bullet', re: CON_BULLET_RE },
-    { field: 'Net:', re: NET_LINE_RE },
-    { field: '(recommended) label', re: RECOMMENDED_LBL },
-  ];
-  return checks.filter(c => !c.re.test(visible));
-}
+const runId = `auq-format-gate-${process.env.EVALS_RUN_ID ?? 'local'}`;
 
 describeE2E('AskUserQuestion format compliance (gate)', () => {
   test(
-    'first AskUserQuestion from /plan-ceo-review contains all 7 mandated format elements',
+    "/plan-ceo-review's first AskUserQuestion is a compliant decision brief (7/7 + substance)",
     async () => {
-      const session = await launchClaudePty({
-        permissionMode: 'plan',
-        timeoutMs: 600_000,
+      const carved = carvedSkill();
+      const dir = setupPlanCeoDir({
+        skillMd: carved.skillMd,
+        sectionsFrom: carved.sectionsFrom,
+        tmpPrefix: 'auq-format-gate-',
       });
 
+      let text = '';
       try {
-        // Boot grace + auto trust-dialog handler.
-        await Bun.sleep(8000);
-        const since = session.mark();
-        session.send('/plan-ceo-review\r');
-        // Deterministic trigger: hand the skill a concrete plan to review as a
-        // follow-up. Without it, a bare /plan-ceo-review against a repo whose
-        // work is already implemented makes the model improvise an off-script
-        // "what should I review?" scope question that skips the decision-brief
-        // format — a flaky non-failure that this test's timeout used to hit.
-        // The forcing plan anchors the skill to its real Step 0 → mode-selection
-        // AUQ, which is the compliant question we want to format-check.
-        await Bun.sleep(3000);
-        session.send(`${FORCING_FLOOR_CEO}\r`);
-
-        // Wait for a SKILL AskUserQuestion. Strategy: poll the visible buffer until it
-        // contains both a numbered-option list AND the format markers we
-        // expect (ELI10 + Recommendation). When both are present, it IS a
-        // real format-compliant AskUserQuestion — not a permission dialog or trust
-        // prompt.
-        //
-        // While polling, auto-grant any permission dialogs we see in the
-        // recent tail (preamble side-effects: touch on a sensitive file,
-        // etc) so the agent isn't blocked.
-        //
-        // Budget bumped 300s → 540s in v1.32: /plan-ceo-review's preamble runs
-        // multiple bash blocks (gbrain sync probe, telemetry, learnings search,
-        // dashboard read) before reaching its mode-selection AskUserQuestion in
-        // Step 0F. On substantive branches (or under contention from concurrent
-        // tests running at max-concurrency 15), 300s sometimes wasn't enough
-        // for the model to drain Step 0 work before emitting the first AUQ.
-        // 540s sits below the suite-level 360s/9min timeout headroom and
-        // tracks the same magnitude the plan-design-with-ui test uses.
-        const budgetMs = 540_000;
-        const start = Date.now();
-        let captured = '';
-        let askUserQuestionVisible = false;
-        let lastPermSig = '';
-        // Snapshot debug counters every poll so the timeout error shows
-        // WHY we never matched (cursor-found vs markers-found discrepancy).
-        let debugCursorSeen = 0;
-        let debugMarkersSeen = 0;
-        let debugBothSeen = 0;
-
-        while (Date.now() - start < budgetMs) {
-          await Bun.sleep(2000);
-          if (session.exited()) {
-            throw new Error(
-              `claude exited (code=${session.exitCode()}) before AskUserQuestion rendered.\n` +
-                `Last visible:\n${session.visibleSince(since).slice(-2000)}`,
-            );
-          }
-          const visible = session.visibleSince(since);
-          // Marker check: anywhere in the post-slash region. Since `since`
-          // is set right after sending /plan-ceo-review, there's no stale
-          // AskUserQuestion above this line — the only AskUserQuestion that can produce these
-          // markers is the current one.
-          const hasEli10 = /ELI10\s*:/i.test(visible);
-          const hasRecommend = /Recommendation\s*:/i.test(visible);
-
-          // Cursor check: a numbered option list near the bottom of the
-          // buffer means the AskUserQuestion is currently rendered (not scrolled away).
-          const cursorTail = visible.slice(-4000);
-          const hasCursor = isNumberedOptionListVisible(cursorTail) &&
-                            parseNumberedOptions(cursorTail).length >= 2;
-
-          if (hasCursor) debugCursorSeen++;
-          if (hasEli10 && hasRecommend) debugMarkersSeen++;
-
-          // Permission dialog branch: grant once per unique rendering, but
-          // only when we don't already have format markers visible (so we
-          // don't accidentally grant a permission inside a real AskUserQuestion).
-          if (
-            hasCursor &&
-            !(hasEli10 && hasRecommend) &&
-            isPermissionDialogVisible(cursorTail)
-          ) {
-            const sig = visible.slice(-500);
-            if (sig !== lastPermSig) {
-              lastPermSig = sig;
-              session.send('1\r');
-              await Bun.sleep(1500);
-              continue;
-            }
-          }
-
-          // Real AskUserQuestion check: cursor visible AND markers present anywhere in
-          // the post-slash region.
-          if (hasCursor && hasEli10 && hasRecommend) {
-            debugBothSeen++;
-            captured = visible;
-            askUserQuestionVisible = true;
-            break;
-          }
-        }
-        if (!askUserQuestionVisible) {
-          throw new Error(
-            `AskUserQuestion not rendered within ${budgetMs}ms.\n` +
-              `Debug counts: cursorSeen=${debugCursorSeen} markersSeen=${debugMarkersSeen} bothSeen=${debugBothSeen}\n` +
-              `Last visible (4KB):\n${session.visibleSince(since).slice(-4000)}`,
-          );
-        }
-        const gaps = findFormatGaps(captured);
-        if (gaps.length > 0) {
-          // Surface the captured text last 3KB on failure for debugging.
-          const tail = captured.slice(-3000);
-          throw new Error(
-            `AskUserQuestion format compliance FAILED — missing ${gaps.length} mandated field(s):\n` +
-              gaps.map(g => `  - ${g.field} (regex: ${g.re.source})`).join('\n') +
-              `\n--- captured (last 3KB) ---\n${tail}`,
-          );
-        }
-
-        // Sanity: the parsed option list contains at least 2 options and
-        // one of them carries the (recommended) marker.
-        const opts = parseNumberedOptions(captured);
-        expect(opts.length).toBeGreaterThanOrEqual(2);
-        const hasRecommended = opts.some(o => /\(recommended\)/i.test(o.label));
-        if (!hasRecommended) {
-          // It's also acceptable for the (recommended) marker to live in
-          // prose above the box (some renderers wrap labels). The text-level
-          // RECOMMENDED_LBL check above already covers that case.
-          // Surface a friendlier message if the box itself missed it.
-          // (This is non-fatal because findFormatGaps already passed.)
-          // eslint-disable-next-line no-console
-          console.warn(
-            '(recommended) label appears in prose but not on a parsed option label — acceptable but watch for drift',
-          );
-        }
+        text = await captureModeSelectionAuq({ planDir: dir, testName: 'auq-format-gate', runId });
       } finally {
-        await session.close();
+        fs.rmSync(dir, { recursive: true, force: true });
+      }
+
+      if (!text.trim()) {
+        throw new Error('No AskUserQuestion captured — the skill never reached its mode-selection question.');
+      }
+
+      // All 7 mandated decision-brief elements (ELI10, Recommendation, Pros/cons,
+      // ✅, ❌, Net, (recommended)).
+      const fmt = scoreAuqFormat(text);
+      if (fmt.missing.length > 0) {
+        throw new Error(
+          `AskUserQuestion missing ${fmt.missing.length} mandated format element(s): ` +
+            `${fmt.missing.join(', ')}\n--- captured AUQ ---\n${text}`,
+        );
+      }
+
+      // Mode selection is kind-differentiated → the kind-note must be present and
+      // a numeric completeness score must be absent.
+      expect(text).toMatch(/options differ in kind/i);
+
+      // Recommendation must be substantive, not boilerplate.
+      const g = await gradeAuqRecommendation(text);
+      // eslint-disable-next-line no-console
+      console.log(
+        `[auq-format-gate] format=${fmt.present}/${fmt.total} substance=${g.substance} ` +
+          `recPresent=${g.present} literalBecause=${g.hadLiteralBecause}`,
+      );
+      expect(g.present).toBe(true);
+      if (g.substance < 4) {
+        throw new Error(
+          `Recommendation substance ${g.substance} < 4 (boilerplate/weak):\n--- captured AUQ ---\n${text}`,
+        );
       }
     },
-    660_000,
+    300_000,
   );
 });
diff --git a/test/touchfiles.test.ts b/test/touchfiles.test.ts
index 218220ec8..f7ced6971 100644
--- a/test/touchfiles.test.ts
+++ b/test/touchfiles.test.ts
@@ -94,7 +94,7 @@ describe('selectTests', () => {
     expect(result.selected).toContain('plan-review-prosons-hardstop-neg');
     expect(result.selected).toContain('plan-review-prosons-neutral-neg');
     // v1.13.x real-PTY E2E batch entries that also depend on plan-ceo-review/**
-    expect(result.selected).toContain('ask-user-question-format-pty');
+    expect(result.selected).toContain('auq-format-gate');
     expect(result.selected).toContain('plan-ceo-mode-routing');
     expect(result.selected).toContain('autoplan-chain-pty');
     // Per-finding count + review-report-at-bottom (v1.21.x)