gstack/test/skill-e2e-ask-user-question...

/**
 * AskUserQuestion format-compliance gate (gate, paid, SDK capture).
 *
 * Asserts: /plan-ceo-review's first AskUserQuestion (Step 0F mode selection) is a
 * compliant decision brief — all 7 mandated format elements present, with a
 * substantive recommendation.
 *
 * Why SDK capture, not real-PTY (changed v1.59+): the prior version launched an
 * interactive `claude` PTY and grepped the rendered TUI after stripAnsi. But
 * plan-mode AUQs render as an interactive cursor picker whose cursor-positioning
 * escapes stripAnsi CANNOT faithfully flatten — verified directly: the picker
 * renders fine for a human (cursorSeen=45) but the flattened text drops `ELI10:`
 * and `(recommended)` and `parseNumberedOptions` returns 0. So the old test was
 * grading a lossy projection of the TUI, not the question's actual format, and
 * failed by construction in this environment.
 *
 * This version drives the skill via the SDK $OUT_FILE capture path (the agent
 * writes the verbatim AskUserQuestion it would have shown to a file — clean text,
 * zero rendering loss) and grades that. Same property tested (does the question
 * carry every format element), reliably, environment-independent. The rendering
 * layer is identical across skills/content, so it is not where format regressions
 * hide; the model's composed question is. Shares the engine with the periodic
 * A/B and matrix evals (test/helpers/auq-sdk-capture.ts).
 */
import { describe, test, expect } from 'bun:test';
import * as fs from 'node:fs';
import {
  setupPlanCeoDir,
  captureModeSelectionAuq,
  scoreAuqFormat,
  gradeAuqRecommendation,
  carvedSkill,
} from './helpers/auq-sdk-capture';

const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
const describeE2E = shouldRun ? describe : describe.skip;
const runId = `auq-format-gate-${process.env.EVALS_RUN_ID ?? 'local'}`;

describeE2E('AskUserQuestion format compliance (gate)', () => {
  test(
    "/plan-ceo-review's first AskUserQuestion is a compliant decision brief (7/7 + substance)",
    async () => {
      const carved = carvedSkill();
      const dir = setupPlanCeoDir({
        skillMd: carved.skillMd,
        sectionsFrom: carved.sectionsFrom,
        tmpPrefix: 'auq-format-gate-',
      });

      let text = '';
      try {
        text = await captureModeSelectionAuq({ planDir: dir, testName: 'auq-format-gate', runId });
      } finally {
        fs.rmSync(dir, { recursive: true, force: true });
      }

      if (!text.trim()) {
        throw new Error('No AskUserQuestion captured — the skill never reached its mode-selection question.');
      }

      // All 7 mandated decision-brief elements (ELI10, Recommendation, Pros/cons,
      // ✅, ❌, Net, (recommended)).
      const fmt = scoreAuqFormat(text);
      if (fmt.missing.length > 0) {
        throw new Error(
          `AskUserQuestion missing ${fmt.missing.length} mandated format element(s): ` +
            `${fmt.missing.join(', ')}\n--- captured AUQ ---\n${text}`,
        );
      }

      // Mode selection is kind-differentiated → the kind-note must be present and
      // a numeric completeness score must be absent.
      expect(text).toMatch(/options differ in kind/i);

      // Recommendation must be substantive, not boilerplate.
      const g = await gradeAuqRecommendation(text);
      // eslint-disable-next-line no-console
      console.log(
        `[auq-format-gate] format=${fmt.present}/${fmt.total} substance=${g.substance} ` +
          `recPresent=${g.present} literalBecause=${g.hadLiteralBecause}`,
      );
      expect(g.present).toBe(true);
      if (g.substance < 4) {
        throw new Error(
          `Recommendation substance ${g.substance} < 4 (boilerplate/weak):\n--- captured AUQ ---\n${text}`,
        );
      }
    },
    300_000,
  );
});