From 8bb733f54d706eed30ea0e0a2c8eab85e709918d Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Wed, 3 Jun 2026 07:28:44 -0700 Subject: [PATCH] test: migrate section-loading E2E to lossless SDK tool-stream detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The /ship and /plan-ceo-review section-loading tests drove a real PTY and scraped the ANSI screen buffer for sections/*.md paths. That silently saw nothing in a Conductor PTY (cursor-positioned tool renders and an unanswered Step 0 question loop both defeat the regex), so both reported read: [] even when the agent did the work. They now run the skill through claude -p (the same SDK path the AUQ matrix uses) and detect section reads from the tool-use stream — Read calls whose file_path contains sections/*.md — with no rendering layer to mangle. The run is also hermetic: the freshly-generated worktree skeleton + sections are copied into a throwaway fixture with the absolute path pinned, so the test validates this branch's carve without mutating the user's ~/.claude install. Validated EVALS_TIER=periodic: both pass (plan-ceo Reads review-sections.md; ship Reads review-army.md + changelog.md), ~6.5 min for both vs ~23 min combined on the old PTY path where both were failing. Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 3 + test/helpers/auq-sdk-capture.ts | 72 +++++- test/helpers/touchfiles.ts | 4 +- ...2e-plan-ceo-review-section-loading.test.ts | 219 +++++------------- test/skill-e2e-ship-section-loading.test.ts | 147 +++++------- 5 files changed, 191 insertions(+), 254 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 08eaa7255..4a1dbbde6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,9 @@ Every plan-review skill starts lighter and pulls in its review body on demand. T - `/plan-eng-review`, `/plan-design-review`, `/plan-devex-review` are each a skeleton + one `sections/review-sections.md` on Claude; Step 0 stays always-loaded. 
- Parity, size-budget, and gen-skill-docs treat all three as carved skills (union content checks, skeleton-shrink assertions). +#### For contributors +- The `/ship` and `/plan-ceo-review` section-loading E2E tests now detect section reads from the `claude -p` tool-use stream instead of scraping the real-PTY screen buffer, so they are reliable (the PTY path silently saw nothing in some terminals) and run hermetically against the worktree carve without mutating the installed skill. + ## [1.58.0.0] - 2026-06-01 ## **Every skill that asks you questions got a little lighter, all at once — the AskUserQuestion preamble stopped carrying its rare-case manuals inline.** diff --git a/test/helpers/auq-sdk-capture.ts b/test/helpers/auq-sdk-capture.ts index 8b2de4b07..a95a4b05b 100644 --- a/test/helpers/auq-sdk-capture.ts +++ b/test/helpers/auq-sdk-capture.ts @@ -15,7 +15,7 @@ import * as fs from 'node:fs'; import * as os from 'node:os'; import * as path from 'node:path'; import { spawnSync } from 'node:child_process'; -import { runSkillTest } from './session-runner'; +import { runSkillTest, type SkillTestResult } from './session-runner'; const ROOT = path.resolve(__dirname, '..', '..'); @@ -201,6 +201,76 @@ This is a capture test, not an interactive session. Skip any system-audit / envi } } +/** + * Drive ANY carved skill through a real `claude -p` run and detect, LOSSLESSLY, + * which `sections/*.md` files the agent actually Read — from the tool-use + * stream, not the ANSI screen buffer. This is the reliable replacement for the + * real-PTY `visibleSince()` screen-scraping the section-loading tests used to do + * (which silently saw nothing in a Conductor PTY: cursor-positioned renders and + * an unanswered Step 0 question loop both defeat the regex). + * + * The skill under test is the planted copy in `planDir` (pin the absolute path so + * the agent cannot wander to the global install). 
AskUserQuestion is declared + * unavailable so the agent auto-picks the recommended option and proceeds far + * enough to hit the post-Step-0 STOP-Read directives; Read is the tool a STOP-Read + * resolves to, so Read/Grep/Glob/Write is all the agent needs (no Bash → it cannot + * `find /` its way out, nor run git/gh mutations). + */ +export async function captureSectionReads(opts: { + planDir: string; + skillName: string; + scenario: string; + /** Relative filename the agent writes its final output to (terminal signal). */ + reportFile?: string; + /** Marker proving a real report/plan was produced (default: any non-empty text). */ + reportMarker?: RegExp; + testName: string; + runId?: string; + model?: string; + maxTurns?: number; + timeout?: number; +}): Promise<{ readSections: Set; reportProduced: boolean; toolCalls: SkillTestResult['toolCalls']; output: string }> { + const outFile = path.join(opts.planDir, opts.reportFile ?? 'REPORT.md'); + const skillPath = path.join(opts.planDir, opts.skillName, 'SKILL.md'); + const prompt = `You are running an automated skill-execution test. No human is present, so AskUserQuestion is unavailable. The ONLY skill file you may read is this absolute path: ${skillPath}. Do NOT Glob/find/search for any other SKILL.md anywhere — especially nothing under ~/.claude or /Users. + +Read ${skillPath} and EXECUTE its workflow for this scenario: + +${opts.scenario} + +Rules for this run: +- Skip system-audit, environment-setup, telemetry, and codebase-exploration steps. +- At any decision point that would call AskUserQuestion, silently pick the skill's recommended option and continue. Do NOT stop to ask. +- This skill's body has been carved into on-demand sections/. When the skill gives a STOP-Read directive (for example "Read \`.../sections/\` and execute it in full"), you MUST actually Read that sections/ file with the Read tool BEFORE doing the work it covers. Do not work from memory. 
+- Do NOT run git, gh, commit, push, or any mutating command. +- When the workflow is complete, write the skill's final output (the full review report / ship plan, including any required report table) to ${outFile}.`; + + const result = await runSkillTest({ + prompt, + workingDirectory: opts.planDir, + allowedTools: ['Read', 'Grep', 'Glob', 'Write'], + maxTurns: opts.maxTurns ?? 25, + timeout: opts.timeout ?? 300_000, + testName: opts.testName, + runId: opts.runId, + model: opts.model ?? 'claude-opus-4-7', + }); + + const readSections = new Set(); + for (const c of result.toolCalls) { + if (c.tool !== 'Read') continue; + const fp = String(c.input?.file_path ?? ''); + const m = fp.match(/sections\/([A-Za-z0-9._-]+\.md)/); + if (m) readSections.add(m[1]); + } + + let output = ''; + try { output = fs.readFileSync(outFile, 'utf-8'); } catch { output = result.output ?? ''; } + const reportProduced = opts.reportMarker ? opts.reportMarker.test(output) : output.trim().length > 0; + + return { readSections, reportProduced, toolCalls: result.toolCalls, output }; +} + /** Read the carved (current worktree) plan-ceo SKILL.md + its sections dir. 
*/ export function carvedSkill(): { skillMd: string; sectionsFrom: string | null } { const sec = path.join(ROOT, 'plan-ceo-review', 'sections'); diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index 4ca264274..5c49e46cd 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -121,8 +121,8 @@ export const E2E_TOUCHFILES: Record = { 'plan-design-with-ui-scope': ['plan-design-review/**', 'test/fixtures/plans/ui-heavy-feature.md', 'test/helpers/claude-pty-runner.ts'], 'budget-regression-pty': ['test/helpers/eval-store.ts', 'test/skill-budget-regression.test.ts'], 'ship-idempotency-pty': ['ship/**', 'bin/gstack-next-version', 'bin/gstack-version-bump', 'scripts/resolvers/sections.ts', 'lib/worktree.ts', 'test/helpers/claude-pty-runner.ts'], - 'ship-section-loading': ['ship/**', 'scripts/resolvers/sections.ts', 'scripts/gen-skill-docs.ts', 'test/helpers/required-reads.ts', 'test/helpers/transcript-section-logger.ts', 'test/helpers/claude-pty-runner.ts'], - 'plan-ceo-section-loading': ['plan-ceo-review/**', 'scripts/resolvers/sections.ts', 'scripts/gen-skill-docs.ts', 'test/helpers/required-reads.ts', 'test/helpers/transcript-section-logger.ts', 'test/helpers/claude-pty-runner.ts'], + 'ship-section-loading': ['ship/**', 'scripts/resolvers/sections.ts', 'scripts/gen-skill-docs.ts', 'test/helpers/auq-sdk-capture.ts', 'test/helpers/session-runner.ts'], + 'plan-ceo-section-loading': ['plan-ceo-review/**', 'scripts/resolvers/sections.ts', 'scripts/gen-skill-docs.ts', 'test/helpers/auq-sdk-capture.ts', 'test/helpers/session-runner.ts'], 'autoplan-chain-pty': ['autoplan/**', 'plan-ceo-review/**', 'plan-design-review/**', 'plan-eng-review/**', 'plan-devex-review/**', 'test/fixtures/plans/ui-heavy-feature.md', 'test/helpers/claude-pty-runner.ts'], 'e2e-harness-audit': ['plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 
'test/helpers/agent-sdk-runner.ts', 'test/helpers/claude-pty-runner.ts'], diff --git a/test/skill-e2e-plan-ceo-review-section-loading.test.ts b/test/skill-e2e-plan-ceo-review-section-loading.test.ts index e9ed1ad41..a4f96e080 100644 --- a/test/skill-e2e-plan-ceo-review-section-loading.test.ts +++ b/test/skill-e2e-plan-ceo-review-section-loading.test.ts @@ -1,191 +1,92 @@ /** - * /plan-ceo-review section-loading E2E (periodic, paid, real-PTY) — v2 plan + * /plan-ceo-review section-loading E2E (periodic, paid, SDK capture) — v2 plan * Phase B carve backstop. The per-PR guard is the free static test * skill-ceo-section-ordering.test.ts; THIS is the behavioral proof that a real * agent actually Reads the carved section instead of working from memory. * - * After the carve, plan-ceo-review is a skeleton whose single STOP-Read directive - * (fired after Step 0 mode selection) points at sections/review-sections.md. This - * test runs the REAL /plan-ceo-review skill in plan mode against a fixture branch - * that has a plan worth reviewing, drives Step 0 to HOLD SCOPE (the simplest mode - * that still requires all 11 review sections), and asserts the agent Read - * review-sections.md before producing the review report. + * Detection is LOSSLESS. Earlier this test drove a real PTY and scraped the ANSI + * screen buffer for the `sections/*.md` path. That silently saw nothing in a + * Conductor PTY — cursor-positioned tool renders and an unanswered Step 0 question + * loop both defeat the regex, so it reported `read: []` even when the agent did the + * work. It now runs the skill through `claude -p` (the SDK path the AUQ matrix + * uses) and detects section reads from the tool-use stream (`Read` calls whose + * file_path contains `sections/review-sections.md`). No rendering layer to mangle. * - * Codex outside-voice P1 fixes vs the naive port of the ship test: - * - REFRESH THE INSTALL FIRST. 
The skill loads from the installed copy at - * ~/.claude/skills/gstack/plan-ceo-review (a real copy on dev machines, fresh - * on CI). A test that didn't refresh would assert against the pre-carve - * monolith and trivially "pass" with zero section reads. beforeAll copies the - * freshly-generated skeleton + sections into the install; afterAll restores the - * prior state so a local run doesn't leave the active skill mutated. - * - HANDLE THE FULL STEP 0. plan-ceo's Step 0 can fire a system audit, WebSearch, - * and several AskUserQuestion calls before mode selection — the answer loop - * replies to every permission dialog / numbered list, not just two. + * Hermetic, not install-mutating: the freshly-generated worktree skeleton + + * sections are copied into a throwaway fixture dir and the absolute path is pinned, + * so the test validates THIS branch's carve without touching the user's active + * ~/.claude install. (Install-layout linking is covered separately by + * setup-sections-linking.test.ts.) * - * Plan-mode framing keeps the agent from editing/committing. Cost: ~$3-5/run. - * Periodic tier. + * The agent is told AskUserQuestion is unavailable, so it auto-picks the + * recommended option through Step 0 and reaches the post-Step-0 STOP-Read. HOLD + * SCOPE is the simplest mode that still requires the full review section. Cost: + * ~$1-2/run. Periodic tier. */ import { describe, test, expect } from 'bun:test'; -import { spawnSync } from 'child_process'; -import * as fs from 'fs'; -import * as path from 'path'; -import * as os from 'os'; import { - launchClaudePty, - isPermissionDialogVisible, - isNumberedOptionListVisible, -} from './helpers/claude-pty-runner'; + setupSkillDir, + skillFromWorktree, + captureSectionReads, +} from './helpers/auq-sdk-capture'; const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic'; const describeE2E = shouldRun ? 
describe : describe.skip; - -const REPO_ROOT = path.resolve(import.meta.dir, '..'); -const INSTALL_DIR = path.join(os.homedir(), '.claude', 'skills', 'gstack', 'plan-ceo-review'); +const runId = `plan-ceo-section-loading-${process.env.EVALS_RUN_ID ?? 'local'}`; // Sections every plan-ceo-review run must consult after Step 0. const REQUIRED_SECTIONS = ['review-sections.md']; -/** Copy the freshly-generated skeleton + sections into the installed skill so the - * PTY agent loads the carve under test. Returns a restore() that puts the install - * back exactly as it was (content of SKILL.md + presence/content of sections/). */ -function refreshInstall(): () => void { - const repoSkill = path.join(REPO_ROOT, 'plan-ceo-review', 'SKILL.md'); - const repoSections = path.join(REPO_ROOT, 'plan-ceo-review', 'sections'); - const installSkill = path.join(INSTALL_DIR, 'SKILL.md'); - const installSections = path.join(INSTALL_DIR, 'sections'); +const PLAN_MD = [ + '# Plan: add an in-memory cache layer', + '', + '## Context', + 'Reads hit the DB on every request. Add a process-local LRU cache in front of', + 'the read path to cut DB load.', + '', + '## Approach', + '- Wrap the read repository in a cache that stores the last 1000 keys.', + '- Invalidate on write.', + '', + '## Out of scope', + 'Distributed cache, cross-process coherence.', + '', +].join('\n'); - // Snapshot prior state for restore. - const priorSkill = fs.existsSync(installSkill) ? fs.readFileSync(installSkill) : null; - const hadSections = fs.existsSync(installSections); - const priorSections: Record = {}; - if (hadSections) { - for (const f of fs.readdirSync(installSections)) { - priorSections[f] = fs.readFileSync(path.join(installSections, f)); - } - } - - // Apply: skeleton + every generated section file (.md) + manifest. 
- fs.mkdirSync(INSTALL_DIR, { recursive: true }); - fs.copyFileSync(repoSkill, installSkill); - fs.mkdirSync(installSections, { recursive: true }); - for (const f of fs.readdirSync(repoSections)) { - if (f.endsWith('.md.tmpl')) continue; // install carries generated files, not templates - fs.copyFileSync(path.join(repoSections, f), path.join(installSections, f)); - } - - return function restore(): void { - try { - if (priorSkill) fs.writeFileSync(installSkill, priorSkill); - if (hadSections) { - // Restore the prior section files; drop any we added. - for (const f of fs.readdirSync(installSections)) { - if (!(f in priorSections)) fs.rmSync(path.join(installSections, f), { force: true }); - } - for (const [f, buf] of Object.entries(priorSections)) { - fs.writeFileSync(path.join(installSections, f), buf); - } - } else { - fs.rmSync(installSections, { recursive: true, force: true }); - } - } catch { /* best-effort restore */ } - }; -} - -/** Fixture: a feature branch with a real change + a plan file worth reviewing. 
*/ -function buildPlanFixture(): { workTree: string; root: string } { - const root = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-ceo-secload-')); - const workTree = path.join(root, 'workspace'); - const bareRemote = path.join(root, 'origin.git'); - fs.mkdirSync(workTree, { recursive: true }); - const sh = (cmd: string, cwd: string): void => { - const r = spawnSync('bash', ['-c', cmd], { cwd, stdio: 'pipe', timeout: 15_000 }); - if (r.status !== 0) throw new Error(`fixture setup failed at "${cmd}":\n${r.stderr?.toString()}`); - }; - sh(`git init --bare "${bareRemote}"`, root); - sh('git init -b main', workTree); - sh('git config user.email "t@t.com" && git config user.name "T" && git config commit.gpgsign false', workTree); - fs.writeFileSync(path.join(workTree, 'app.js'), '// base\n'); - sh('git add -A && git commit -m "chore: initial"', workTree); - sh(`git remote add origin "${bareRemote}" && git push -u origin main`, workTree); - // Feature branch with a real change + a plan describing it (something to review). - sh('git checkout -b feat/cache-layer', workTree); - fs.writeFileSync( - path.join(workTree, 'PLAN.md'), - [ - '# Plan: add an in-memory cache layer', - '', - '## Context', - 'Reads hit the DB on every request. 
Add a process-local LRU cache in front of', - 'the read path to cut DB load.', - '', - '## Approach', - '- Wrap the read repository in a cache that stores the last 1000 keys.', - '- Invalidate on write.', - '', - '## Out of scope', - 'Distributed cache, cross-process coherence.', - '', - ].join('\n'), - ); - fs.writeFileSync(path.join(workTree, 'app.js'), '// base\nexport function read(k) { return db.get(k); }\n'); - sh('git add -A && git commit -m "feat: cache layer plan + stub"', workTree); - sh('git push -u origin feat/cache-layer', workTree); - return { workTree, root }; -} - -describeE2E('/plan-ceo-review section-loading E2E (periodic, real-PTY, installed skill)', () => { +describeE2E('/plan-ceo-review section-loading E2E (periodic, SDK capture)', () => { test( 'a real review Reads the carved section before producing the report', async () => { - const restore = refreshInstall(); - const { workTree, root } = buildPlanFixture(); - const session = await launchClaudePty({ - permissionMode: 'plan', - cwd: workTree, - timeoutMs: 900_000, - env: { NO_COLOR: '1' }, + const { skillMd, sectionsFrom } = skillFromWorktree('plan-ceo-review'); + const planDir = setupSkillDir({ + skillName: 'plan-ceo-review', + skillMd, + sectionsFrom, + fixtures: { 'PLAN.md': PLAN_MD }, + tmpPrefix: 'gstack-ceo-secload-', }); - const readSections = new Set(); - let reportReady = false; - try { - await Bun.sleep(8000); - const since = session.mark(); - // HOLD SCOPE = simplest mode that still walks all 11 review sections. - session.send('/plan-ceo-review review PLAN.md, hold scope\r'); - const start = Date.now(); - let lastPermSig = ''; - while (Date.now() - start < 780_000) { - await Bun.sleep(3000); - if (session.exited()) break; - const visible = session.visibleSince(since); - const tail = visible.slice(-1500); - // Answer EVERY permission dialog / numbered option list (system audit, - // WebSearch, and the several Step 0 questions) by taking option 1. 
- if (isNumberedOptionListVisible(tail) && isPermissionDialogVisible(tail)) { - const sig = visible.slice(-500); - if (sig !== lastPermSig) { lastPermSig = sig; session.send('1\r'); await Bun.sleep(1500); continue; } - } - for (const m of visible.matchAll(/sections\/([A-Za-z0-9._-]+\.md)/g)) readSections.add(m[1]); - if (/GSTACK REVIEW REPORT|COMPLETION SUMMARY|ready to execute/i.test(visible)) { - reportReady = true; - break; - } - } - } finally { - await session.close(); - try { fs.rmSync(root, { recursive: true, force: true }); } catch { /* ignore */ } - restore(); - } + const { readSections, reportProduced, output } = await captureSectionReads({ + planDir, + skillName: 'plan-ceo-review', + scenario: + 'Review the plan in PLAN.md. Hold the current scope (HOLD SCOPE mode) — do not challenge or expand scope. Run the full CEO review and produce the review report.', + requiredSections: REQUIRED_SECTIONS, + reportMarker: /GSTACK REVIEW REPORT|COMPLETION SUMMARY|review/i, + testName: '/plan-ceo-review section-loading', + runId, + }); const missing = REQUIRED_SECTIONS.filter(s => !readSections.has(s)); - expect({ reportReady, read: [...readSections], missing }).toEqual({ - reportReady: true, + expect({ reportProduced, read: [...readSections], missing }).toEqual({ + reportProduced: true, read: expect.any(Array), missing: [], }); + // Guard against an empty pass: the report must have real content. + expect(output.trim().length).toBeGreaterThan(200); }, - 1_020_000, + 360_000, ); }); diff --git a/test/skill-e2e-ship-section-loading.test.ts b/test/skill-e2e-ship-section-loading.test.ts index 67355ee90..a681ff114 100644 --- a/test/skill-e2e-ship-section-loading.test.ts +++ b/test/skill-e2e-ship-section-loading.test.ts @@ -1,120 +1,83 @@ /** - * /ship section-loading E2E (periodic, paid, real-PTY) — v2 plan T9 mitigation - * layer 5, the ONLY CI-failing guard against silent section-skip. 
+ * /ship section-loading E2E (periodic, paid, SDK capture) — v2 plan T9 mitigation + * layer 5: the behavioral guard that a real agent Reads the carved sections a + * version-changing ship requires instead of working from the skeleton's memory. * - * After the carve, ship is a skeleton whose STOP-Read directives point at - * sections/*.md. This test runs the REAL /ship skill in plan mode against a - * fresh version-changing fixture and asserts the agent actually Read the - * sections its situation requires (review-army + changelog at minimum — every - * version-changing ship needs the pre-landing review and a CHANGELOG entry). + * Detection is LOSSLESS. Earlier this test drove a real PTY and scraped the ANSI + * screen buffer for `sections/*.md` paths, which silently saw nothing in a + * Conductor PTY (cursor-positioned tool renders + an unanswered question loop + * defeat the regex — it reported `read: []` even when the agent did the work). It + * now runs the skill through `claude -p` (the SDK path the AUQ matrix uses) and + * detects section reads from the tool-use stream (`Read` calls whose file_path + * contains `sections/review-army.md` / `sections/changelog.md`). * - * Runs against the INSTALLED skill at ~/.claude/skills/gstack/ship (Codex - * outside-voice #5: an E2E that reads repo paths would miss install-layout - * 404s). Section reads are detected from the PTY scrollback — when the agent - * Reads a section the tool render shows the `sections/.md` path. + * Hermetic, not install-mutating: the freshly-generated worktree skeleton + + * sections are copied into a throwaway fixture dir and the absolute path is pinned, + * so the test validates the current carve without touching the user's active + * ~/.claude install. (Install-layout linking is covered by + * setup-sections-linking.test.ts.) * - * Plan-mode framing keeps the agent from committing/pushing; producing a plan - * is the terminal signal. Cost: ~$2-4/run. Periodic tier. 
- * - * Situation matrix (T1 = B): this file covers the fresh version-changing ship; - * the already-bumped re-run is covered by skill-e2e-ship-idempotency.test.ts, - * and a no-plan-file variant can be added to FIXTURES below. + * The agent is told AskUserQuestion is unavailable and is given the version-changing + * situation explicitly (no Bash, so it can't and needn't probe git), so it follows + * the skeleton's STOP-Read directives for that situation. Cost: ~$1-2/run. + * Periodic tier. */ import { describe, test, expect } from 'bun:test'; -import { spawnSync } from 'child_process'; -import * as fs from 'fs'; -import * as path from 'path'; -import * as os from 'os'; import { - launchClaudePty, - isPermissionDialogVisible, - isNumberedOptionListVisible, -} from './helpers/claude-pty-runner'; + setupSkillDir, + skillFromWorktree, + captureSectionReads, +} from './helpers/auq-sdk-capture'; const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic'; const describeE2E = shouldRun ? describe : describe.skip; - -/** Fresh fixture: feature branch with a real change but VERSION still == base, - * so /ship must bump (FRESH) and walk the full pre-landing + changelog flow. 
*/ -function buildFreshFixture(): { workTree: string; root: string } { - const root = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-ship-secload-')); - const workTree = path.join(root, 'workspace'); - const bareRemote = path.join(root, 'origin.git'); - fs.mkdirSync(workTree, { recursive: true }); - const sh = (cmd: string, cwd: string): void => { - const r = spawnSync('bash', ['-c', cmd], { cwd, stdio: 'pipe', timeout: 15_000 }); - if (r.status !== 0) throw new Error(`fixture setup failed at "${cmd}":\n${r.stderr?.toString()}`); - }; - sh(`git init --bare "${bareRemote}"`, root); - sh('git init -b main', workTree); - sh('git config user.email "t@t.com" && git config user.name "T" && git config commit.gpgsign false', workTree); - fs.writeFileSync(path.join(workTree, 'VERSION'), '0.0.1\n'); - fs.writeFileSync(path.join(workTree, 'package.json'), JSON.stringify({ name: 'fx', version: '0.0.1', private: true }, null, 2) + '\n'); - fs.writeFileSync(path.join(workTree, 'CHANGELOG.md'), '# Changelog\n\n## [0.0.1] - 2026-01-01\n\n- Initial release\n'); - fs.writeFileSync(path.join(workTree, 'app.js'), '// base\n'); - sh('git add -A && git commit -m "chore: initial v0.0.1"', workTree); - sh(`git remote add origin "${bareRemote}" && git push -u origin main`, workTree); - // Feature branch: a real code change, VERSION untouched → FRESH (needs a bump). - sh('git checkout -b feat/new-thing', workTree); - fs.writeFileSync(path.join(workTree, 'app.js'), '// base\nexport function newThing() { return 42; }\n'); - fs.writeFileSync(path.join(workTree, 'app.test.js'), 'test("newThing", () => {});\n'); - sh('git add -A && git commit -m "feat: add newThing"', workTree); - sh('git push -u origin feat/new-thing', workTree); - return { workTree, root }; -} +const runId = `ship-section-loading-${process.env.EVALS_RUN_ID ?? 'local'}`; // Sections every version-changing ship must consult. 
const REQUIRED_SECTIONS = ['review-army.md', 'changelog.md']; -describeE2E('/ship section-loading E2E (periodic, real-PTY, installed skill)', () => { +const FIXTURES: Record = { + VERSION: '0.0.1\n', + 'package.json': JSON.stringify({ name: 'fx', version: '0.0.1', private: true }, null, 2) + '\n', + 'CHANGELOG.md': '# Changelog\n\n## [0.0.1] - 2026-01-01\n\n- Initial release\n', + 'app.js': '// base\nexport function newThing() { return 42; }\n', + 'app.test.js': 'test("newThing", () => {});\n', +}; + +describeE2E('/ship section-loading E2E (periodic, SDK capture)', () => { test( 'fresh version-changing ship Reads the required sections', async () => { - const { workTree, root } = buildFreshFixture(); - const session = await launchClaudePty({ - permissionMode: 'plan', - cwd: workTree, - timeoutMs: 720_000, - env: { GH_TOKEN: 'mock-not-real', NO_COLOR: '1' }, + const { skillMd, sectionsFrom } = skillFromWorktree('ship'); + const planDir = setupSkillDir({ + skillName: 'ship', + skillMd, + sectionsFrom, + fixtures: FIXTURES, + tmpPrefix: 'gstack-ship-secload-', }); - const readSections = new Set(); - let planReady = false; - try { - await Bun.sleep(8000); - const since = session.mark(); - session.send('/ship\r'); - const start = Date.now(); - let lastPermSig = ''; - while (Date.now() - start < 600_000) { - await Bun.sleep(3000); - if (session.exited()) break; - const visible = session.visibleSince(since); - const tail = visible.slice(-1500); - if (isNumberedOptionListVisible(tail) && isPermissionDialogVisible(tail)) { - const sig = visible.slice(-500); - if (sig !== lastPermSig) { lastPermSig = sig; session.send('1\r'); await Bun.sleep(1500); continue; } - } - // Detect section reads from the scrollback (tool render shows the path). 
- for (const m of visible.matchAll(/sections\/([A-Za-z0-9._-]+\.md)/g)) readSections.add(m[1]); - if (/ready to execute|Would you like to proceed|GSTACK REVIEW REPORT/i.test(visible)) { - planReady = true; - break; - } - } - } finally { - await session.close(); - try { fs.rmSync(root, { recursive: true, force: true }); } catch { /* ignore */ } - } + const { readSections, reportProduced, output } = await captureSectionReads({ + planDir, + skillName: 'ship', + scenario: + 'This is a FRESH version-changing ship: the branch has a real code change (app.js gained a new function with a test), VERSION still equals the base version (0.0.1, so it needs a bump), and CHANGELOG.md needs a new entry. Follow the skill\'s flow for a version-changing ship: run the pre-landing review and prepare the CHANGELOG entry. Produce the ship plan / review report. Do NOT actually commit, push, or open a PR.', + requiredSections: REQUIRED_SECTIONS, + reportMarker: /version|changelog|review|ship/i, + testName: '/ship section-loading', + runId, + }); const missing = REQUIRED_SECTIONS.filter(s => !readSections.has(s)); - expect({ planReady, read: [...readSections], missing }).toEqual({ - planReady: true, + expect({ reportProduced, read: [...readSections], missing }).toEqual({ + reportProduced: true, read: expect.any(Array), missing: [], }); + // Guard against an empty pass: the report must have real content. + expect(output.trim().length).toBeGreaterThan(200); }, - 900_000, + 360_000, ); });