mirror of https://github.com/garrytan/gstack.git
93 lines
3.7 KiB
TypeScript
93 lines
3.7 KiB
TypeScript
/**
 * /plan-ceo-review section-loading E2E (periodic, paid, SDK capture) — v2 plan
 * Phase B carve backstop. The per-PR guard is the free static test
 * skill-ceo-section-ordering.test.ts; THIS is the behavioral proof that a real
 * agent actually Reads the carved section instead of working from memory.
 *
 * Detection is LOSSLESS. Earlier this test drove a real PTY and scraped the ANSI
 * screen buffer for the `sections/<file>.md` path. That silently saw nothing in a
 * Conductor PTY — cursor-positioned tool renders and an unanswered Step 0 question
 * loop both defeat the regex, so it reported `read: []` even when the agent did the
 * work. It now runs the skill through `claude -p` (the SDK path the AUQ matrix
 * uses) and detects section reads from the tool-use stream (`Read` calls whose
 * file_path contains `sections/review-sections.md`). No rendering layer to mangle.
 *
 * Hermetic, not install-mutating: the freshly-generated worktree skeleton +
 * sections are copied into a throwaway fixture dir and the absolute path is pinned,
 * so the test validates THIS branch's carve without touching the user's active
 * ~/.claude install. (Install-layout linking is covered separately by
 * setup-sections-linking.test.ts.)
 *
 * The agent is told AskUserQuestion is unavailable, so it auto-picks the
 * recommended option through Step 0 and reaches the post-Step-0 STOP-Read. HOLD
 * SCOPE is the simplest mode that still requires the full review section. Cost:
 * ~$1-2/run. Periodic tier.
 */

import { describe, test, expect } from 'bun:test';
import {
  setupSkillDir,
  skillFromWorktree,
  captureSectionReads,
} from './helpers/auq-sdk-capture';

// Gate for this suite: EVALS must be set to a non-empty value AND EVALS_TIER
// must be exactly 'periodic'. Anything else registers the suite as skipped.
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic';
const describeE2E = shouldRun ? describe : describe.skip;
// Run identifier passed to the capture helper. `??` only falls back to 'local'
// when EVALS_RUN_ID is unset; an empty string is kept as-is.
const runId = `plan-ceo-section-loading-${process.env.EVALS_RUN_ID ?? 'local'}`;

// Sections every plan-ceo-review run must consult after Step 0.
const REQUIRED_SECTIONS = ['review-sections.md'];

/**
 * Minimal plan document written into the fixture dir as `PLAN.md`; this is the
 * plan the agent is asked to review. Built line-by-line and joined with '\n';
 * the final empty element makes the document end with a trailing newline.
 */
const PLAN_MD = [
  '# Plan: add an in-memory cache layer',
  '',
  '## Context',
  'Reads hit the DB on every request. Add a process-local LRU cache in front of',
  'the read path to cut DB load.',
  '',
  '## Approach',
  '- Wrap the read repository in a cache that stores the last 1000 keys.',
  '- Invalidate on write.',
  '',
  '## Out of scope',
  'Distributed cache, cross-process coherence.',
  '',
].join('\n');

// Behavioral E2E: run the plan-ceo-review skill against a small fixture plan
// and assert that the agent read every required section file and produced a
// report. Registered via describeE2E, so it is skipped unless the periodic
// eval tier is enabled.
describeE2E('/plan-ceo-review section-loading E2E (periodic, SDK capture)', () => {
  test(
    'a real review Reads the carved section before producing the report',
    async () => {
      // Take the skill file and its sections from this worktree and stage them,
      // together with the PLAN.md fixture, in a directory created by the helper.
      const { skillMd, sectionsFrom } = skillFromWorktree('plan-ceo-review');
      const planDir = setupSkillDir({
        skillName: 'plan-ceo-review',
        skillMd,
        sectionsFrom,
        fixtures: { 'PLAN.md': PLAN_MD },
        tmpPrefix: 'gstack-ceo-secload-',
      });

      const { readSections, reportProduced, output } = await captureSectionReads({
        planDir,
        skillName: 'plan-ceo-review',
        scenario:
          'Review the plan in PLAN.md. Hold the current scope (HOLD SCOPE mode) — do not challenge or expand scope. Run the full CEO review and produce the review report.',
        requiredSections: REQUIRED_SECTIONS,
        // NOTE(review): with the `i` flag, the bare `review` alternative matches
        // almost any transcript, so `reportProduced` is a very weak signal.
        // Presumably intentional leniency, backed by the length check below —
        // confirm against how captureSectionReads applies this marker.
        reportMarker: /GSTACK REVIEW REPORT|COMPLETION SUMMARY|review/i,
        testName: '/plan-ceo-review section-loading',
        runId,
      });

      const missing = REQUIRED_SECTIONS.filter(s => !readSections.has(s));
      // Single object comparison so that a failure prints the report flag, the
      // sections actually read, and the missing ones together. `read` is only
      // there for that diagnostic: any array is accepted.
      expect({ reportProduced, read: [...readSections], missing }).toEqual({
        reportProduced: true,
        read: expect.any(Array),
        missing: [],
      });
      // Guard against an empty pass: the report must have real content.
      expect(output.trim().length).toBeGreaterThan(200);
    },
    // Per-test timeout in milliseconds (6 minutes).
    360_000,
  );
});