From 6c610bf55d413237e2a33c975b84684511bb7a00 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Tue, 2 Jun 2026 22:40:40 -0700 Subject: [PATCH] test: fix carve-broken CI evals (union reads + section fixtures) Two CI eval jobs failed on the carved plan-* skills because they read content that moved into sections/: - llm-judge (skill-llm-eval): runWorkflowJudge sliced SKILL.md between markers like "## Review Sections" / "## CRITICAL RULE" that now live in sections/review-sections.md. The markers vanished from the skeleton, so the judge scored empty/wrong content. Fix: read the skeleton+sections union. Verified: plan-ceo modes / plan-eng sections / plan-design passes all PASS (25/25). - e2e-plan (skill-e2e-plan): setupPlanDir copied only /SKILL.md into the fixture, not sections/. The carved skill's STOP pointed at a section file that was absent, so the model improvised a compressed report table instead of the canonical "| Review | Trigger | Why | Runs | Status | Findings |". Fix: copy sections/ alongside SKILL.md in all 6 setup sites. Verified: report test PASS, canonical table emitted. Co-Authored-By: Claude Opus 4.8 (1M context) --- test/skill-e2e-plan.test.ts | 12 ++++++++++++ test/skill-llm-eval.test.ts | 14 +++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/test/skill-e2e-plan.test.ts b/test/skill-e2e-plan.test.ts index 9b61e9a20..6cfd3e322 100644 --- a/test/skill-e2e-plan.test.ts +++ b/test/skill-e2e-plan.test.ts @@ -61,6 +61,8 @@ We're building a new user dashboard that shows recent activity, notifications, a path.join(ROOT, 'plan-ceo-review', 'SKILL.md'), path.join(planDir, 'plan-ceo-review', 'SKILL.md'), ); + // Carved skills (v2 plan T9): copy sections/ so the review workflow + report template are present. + { const _sec = path.join(ROOT, 'plan-ceo-review', 'sections'); if (fs.existsSync(_sec)) fs.cpSync(_sec, path.join(planDir, 'plan-ceo-review', 'sections'), { recursive: true }); } }); afterAll(() => { @@ -145,6 +147,8 @@ We're building a new user dashboard that shows recent activity, notifications, a path.join(ROOT, 'plan-ceo-review', 'SKILL.md'), path.join(planDir, 'plan-ceo-review', 'SKILL.md'), ); + // Carved skills (v2 plan T9): copy sections/ so the review workflow + report template are present. + { const _sec = path.join(ROOT, 'plan-ceo-review', 'sections'); if (fs.existsSync(_sec)) fs.cpSync(_sec, path.join(planDir, 'plan-ceo-review', 'sections'), { recursive: true }); } }); afterAll(() => { @@ -213,6 +217,8 @@ describeIfSelected('Plan CEO Review Expansion Energy E2E', ['plan-ceo-review-exp path.join(ROOT, 'plan-ceo-review', 'SKILL.md'), path.join(planDir, 'plan-ceo-review', 'SKILL.md'), ); + // Carved skills (v2 plan T9): copy sections/ so the review workflow + report template are present. + { const _sec = path.join(ROOT, 'plan-ceo-review', 'sections'); if (fs.existsSync(_sec)) fs.cpSync(_sec, path.join(planDir, 'plan-ceo-review', 'sections'), { recursive: true }); } }); afterAll(() => { @@ -319,6 +325,8 @@ Replace session-cookie auth with JWT tokens. Currently using express-session + R path.join(ROOT, 'plan-eng-review', 'SKILL.md'), path.join(planDir, 'plan-eng-review', 'SKILL.md'), ); + // Carved skills (v2 plan T9): copy sections/ so the review workflow + report template are present. + { const _sec = path.join(ROOT, 'plan-eng-review', 'sections'); if (fs.existsSync(_sec)) fs.cpSync(_sec, path.join(planDir, 'plan-eng-review', 'sections'), { recursive: true }); } }); afterAll(() => { @@ -415,6 +423,8 @@ export function main() { return Dashboard(); } path.join(ROOT, 'plan-eng-review', 'SKILL.md'), path.join(planDir, 'plan-eng-review', 'SKILL.md'), ); + // Carved skills (v2 plan T9): copy sections/ so the review workflow + report template are present. + { const _sec = path.join(ROOT, 'plan-eng-review', 'sections'); if (fs.existsSync(_sec)) fs.cpSync(_sec, path.join(planDir, 'plan-eng-review', 'sections'), { recursive: true }); } // Set up remote-slug shim and browse shims (plan-eng-review uses remote-slug for artifact path) setupBrowseShims(planDir); @@ -663,6 +673,8 @@ We're building a real-time notification system for our SaaS app. path.join(ROOT, 'plan-eng-review', 'SKILL.md'), path.join(planDir, 'plan-eng-review', 'SKILL.md'), ); + // Carved skills (v2 plan T9): copy sections/ so the review workflow + report template are present. + { const _sec = path.join(ROOT, 'plan-eng-review', 'sections'); if (fs.existsSync(_sec)) fs.cpSync(_sec, path.join(planDir, 'plan-eng-review', 'sections'), { recursive: true }); } }); afterAll(() => { diff --git a/test/skill-llm-eval.test.ts b/test/skill-llm-eval.test.ts index d54e2b551..eb47526b8 100644 --- a/test/skill-llm-eval.test.ts +++ b/test/skill-llm-eval.test.ts @@ -540,7 +540,19 @@ async function runWorkflowJudge(opts: { const defaults = { clarity: 4, completeness: 3, actionability: 4 }; const thresholds = { ...defaults, ...opts.thresholds }; - const content = fs.readFileSync(path.join(ROOT, opts.skillPath), 'utf-8'); + // Read the skeleton + sections UNION so carved skills (v2 plan T9) still + // expose markers that moved into sections/*.md (e.g. plan-eng's "## Review + // Sections" + "## CRITICAL RULE", plan-design's 7 passes). Without this the + // slice markers vanish from the skeleton and the judge scores empty content. + let content = fs.readFileSync(path.join(ROOT, opts.skillPath), 'utf-8'); + const secDir = path.join(ROOT, path.dirname(opts.skillPath), 'sections'); + if (fs.existsSync(secDir)) { + for (const f of fs.readdirSync(secDir).sort()) { + if (f.endsWith('.md') && !f.endsWith('.md.tmpl')) { + content += '\n' + fs.readFileSync(path.join(secDir, f), 'utf-8'); + } + } + } const startIdx = content.indexOf(opts.startMarker); if (startIdx === -1) throw new Error(`Start marker not found in ${opts.skillPath}: "${opts.startMarker}"`);