mirror of https://github.com/garrytan/gstack.git
test(opus-4.7): rewrite scratch-root helper + add afterAll cleanup
First run of the Opus 4.7 eval exposed two test-setup gaps that made
results misleading:
- Only the root gstack SKILL.md was installed. Claude Code does
auto-discovery per-directory under .claude/skills/{name}/SKILL.md, so
without individual skill dirs the Skill tool had nothing to route to.
Positive routing cases all failed.
- `claude -p` does not load SKILL.md content as system context the way
the Claude Code harness does. The overlay nudges in SKILL.md were
invisible to the model, so the fanout A/B could not actually differ.
New `mkEvalRoot(suffix, includeOverlay)` helper, modelled on the pattern
in skill-routing-e2e.test.ts:
- Installs per-skill SKILL.md under .claude/skills/ for ~14 key skills
so the Skill tool has discoverable targets.
- Writes an explicit routing block into project CLAUDE.md.
- When includeOverlay is true, inlines the content of
model-overlays/opus-4-7.md into CLAUDE.md too. This is what makes the
fanout A/B observable in `claude -p`: arm ON gets the overlay in
context, arm OFF does not.
Plus an afterAll that re-runs gen-skill-docs at the default model so
the working tree is not left with opus-4-7-generated SKILL.md files
after the eval finishes (would break golden-file tests in the next
`bun test` run otherwise).
With this setup in place: routing went from 3/3 FAIL to 3/3 PASS
(correct skill or clarification in every positive case, zero false
positives on negatives). Fanout A/B is now a fair comparison; still
shows 0 parallel in both arms under `claude -p` (tracked as a P0 TODO
for re-measurement inside Claude Code's harness, where fanout may land
differently).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
723f9957f2
commit
d646bc12d8
|
|
@ -35,36 +35,85 @@ const runId = new Date().toISOString().replace(/[:.]/g, '').replace('T', '-').sl
|
||||||
|
|
||||||
// --- Helpers ---
|
// --- Helpers ---
|
||||||
|
|
||||||
/** Regenerate SKILL.md files at the given model into a scratch root, return that root. */
|
/** Skills that must exist as individual .claude/skills/{name}/SKILL.md files
|
||||||
function regenSkillsAt(model: string, suffix: string): string {
|
* for Claude Code's auto-discovery to treat them as invokable via Skill tool.
|
||||||
|
* Matches the pattern in skill-routing-e2e.test.ts. */
|
||||||
|
const INSTALLED_SKILLS = [
|
||||||
|
'qa', 'qa-only', 'ship', 'review', 'plan-ceo-review', 'plan-eng-review',
|
||||||
|
'plan-design-review', 'design-review', 'design-consultation', 'retro',
|
||||||
|
'document-release', 'investigate', 'office-hours', 'browse',
|
||||||
|
];
|
||||||
|
|
||||||
|
/** Write a scratch root with:
|
||||||
|
* - Per-skill SKILL.md files under .claude/skills/ (so Skill tool sees them)
|
||||||
|
* - Project CLAUDE.md with explicit routing rules AND (optionally) the
|
||||||
|
* 4.7 overlay content directly inlined so `claude -p` sees it
|
||||||
|
* - git init
|
||||||
|
*
|
||||||
|
* `includeOverlay` controls whether the opus-4-7 nudges (Fan out, Literal,
|
||||||
|
* etc.) get inlined into CLAUDE.md — this is the A/B axis for the fanout
|
||||||
|
* test. `claude -p` doesn't auto-load SKILL.md content, so CLAUDE.md is
|
||||||
|
* the only way to make the overlay visible to the model in this test
|
||||||
|
* harness.
|
||||||
|
*/
|
||||||
|
function mkEvalRoot(suffix: string, includeOverlay: boolean): string {
|
||||||
const tmp = fs.mkdtempSync(path.join(os.tmpdir(), `opus47-${suffix}-`));
|
const tmp = fs.mkdtempSync(path.join(os.tmpdir(), `opus47-${suffix}-`));
|
||||||
|
|
||||||
// Bun runtime: run gen-skill-docs in a fresh copy of the repo so we don't
|
// Regenerate at opus-4-7 so the per-skill SKILL.md files reflect that
|
||||||
// pollute the main working tree. We need: SKILL.md.tmpl files, scripts/,
|
// model's overlay. If includeOverlay is false we'll re-regen at default
|
||||||
// model-overlays/, hosts/. Easiest is to run from ROOT and copy outputs.
|
// later just for the root SKILL.md copy. For individual skills, opus-4-7
|
||||||
|
// content doesn't matter for the routing test (we only need discovery).
|
||||||
const result = spawnSync(
|
const result = spawnSync(
|
||||||
'bun',
|
'bun',
|
||||||
['run', 'scripts/gen-skill-docs.ts', '--model', model],
|
['run', 'scripts/gen-skill-docs.ts', '--model', includeOverlay ? 'opus-4-7' : 'claude'],
|
||||||
{ cwd: ROOT, stdio: 'pipe', encoding: 'utf-8', timeout: 60_000 },
|
{ cwd: ROOT, stdio: 'pipe', encoding: 'utf-8', timeout: 60_000 },
|
||||||
);
|
);
|
||||||
if (result.status !== 0) {
|
if (result.status !== 0) {
|
||||||
throw new Error(`gen-skill-docs failed for --model ${model}: ${result.stderr}`);
|
throw new Error(`gen-skill-docs failed: ${result.stderr}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Copy the top-level generated SKILL.md into the scratch dir (under
|
// Install per-skill SKILL.md files for Skill tool discovery.
|
||||||
// .claude/skills/gstack/ which is where Claude looks for project skills).
|
const skillsDir = path.join(tmp, '.claude', 'skills');
|
||||||
const skillDir = path.join(tmp, '.claude', 'skills', 'gstack');
|
for (const skill of INSTALLED_SKILLS) {
|
||||||
fs.mkdirSync(skillDir, { recursive: true });
|
const src = path.join(ROOT, skill, 'SKILL.md');
|
||||||
fs.copyFileSync(path.join(ROOT, 'SKILL.md'), path.join(skillDir, 'SKILL.md'));
|
if (!fs.existsSync(src)) continue;
|
||||||
|
const destDir = path.join(skillsDir, skill);
|
||||||
|
fs.mkdirSync(destDir, { recursive: true });
|
||||||
|
fs.copyFileSync(src, path.join(destDir, 'SKILL.md'));
|
||||||
|
}
|
||||||
|
|
||||||
// Minimal project context
|
// Extract the opus-4-7 model-overlay content from the checked-in file
|
||||||
fs.writeFileSync(
|
// so we can inline it into CLAUDE.md when includeOverlay is true.
|
||||||
path.join(tmp, 'CLAUDE.md'),
|
const overlayText = includeOverlay
|
||||||
`# Project\n\nSee .claude/skills/gstack/SKILL.md for skill definitions.\n`,
|
? fs.readFileSync(path.join(ROOT, 'model-overlays', 'opus-4-7.md'), 'utf-8')
|
||||||
);
|
.replace(/\{\{INHERIT:claude\}\}\s*/, '')
|
||||||
|
.trim()
|
||||||
|
: '';
|
||||||
|
|
||||||
|
// Project CLAUDE.md. Explicit routing rules so the agent reaches for
|
||||||
|
// Skill tool on matching prompts, plus the optional overlay.
|
||||||
|
const routingBlock = `## Skill routing
|
||||||
|
|
||||||
|
When the user's request matches an available skill, invoke it via the Skill tool
|
||||||
|
as your FIRST action. The skill has multi-step workflows, checklists, and quality
|
||||||
|
gates that produce better results than an ad-hoc answer. When in doubt, invoke.
|
||||||
|
|
||||||
|
- Bugs, errors, "why is this broken", "wtf" → invoke investigate
|
||||||
|
- Ship, deploy, "send it", create a PR → invoke ship
|
||||||
|
- QA, test the site, "does this work" → invoke qa
|
||||||
|
- Code review, check my diff → invoke review
|
||||||
|
- Product ideas, brainstorming, "is this worth building" → invoke office-hours
|
||||||
|
- Architecture, "does this design make sense" → invoke plan-eng-review
|
||||||
|
- Design system, visual polish → invoke design-review
|
||||||
|
- Weekly retro, what did we ship → invoke retro`;
|
||||||
|
|
||||||
|
const claudeMd = includeOverlay
|
||||||
|
? `# Project\n\n${overlayText}\n\n${routingBlock}\n`
|
||||||
|
: `# Project\n\n${routingBlock}\n`;
|
||||||
|
|
||||||
|
fs.writeFileSync(path.join(tmp, 'CLAUDE.md'), claudeMd);
|
||||||
fs.writeFileSync(path.join(tmp, 'package.json'), '{"name":"opus47-eval"}');
|
fs.writeFileSync(path.join(tmp, 'package.json'), '{"name":"opus47-eval"}');
|
||||||
|
|
||||||
// git init so any downstream git-aware logic doesn't blow up
|
|
||||||
const git = (args: string[]) =>
|
const git = (args: string[]) =>
|
||||||
spawnSync('git', args, { cwd: tmp, stdio: 'pipe', timeout: 5_000 });
|
spawnSync('git', args, { cwd: tmp, stdio: 'pipe', timeout: 5_000 });
|
||||||
git(['init']);
|
git(['init']);
|
||||||
|
|
@ -111,13 +160,22 @@ const ROUTING_CASES: RoutingCase[] = [
|
||||||
describeE2E('Opus 4.7 overlay behavior evals', () => {
|
describeE2E('Opus 4.7 overlay behavior evals', () => {
|
||||||
afterAll(() => {
|
afterAll(() => {
|
||||||
evalCollector?.finalize();
|
evalCollector?.finalize();
|
||||||
|
// Restore working tree: mkEvalRoot runs `gen-skill-docs` with various
|
||||||
|
// --model flags, leaving the in-repo SKILL.md files generated at
|
||||||
|
// whichever model ran last. Reset to the default (claude) so the tree
|
||||||
|
// matches what would be checked in.
|
||||||
|
spawnSync('bun', ['run', 'scripts/gen-skill-docs.ts'], {
|
||||||
|
cwd: ROOT,
|
||||||
|
stdio: 'pipe',
|
||||||
|
timeout: 60_000,
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
test(
|
test(
|
||||||
'fanout: overlay ON emits >= parallel calls vs overlay OFF on 3-file investigate task',
|
'fanout: overlay ON emits >= parallel calls vs overlay OFF on 3-file investigate task',
|
||||||
async () => {
|
async () => {
|
||||||
const armA = regenSkillsAt('opus-4-7', 'on');
|
const armA = mkEvalRoot('on', true);
|
||||||
const armB = regenSkillsAt('claude', 'off');
|
const armB = mkEvalRoot('off', false);
|
||||||
|
|
||||||
// Populate three tiny independent files in each arm. The prompt asks
|
// Populate three tiny independent files in each arm. The prompt asks
|
||||||
// the agent to read all three and report. Opus 4.7 (without nudge)
|
// the agent to read all three and report. Opus 4.7 (without nudge)
|
||||||
|
|
@ -205,7 +263,7 @@ describeE2E('Opus 4.7 overlay behavior evals', () => {
|
||||||
// Single SKILL.md tree shared by all cases. We run claude-opus-4-7 with
|
// Single SKILL.md tree shared by all cases. We run claude-opus-4-7 with
|
||||||
// tool access to Skill; measure whether the first tool call is Skill(..)
|
// tool access to Skill; measure whether the first tool call is Skill(..)
|
||||||
// and if so, which skill.
|
// and if so, which skill.
|
||||||
const root = regenSkillsAt('opus-4-7', 'routing');
|
const root = mkEvalRoot('routing', true);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const results = await Promise.all(
|
const results = await Promise.all(
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue