/**
 * Hermetic-isolation canaries (gate tier, ~$0.02 each, deterministic).
 *
 * Two tests that make the hermeticity claim FALSIFIABLE instead of asserted:
 *
 * 1. `hermetic-canary` — env + auth isolation. Plants contamination vars in
 *    the TEST process env, spawns a child through the real runner, and
 *    asserts from the Bash tool_result in the stream-json transcript (never
 *    the model's prose — prose can hallucinate) that the child saw a temp
 *    `gstack-hermetic-<suffix>/.claude` config dir, a temp GSTACK_HOME, and
 *    none of the planted contamination. Auth hermeticity: hard-fails when
 *    ANTHROPIC_API_KEY is absent (a skip here would be a silent hole), and
 *    asserts total_cost_usd > 0 — subscription/keychain OAuth reports cost 0,
 *    so nonzero cost is the discriminator that the API key actually paid
 *    (verified empirically 2026-06-12; the result record exposes no
 *    auth-source field, so cost is the best available signal — residual
 *    gap documented in the plan).
 *
 * 2. `hermetic-sentinel` — config isolation, the poisoned-operator probe.
 *    Builds a FAKE operator config tree (user CLAUDE.md + an mcpServers
 *    entry) and points the test process's CLAUDE_CONFIG_DIR at it. If the
 *    hermetic redirect ever breaks, the child loads that poisoned tree and
 *    the probes fire: init.mcp_servers would list the planted server
 *    (semantic proof that --strict-mcp-config + the redirect yield ZERO MCP
 *    servers, not an assumption), and the child's config dir would contain
 *    the poisoned CLAUDE.md.
 *
 * Both canaries double as the seed-schema / CLI version-skew tripwire: a
 * claude release that changes first-run behavior or config discovery fails
 * here first, loudly, in the gate tier.
 */
import { expect, afterAll } from 'bun:test';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
import { runSkillTest } from './helpers/session-runner';
import {
  describeIfSelected,
  testIfSelected,
  createEvalCollector,
  finalizeEvalCollector,
  recordE2E,
  runId,
  logCost,
} from './helpers/e2e-helpers';

const evalCollector = createEvalCollector('e2e-hermetic');

// Cheap + deterministic: the canaries assert environment facts, not model
// quality, so the smallest model is the right tool.
const CANARY_MODEL = 'claude-haiku-4-5-20251001';

/**
 * Extract concatenated tool_result text from the stream-json transcript.
 *
 * Walks every `user` event, collects each `tool_result` item's content —
 * either a plain string or the `text` entries of a content array — and joins
 * the pieces with newlines. Content of any other shape is skipped instead of
 * throwing, so a transcript-shape change shows up as a failed assertion in
 * the caller rather than a TypeError here.
 *
 * @param transcript Parsed stream-json events.
 * @returns All tool_result text joined by `\n` (empty string when none).
 */
function toolResultText(transcript: any[]): string {
  const chunks: string[] = [];
  for (const event of transcript) {
    if (event?.type !== 'user') continue;
    const items = event.message?.content;
    if (!Array.isArray(items)) continue;
    for (const item of items) {
      if (item?.type !== 'tool_result') continue;
      if (typeof item.content === 'string') {
        chunks.push(item.content);
      } else if (Array.isArray(item.content)) {
        for (const part of item.content) {
          if (part?.type === 'text' && typeof part.text === 'string') chunks.push(part.text);
        }
      }
    }
  }
  return chunks.join('\n');
}

/**
 * Find the first `system` event with subtype `init` in the transcript.
 *
 * @param transcript Parsed stream-json events.
 * @returns The init event, or `undefined` when the transcript has none.
 */
function initEvent(transcript: any[]): any {
  return transcript.find((e) => e?.type === 'system' && e?.subtype === 'init');
}

describeIfSelected('hermetic isolation canaries', ['hermetic-canary', 'hermetic-sentinel'], () => {
  testIfSelected('hermetic-canary', async () => {
    // Auth hermeticity is part of the contract: a missing key must FAIL the
    // gate, not skip it — a skipped canary is a silent hole.
    if (!process.env.ANTHROPIC_API_KEY) {
      throw new Error('hermetic-canary requires ANTHROPIC_API_KEY (source ~/.zshrc); refusing to skip');
    }

    const workDir = fs.mkdtempSync(path.join(os.tmpdir(), 'hermetic-canary-'));

    // Plant contamination deterministically — the operator env may or may not
    // carry these, so set them ourselves and restore after.
    const planted: Record<string, string> = {
      CONDUCTOR_WORKSPACE_PATH: '/tmp/poison-conductor-ws',
      GBRAIN_POISON_PROBE: 'leaked',
    };
    const prev: Record<string, string | undefined> = {};

    try {
      // Planting happens inside the try so the finally block always restores
      // whatever was overwritten and removes workDir.
      for (const [k, v] of Object.entries(planted)) {
        prev[k] = process.env[k];
        process.env[k] = v;
      }

      const result = await runSkillTest({
        prompt:
          'Run exactly this bash command and then stop: ' +
          'echo "CFG=$CLAUDE_CONFIG_DIR"; echo "GH=$GSTACK_HOME"; ' +
          'echo "CW=$CONDUCTOR_WORKSPACE_PATH"; echo "GP=$GBRAIN_POISON_PROBE"',
        workingDirectory: workDir,
        maxTurns: 3,
        allowedTools: ['Bash'],
        timeout: 120_000,
        testName: 'hermetic-canary',
        runId,
        model: CANARY_MODEL,
      });
      logCost('hermetic-canary', result);
      recordE2E(evalCollector, 'hermetic-canary', 'e2e-hermetic', result);
      expect(result.exitReason).toBe('success');

      // Deterministic: assert the Bash tool OUTPUT, not the model's prose.
      const bashOut = toolResultText(result.transcript);
      const cfg = bashOut.match(/CFG=(\S*)/)?.[1] ?? '';
      expect(cfg).toMatch(/gstack-hermetic-.*\/\.claude$/);
      expect(bashOut).toMatch(/GH=\S*gstack-home/);

      // Planted contamination must not reach the child. CLAUDECODE is NOT
      // probed here: the child claude CLI sets CLAUDECODE=1 for its own tool
      // subprocesses (verified empirically — CI behaves identically), so the
      // Bash tool can't observe our scrub of it; the unit test pins that.
      expect(bashOut).toMatch(/(^|\n)CW=\s*($|\n)/); // planted Conductor var scrubbed
      expect(bashOut).toMatch(/(^|\n)GP=\s*($|\n)/); // GBRAIN_* scrubbed

      // Zero MCP servers — semantic, from the init event, not a flag grep.
      const init = initEvent(result.transcript);
      expect(init).toBeTruthy();
      expect(init.mcp_servers ?? []).toHaveLength(0);

      // Auth: nonzero cost = the API key paid (OAuth/keychain reports 0).
      const resultEvent = result.transcript.find((e: any) => e.type === 'result');
      expect(resultEvent?.total_cost_usd).toBeGreaterThan(0);
    } finally {
      for (const [k, v] of Object.entries(prev)) {
        if (v === undefined) delete process.env[k];
        else process.env[k] = v;
      }
      fs.rmSync(workDir, { recursive: true, force: true });
    }
  }, 180_000);

  testIfSelected('hermetic-sentinel', async () => {
    if (!process.env.ANTHROPIC_API_KEY) {
      throw new Error('hermetic-sentinel requires ANTHROPIC_API_KEY (source ~/.zshrc); refusing to skip');
    }

    const workDir = fs.mkdtempSync(path.join(os.tmpdir(), 'hermetic-sentinel-'));
    const poisonRoot = fs.mkdtempSync(path.join(os.tmpdir(), 'hermetic-poison-'));
    const poisonCfg = path.join(poisonRoot, '.claude');
    const prevCfgDir = process.env.CLAUDE_CONFIG_DIR;

    try {
      // Poisoned operator config tree: if the hermetic redirect breaks, the
      // child discovers this dir and both probes below fire. Built inside the
      // try so a failed write still cleans up both temp dirs.
      fs.mkdirSync(poisonCfg, { recursive: true });
      fs.writeFileSync(path.join(poisonCfg, 'CLAUDE.md'), 'POISONED OPERATOR MEMORY — must never load\n');
      fs.writeFileSync(
        path.join(poisonCfg, '.claude.json'),
        JSON.stringify({
          hasCompletedOnboarding: true,
          mcpServers: { 'sentinel-mcp': { command: '/usr/bin/true', args: [] } },
        }),
      );
      process.env.CLAUDE_CONFIG_DIR = poisonCfg;

      const result = await runSkillTest({
        prompt:
          'Run exactly this bash command and then stop: ' +
          'echo "CFG=$CLAUDE_CONFIG_DIR"; ' +
          'if [ -f "$CLAUDE_CONFIG_DIR/CLAUDE.md" ]; then echo "USER_MD=present"; else echo "USER_MD=absent"; fi',
        workingDirectory: workDir,
        maxTurns: 3,
        allowedTools: ['Bash'],
        timeout: 120_000,
        testName: 'hermetic-sentinel',
        runId,
        model: CANARY_MODEL,
      });
      logCost('hermetic-sentinel', result);
      recordE2E(evalCollector, 'hermetic-sentinel', 'e2e-hermetic', result);
      expect(result.exitReason).toBe('success');

      const bashOut = toolResultText(result.transcript);
      const cfg = bashOut.match(/CFG=(\S*)/)?.[1] ?? '';
      // The redirect must beat the poisoned operator value...
      expect(cfg).not.toBe(poisonCfg);
      expect(cfg).toMatch(/gstack-hermetic-.*\/\.claude$/);
      // ...and the active config dir must not carry the poisoned user memory.
      expect(bashOut).toContain('USER_MD=absent');

      // The planted MCP server must be invisible: zero servers in init.
      const init = initEvent(result.transcript);
      expect(init).toBeTruthy();
      const servers = (init.mcp_servers ?? []).map((s: any) => s?.name ?? s);
      expect(servers).toHaveLength(0);
      expect(JSON.stringify(servers)).not.toContain('sentinel-mcp');
    } finally {
      if (prevCfgDir === undefined) delete process.env.CLAUDE_CONFIG_DIR;
      else process.env.CLAUDE_CONFIG_DIR = prevCfgDir;
      fs.rmSync(workDir, { recursive: true, force: true });
      fs.rmSync(poisonRoot, { recursive: true, force: true });
    }
  }, 180_000);
});

afterAll(() => finalizeEvalCollector(evalCollector));