gstack/test/skill-e2e-hermetic-canary.t...

/**
 * Hermetic-isolation canaries (gate tier, ~$0.02 each, deterministic).
 *
 * Two tests that make the hermeticity claim FALSIFIABLE instead of asserted:
 *
 * 1. `hermetic-canary` — env + auth isolation. Plants contamination vars in
 *    the TEST process env, spawns a child through the real runner, and
 *    asserts from the Bash tool_result in the stream-json transcript (never
 *    the model's prose — prose can hallucinate) that the child saw a temp
 *    `/.claude` config dir, a temp GSTACK_HOME, and none of the planted
 *    contamination. Auth hermeticity: hard-fails when ANTHROPIC_API_KEY is
 *    absent (a skip here would be a silent hole), and asserts
 *    total_cost_usd > 0 — subscription/keychain OAuth reports cost 0, so
 *    nonzero cost is the discriminator that the API key actually paid
 *    (verified empirically 2026-06-12; the result record exposes no
 *    auth-source field, so cost is the best available signal — residual
 *    gap documented in the plan).
 *
 * 2. `hermetic-sentinel` — config isolation, the poisoned-operator probe.
 *    Builds a FAKE operator config tree (user CLAUDE.md + an mcpServers
 *    entry) and points the test process's CLAUDE_CONFIG_DIR at it. If the
 *    hermetic redirect ever breaks, the child loads that poisoned tree and
 *    the probes fire: init.mcp_servers would list the planted server
 *    (semantic proof that --strict-mcp-config + the redirect yield ZERO MCP
 *    servers, not an assumption), and the child's config dir would contain
 *    the poisoned CLAUDE.md.
 *
 * Both canaries double as the seed-schema / CLI version-skew tripwire: a
 * claude release that changes first-run behavior or config discovery fails
 * here first, loudly, in the gate tier.
 */

import { expect, afterAll } from 'bun:test';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
import { runSkillTest } from './helpers/session-runner';
import {
  describeIfSelected, testIfSelected, createEvalCollector, finalizeEvalCollector,
  recordE2E, runId, logCost,
} from './helpers/e2e-helpers';

// Collector created once for this file with the 'e2e-hermetic' label. Both
// canaries hand it to recordE2E, and the afterAll hook at the bottom of the
// file passes it to finalizeEvalCollector.
const evalCollector = createEvalCollector('e2e-hermetic');

// Cheap + deterministic: the canaries assert environment facts, not model
// quality, so the smallest model is the right tool.
const CANARY_MODEL = 'claude-haiku-4-5-20251001';

/**
 * Extract concatenated tool_result text from the stream-json transcript.
 *
 * Only events with `type === 'user'` are inspected. For each `tool_result`
 * item in `message.content`, a string `content` is taken verbatim and an
 * array `content` contributes the `text` of each `type === 'text'` part.
 * Chunks are joined with '\n' in transcript order.
 *
 * Anything that does not fit that shape (null entries, a non-array
 * `message.content`, a `tool_result` whose `content` is neither string nor
 * array, parts without a string `text`) is skipped rather than thrown on, so
 * a malformed record surfaces as a failed assertion on the extracted text
 * instead of a TypeError inside this helper.
 *
 * @param transcript Parsed stream-json events.
 * @returns The joined tool output, or '' when no tool_result text exists.
 */
function toolResultText(transcript: any[]): string {
  const chunks: string[] = [];
  for (const event of transcript) {
    if (event?.type !== 'user') continue;
    const items = event.message?.content;
    // `content` can be absent or a non-array value; neither carries tool_result items.
    if (!Array.isArray(items)) continue;
    for (const item of items) {
      if (item?.type !== 'tool_result') continue;
      const payload = item.content;
      if (typeof payload === 'string') {
        chunks.push(payload);
      } else if (Array.isArray(payload)) {
        for (const part of payload) {
          if (part?.type === 'text' && typeof part.text === 'string') chunks.push(part.text);
        }
      }
    }
  }
  return chunks.join('\n');
}

/**
 * Find the init event in a stream-json transcript.
 *
 * @param transcript Parsed stream-json events.
 * @returns The first event with `type === 'system'` and `subtype === 'init'`,
 *   or `undefined` when the transcript contains none.
 */
function initEvent(transcript: any[]): any {
  for (const candidate of transcript) {
    const isSystemInit = candidate.type === 'system' && candidate.subtype === 'init';
    if (isSystemInit) return candidate;
  }
  return undefined;
}

describeIfSelected('hermetic isolation canaries', ['hermetic-canary', 'hermetic-sentinel'], () => {
  /**
   * Auth hermeticity is part of the contract: a missing key must FAIL the
   * gate, not skip it — a skipped canary is a silent hole.
   *
   * @throws Error when ANTHROPIC_API_KEY is unset or empty in the test process.
   */
  const requireApiKey = (testName: string): void => {
    if (!process.env.ANTHROPIC_API_KEY) {
      throw new Error(`${testName} requires ANTHROPIC_API_KEY (source ~/.zshrc); refusing to skip`);
    }
  };

  /**
   * Put back env values captured before a test overrode them. A saved value
   * of `undefined` means the variable was unset, so it is deleted again.
   */
  const restoreEnv = (saved: Record<string, string | undefined>): void => {
    for (const [key, value] of Object.entries(saved)) {
      if (value === undefined) delete process.env[key];
      else process.env[key] = value;
    }
  };

  /** Recursively remove the temp dirs a test created (`force` tolerates missing paths). */
  const removeDirs = (dirs: readonly string[]): void => {
    for (const dir of dirs) fs.rmSync(dir, { recursive: true, force: true });
  };

  // Env + auth isolation: plant contamination in the test process env and
  // assert, from the Bash tool output, that none of it reached the child.
  testIfSelected('hermetic-canary', async () => {
    requireApiKey('hermetic-canary');

    // Plant contamination deterministically — the operator env may or may not
    // carry these, so set them ourselves and restore after.
    const planted: Record<string, string> = {
      CONDUCTOR_WORKSPACE_PATH: '/tmp/poison-conductor-ws',
      GBRAIN_POISON_PROBE: 'leaked',
    };
    const savedEnv: Record<string, string | undefined> = {};
    const tempDirs: string[] = [];

    try {
      // All setup lives inside the try so the finally always undoes whatever
      // part of it completed.
      const workDir = fs.mkdtempSync(path.join(os.tmpdir(), 'hermetic-canary-'));
      tempDirs.push(workDir);
      for (const [key, value] of Object.entries(planted)) {
        savedEnv[key] = process.env[key];
        process.env[key] = value;
      }

      const result = await runSkillTest({
        prompt: 'Run exactly this bash command and then stop: ' +
          'echo "CFG=$CLAUDE_CONFIG_DIR"; echo "GH=$GSTACK_HOME"; ' +
          'echo "CW=$CONDUCTOR_WORKSPACE_PATH"; echo "GP=$GBRAIN_POISON_PROBE"',
        workingDirectory: workDir,
        maxTurns: 3,
        allowedTools: ['Bash'],
        timeout: 120_000,
        testName: 'hermetic-canary',
        runId,
        model: CANARY_MODEL,
      });
      logCost('hermetic-canary', result);
      recordE2E(evalCollector, 'hermetic-canary', 'e2e-hermetic', result);

      expect(result.exitReason).toBe('success');

      // Deterministic: assert the Bash tool OUTPUT, not the model's prose.
      const bashOut = toolResultText(result.transcript);
      const cfg = bashOut.match(/CFG=(\S*)/)?.[1] ?? '';
      expect(cfg).toMatch(/gstack-hermetic-.*\/\.claude$/);
      expect(bashOut).toMatch(/GH=\S*gstack-home/);
      // Planted contamination must not reach the child. CLAUDECODE is NOT
      // probed here: the child claude CLI sets CLAUDECODE=1 for its own tool
      // subprocesses (verified empirically — CI behaves identically), so the
      // Bash tool can't observe our scrub of it; the unit test pins that.
      expect(bashOut).toMatch(/(^|\n)CW=\s*($|\n)/); // planted Conductor var scrubbed
      expect(bashOut).toMatch(/(^|\n)GP=\s*($|\n)/); // GBRAIN_* scrubbed

      // Zero MCP servers — semantic, from the init event, not a flag grep.
      const init = initEvent(result.transcript);
      expect(init).toBeTruthy();
      expect(init.mcp_servers ?? []).toHaveLength(0);

      // Auth: nonzero cost = the API key paid (OAuth/keychain reports 0).
      expect(result.transcript.find((e) => e.type === 'result')?.total_cost_usd).toBeGreaterThan(0);
    } finally {
      // Env first, then disk — same order as setup is undone in.
      restoreEnv(savedEnv);
      removeDirs(tempDirs);
    }
  }, 180_000);

  // Config isolation: point the test process at a poisoned config tree and
  // assert the child neither uses that dir nor lists its MCP server.
  testIfSelected('hermetic-sentinel', async () => {
    requireApiKey('hermetic-sentinel');

    // Captured before anything is touched so the finally can always restore it.
    const savedEnv: Record<string, string | undefined> = {
      CLAUDE_CONFIG_DIR: process.env.CLAUDE_CONFIG_DIR,
    };
    const tempDirs: string[] = [];

    try {
      // Fixture creation is inside the try: if the second mkdtemp or any
      // write fails, the dirs created so far are still removed below.
      const workDir = fs.mkdtempSync(path.join(os.tmpdir(), 'hermetic-sentinel-'));
      tempDirs.push(workDir);
      // Poisoned operator config tree: if the hermetic redirect breaks, the
      // child discovers this dir and both probes below fire.
      const poisonRoot = fs.mkdtempSync(path.join(os.tmpdir(), 'hermetic-poison-'));
      tempDirs.push(poisonRoot);
      const poisonCfg = path.join(poisonRoot, '.claude');
      fs.mkdirSync(poisonCfg, { recursive: true });
      fs.writeFileSync(path.join(poisonCfg, 'CLAUDE.md'), 'POISONED OPERATOR MEMORY — must never load\n');
      fs.writeFileSync(path.join(poisonCfg, '.claude.json'), JSON.stringify({
        hasCompletedOnboarding: true,
        mcpServers: { 'sentinel-mcp': { command: '/usr/bin/true', args: [] } },
      }));
      // Only point the process at the poisoned tree once it is fully built.
      process.env.CLAUDE_CONFIG_DIR = poisonCfg;

      const result = await runSkillTest({
        prompt: 'Run exactly this bash command and then stop: ' +
          'echo "CFG=$CLAUDE_CONFIG_DIR"; ' +
          'if [ -f "$CLAUDE_CONFIG_DIR/CLAUDE.md" ]; then echo "USER_MD=present"; else echo "USER_MD=absent"; fi',
        workingDirectory: workDir,
        maxTurns: 3,
        allowedTools: ['Bash'],
        timeout: 120_000,
        testName: 'hermetic-sentinel',
        runId,
        model: CANARY_MODEL,
      });
      logCost('hermetic-sentinel', result);
      recordE2E(evalCollector, 'hermetic-sentinel', 'e2e-hermetic', result);

      expect(result.exitReason).toBe('success');

      const bashOut = toolResultText(result.transcript);
      const cfg = bashOut.match(/CFG=(\S*)/)?.[1] ?? '';
      // The redirect must beat the poisoned operator value...
      expect(cfg).not.toBe(poisonCfg);
      expect(cfg).toMatch(/gstack-hermetic-.*\/\.claude$/);
      // ...and the active config dir must not carry the poisoned user memory.
      expect(bashOut).toContain('USER_MD=absent');

      // The planted MCP server must be invisible: zero servers in init.
      const init = initEvent(result.transcript);
      expect(init).toBeTruthy();
      // Entries may be objects with a `name` or bare values; normalize to names.
      const servers = (init.mcp_servers ?? []).map((s: any) => s?.name ?? s);
      expect(servers).toHaveLength(0);
      expect(JSON.stringify(servers)).not.toContain('sentinel-mcp');
    } finally {
      restoreEnv(savedEnv);
      removeDirs(tempDirs);
    }
  }, 180_000);
});

// Once every test in this file has run, hand the shared collector to
// finalizeEvalCollector. The arrow returns that call's result to the hook.
// NOTE(review): presumably this flushes/persists the recorded evals — confirm
// in ./helpers/e2e-helpers.
afterAll(() => finalizeEvalCollector(evalCollector));