// gstack/test/skill-e2e-office-hours-brai...

/**
 * E2E: /office-hours brain-writeback path under fake gbrain CLI.
 *
 * The matched-pair check for v1.50.0.0's "brain-aware planning actually
 * works under Claude Code" headline: prove that when a user runs
 * /office-hours with gbrain on PATH, the agent actually calls
 * `gbrain put office-hours/<slug>` with valid frontmatter.
 *
 * Approach:
 *   1. Regenerate office-hours/SKILL.md with --respect-detection against
 *      a temp GSTACK_HOME that has detected:true. Snapshot the rendered
 *      content (which now contains the compressed SAVE_RESULTS block),
 *      then restore the canonical no-gbrain version so the working tree
 *      stays clean.
 *   2. Write the snapshot into a temp workdir's office-hours/SKILL.md.
 *      Also write docs/gbrain-write-surfaces.md so the agent can read the
 *      template on demand (the compact block points to it).
 *   3. Write a fake `gbrain` shell script into workdir/bin/ with robust
 *      argv quoting (printf %q) so heredoc payloads in --content survive
 *      shell-to-shell. The fake logs every invocation + writes payloads
 *      to a per-slug file for inspection.
 *   4. Run /office-hours via runSkillTest with workdir/bin/ first on PATH.
 *      Feed a deterministic founder pitch + auto-decide instructions.
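 *   5. Assert the argv log contains a `gbrain put` call for the `pixel-fund`
 *      slug (normally `office-hours/pixel-fund`) and that the payload file
 *      exists with valid YAML frontmatter. Entity-stub calls are best-effort:
 *      their absence is logged as a warning, not treated as a failure.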
 *
 * Periodic tier (~$0.50-1/run via claude -p, matches nearby
 * setup-gbrain-path4-* tests at touchfiles.ts:496-498).
 *
 * NOT verified by this test (out of scope, owned by docs/gbrain-write-surfaces.md):
 *   - That gbrain itself persists what `gbrain put` is told (gbrain's
 *     own contract)
 *   - That `.gbrain-source` doesn't re-route writes (gbrain's contract)
 *   - Source-targeting (no way to fake source resolution in a stub CLI)
 */

import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
import { execFileSync, spawnSync } from 'child_process';
import {
  chmodSync,
  copyFileSync,
  existsSync,
  mkdirSync,
  mkdtempSync,
  readFileSync,
  readdirSync,
  rmSync,
  writeFileSync,
} from 'fs';
import { tmpdir } from 'os';
import { join } from 'path';

import { runSkillTest } from './helpers/session-runner';
import {
  ROOT,
  runId,
  describeIfSelected,
  testConcurrentIfSelected,
  logCost,
  recordE2E,
  createEvalCollector,
} from './helpers/e2e-helpers';

// Collector passed to recordE2E below; created under the
// 'e2e-office-hours-brain-writeback' name.
// NOTE(review): nothing visible in this file finalizes or flushes the
// collector — confirm that ./helpers/e2e-helpers takes care of that itself.
const evalCollector = createEvalCollector('e2e-office-hours-brain-writeback');

describeIfSelected(
  'Office Hours Brain Writeback E2E',
  ['office-hours-brain-writeback'],
  () => {
    let workDir: string;
    let callsLogPath: string;
    let payloadDir: string;

    // Builds the throwaway fixture repo used by the test below:
    //   - git repo with the founder pitch copied in as pitch.md
    //   - a brain-aware office-hours/SKILL.md snapshot (canonical file restored)
    //   - docs/gbrain-write-surfaces.md for on-demand reading
    //   - bin/gbrain: a fake CLI that logs argv and captures --content payloads
    // Assigns workDir, callsLogPath and payloadDir for the test and afterAll.
    beforeAll(() => {
      workDir = mkdtempSync(join(tmpdir(), 'skill-e2e-brain-writeback-'));

      // Fixture git commands stay best-effort (a failure does not abort the
      // suite), but failures are reported instead of being silently dropped.
      const run = (cmd: string, args: string[]) => {
        const res = spawnSync(cmd, args, {
          cwd: workDir,
          stdio: 'pipe',
          timeout: 5000,
        });
        if (res.error || res.status !== 0) {
          const detail =
            res.error?.message ?? res.stderr?.toString().trim() ?? '';
          console.warn(
            `[brain-writeback fixture] "${cmd} ${args.join(' ')}" failed ` +
              `(status=${res.status}): ${detail}`,
          );
        }
        return res;
      };

      // Single-quote a value for safe embedding in the generated bash script,
      // so temp paths containing $, ", ` or spaces cannot alter the script.
      const shQuote = (value: string): string =>
        "'" + value.replace(/'/g, "'\\''") + "'";

      run('git', ['init', '-b', 'main']);
      run('git', ['config', 'user.email', 'test@test.com']);
      run('git', ['config', 'user.name', 'Test']);

      // Copy the founder pitch fixture into the workdir.
      const briefSrc = join(
        ROOT,
        'test',
        'fixtures',
        'office-hours-brain-writeback',
        'brief.md',
      );
      copyFileSync(briefSrc, join(workDir, 'pitch.md'));

      // Generate a brain-aware office-hours/SKILL.md (with --respect-detection
      // against a temp GSTACK_HOME). Snapshot the content, restore the
      // canonical version, write the snapshot into the workdir.
      // NOTE(review): gen-skill-docs.ts is invoked without a skill filter; if
      // it rewrites other generated files too, only office-hours/SKILL.md is
      // restored here — confirm against the script.
      const skillPath = join(ROOT, 'office-hours', 'SKILL.md');
      // Read the canonical file BEFORE creating the temp home so a read
      // failure cannot leak the temp directory.
      const originalSkill = readFileSync(skillPath, 'utf-8');
      const tmpHome = mkdtempSync(join(tmpdir(), 'gbrain-detect-home-'));
      try {
        writeFileSync(
          join(tmpHome, 'gbrain-detection.json'),
          JSON.stringify({
            gbrain_local_status: 'ok',
            gbrain_on_path: true,
            gbrain_version: 'test-0.41.0',
          }),
        );
        execFileSync(
          'bun',
          [
            'run',
            'scripts/gen-skill-docs.ts',
            '--host',
            'claude',
            '--respect-detection',
          ],
          {
            cwd: ROOT,
            env: { ...process.env, GSTACK_HOME: tmpHome },
            stdio: ['ignore', 'pipe', 'pipe'],
            timeout: 60_000,
          },
        );
        const brainAwareSkill = readFileSync(skillPath, 'utf-8');
        if (!brainAwareSkill.includes('gbrain put "office-hours/')) {
          throw new Error(
            'Regenerated office-hours/SKILL.md does not contain gbrain put block. ' +
              'Detection override may be broken — see test/gbrain-detection-override.test.ts.',
          );
        }
        mkdirSync(join(workDir, 'office-hours'), { recursive: true });
        writeFileSync(join(workDir, 'office-hours', 'SKILL.md'), brainAwareSkill);
      } finally {
        // Always restore the canonical SKILL.md so the working tree stays
        // clean, and remove the temp home even if the restore itself throws.
        try {
          writeFileSync(skillPath, originalSkill);
        } finally {
          rmSync(tmpHome, { recursive: true, force: true });
        }
      }

      // Copy docs/gbrain-write-surfaces.md so the compact resolver block's
      // on-demand reference resolves (the agent may read it for the full
      // template; we don't require this read but make it available).
      const docsSrc = join(ROOT, 'docs', 'gbrain-write-surfaces.md');
      const docsDst = join(workDir, 'docs', 'gbrain-write-surfaces.md');
      mkdirSync(join(workDir, 'docs'), { recursive: true });
      copyFileSync(docsSrc, docsDst);

      // Set up the fake gbrain CLI with robust argv quoting + payload capture.
      callsLogPath = join(workDir, 'gbrain-calls.log');
      payloadDir = join(workDir, 'gbrain-payloads');
      mkdirSync(payloadDir, { recursive: true });
      const binDir = join(workDir, 'bin');
      mkdirSync(binDir, { recursive: true });
      // The argument scan loops on "$#" rather than on "$1" being non-empty,
      // so an empty-string argument before --content does not end the scan.
      const fakeGbrain = `#!/bin/bash
# Fake gbrain CLI for E2E test. Logs every invocation with shell-safe quoting
# (printf %q) so --content "$(cat <<'EOF' ... EOF)" payloads survive intact.
{ printf 'gbrain'; for a in "$@"; do printf ' %q' "$a"; done; printf '\\n'; } \\
  >> ${shQuote(callsLogPath)}
case "$1" in
  --version) echo "gbrain test-0.41.0"; exit 0 ;;
  search) echo "[]"; exit 0 ;;
  get_page) echo ""; exit 0 ;;
  put)
    SLUG="$2"
    shift 2
    while [ "$#" -gt 0 ]; do
      if [ "$1" = "--content" ]; then
        PAYLOAD_DIR=${shQuote(payloadDir)}
        mkdir -p "$PAYLOAD_DIR/$(dirname "$SLUG")"
        printf '%s' "$2" > "$PAYLOAD_DIR/$SLUG.md"
        break
      fi
      shift
    done
    exit 0
    ;;
esac
exit 0
`;
      const fakePath = join(binDir, 'gbrain');
      writeFileSync(fakePath, fakeGbrain);
      chmodSync(fakePath, 0o755);

      run('git', ['add', '.']);
      run('git', ['commit', '-m', 'fixture']);
    });

    // Remove the fixture workdir once the suite finishes.
    afterAll(() => {
      const removal = { force: true, recursive: true };
      try {
        rmSync(workDir, removal);
      } catch {
        // Cleanup is best effort; a leftover temp directory must not fail the run.
      }
    });

    // Runs /office-hours against the fixture workdir with the fake gbrain
    // first on PATH, then verifies from the fake's argv log and captured
    // payload that the agent saved the design doc via `gbrain put`.
    testConcurrentIfSelected(
      'office-hours-brain-writeback',
      async () => {
        const result = await runSkillTest({
          prompt: `Read office-hours/SKILL.md for the workflow.

Read pitch.md — that's a founder pitch coming to office hours. Select Startup Mode. Skip any AskUserQuestion — this is non-interactive; auto-decide the recommended option for any question.

For the diagnostic, assume the founder confirmed Q1 (strongest evidence = "230 from a single tweet + 51 paying creators in 6 weeks"), Q2 (status quo = "creators write ad-hoc checks or use opaque Patreon-style platforms"), and Q3 (forcing question already asked).

Generate the design doc per Phase 5. The feature-slug value to substitute into the SAVE_RESULTS template's \`<feature-slug>\` placeholder is exactly 'pixel-fund' (no path prefix — the template already provides the prefix). The \`gbrain\` binary is on PATH at ${workDir}/bin/gbrain. Apply the SAVE_RESULTS template literally: the slug should land at \`<prefix>/pixel-fund\` per the resolver shape, with the actual design doc markdown body in the --content payload. Then enrich entity stubs for any named people or companies mentioned in the pitch.

This is a test of the brain-writeback path. Do NOT skip the gbrain save step under any circumstance — the runtime guard ("skip if gbrain not on PATH") does NOT apply here because gbrain IS available. Do NOT explore gbrain --help; follow the SAVE_RESULTS template's exact CLI shape. If you encounter any AskUserQuestion, auto-decide recommended.`,
          workingDirectory: workDir,
          maxTurns: 12,
          timeout: 360_000,
          testName: 'office-hours-brain-writeback',
          runId,
          model: 'claude-sonnet-4-6',
          extraEnv: {
            PATH: `${join(workDir, 'bin')}:${process.env.PATH || ''}`,
          },
        });

        logCost('/office-hours (BRAIN WRITEBACK)', result);

        // The eval record is written in `finally`, after every assertion has
        // run, so `passed` reflects the writeback checks and not just the
        // session exit reason.
        let passed = false;
        try {
          expect(['success', 'error_max_turns']).toContain(result.exitReason);

          // The headline assertion: agent actually called gbrain put on the
          // expected slug.
          if (!existsSync(callsLogPath)) {
            throw new Error(
              `No gbrain calls log at ${callsLogPath}. ` +
                `Agent likely did NOT invoke gbrain at all. ` +
                `Check that office-hours/SKILL.md in the workdir contains the gbrain put block.`,
            );
          }
          const callsLog = readFileSync(callsLogPath, 'utf-8');
          console.log('--- gbrain calls log ---');
          console.log(callsLog);
          console.log('--- end calls log ---');

          expect(callsLog).toContain('gbrain put');
          // Agent obedience: the SLUG argument (the whitespace-free token
          // right after `put`) must contain 'pixel-fund', preferably under the
          // office-hours/ prefix. `\S*` keeps the match inside the slug so a
          // mention of pixel-fund inside some other call's --content cannot
          // satisfy it, and the lookahead rejects entity stubs. The strict
          // slug SHAPE (office-hours/<slug>) is already pinned by the resolver
          // unit test (test/resolvers-gbrain-save-results.test.ts).
          expect(callsLog).toMatch(
            /gbrain put (?!entit(?:y|ies)\/)\S*pixel-fund/,
          );

          // Payload file exists. Agent may write to office-hours/pixel-fund.md
          // (resolver-faithful) OR another slug containing 'pixel-fund' (e.g.
          // dropped prefix, or pixel-fund/<name>); both are acceptable here
          // because the YAML frontmatter is the real contract test. The
          // resolver-faithful path is preferred; otherwise search the payload
          // tree for a *.md file with 'pixel-fund' anywhere in its relative
          // path, skipping entity-stub directories so a stub named after the
          // company is never mistaken for the design doc.
          const isEntityDir = (name: string): boolean =>
            /^entit(?:y|ies)$/.test(name);
          const findPayload = (
            dir: string,
            underSlugDir = false,
          ): string | null => {
            if (!existsSync(dir)) return null;
            for (const entry of readdirSync(dir, { withFileTypes: true })) {
              const full = join(dir, entry.name);
              const inSlugPath =
                underSlugDir || entry.name.includes('pixel-fund');
              if (entry.isDirectory()) {
                if (isEntityDir(entry.name)) continue;
                const nested = findPayload(full, inSlugPath);
                if (nested) return nested;
              } else if (inSlugPath && entry.name.endsWith('.md')) {
                return full;
              }
            }
            return null;
          };
          const canonicalPayload = join(
            payloadDir,
            'office-hours',
            'pixel-fund.md',
          );
          const payloadPath = existsSync(canonicalPayload)
            ? canonicalPayload
            : findPayload(payloadDir);
          if (!payloadPath) {
            throw new Error(
              `Agent called gbrain put but no payload file with 'pixel-fund' ` +
                `in name was written to ${payloadDir}. Check the fake gbrain ` +
                `--content parser for argv quoting issues.`,
            );
          }
          const payload = readFileSync(payloadPath, 'utf-8');
          expect(payload).toMatch(/^---\s*\n/);
          expect(payload).toContain('title:');
          expect(payload).toContain('tags:');
          expect(payload.length).toBeGreaterThan(200);

          // Entity stubs: agents are inconsistent about whether they use
          // 'entities/<name>' (resolver doc) or 'entity/<name>' (singular).
          // Either is accepted, and the absence of any stub call is only
          // reported — entity extraction is best-effort, not a failure.
          const entityCallMatches =
            callsLog.match(/gbrain put entit(?:y|ies)\//g) || [];
          if (entityCallMatches.length === 0) {
            console.warn(
              'No entity stub calls in gbrain calls log. Resolver instructs ' +
                'entity extraction but it is best-effort.',
            );
          } else {
            console.log(
              `Entity stub calls observed: ${entityCallMatches.length}`,
            );
          }

          passed = true;
        } finally {
          recordE2E(
            evalCollector,
            '/office-hours-brain-writeback',
            'Office Hours Brain Writeback E2E',
            result,
            { passed },
          );
        }
      },
      420_000,
    );
  },
);