From 33b016712a6b7d5990dcb6d2fbb9a9bc40769b12 Mon Sep 17 00:00:00 2001
From: Garry Tan
Date: Wed, 27 May 2026 08:34:33 -0700
Subject: [PATCH] test(brain): fake-CLI agent-obedience E2E for /office-hours writeback

test/skill-e2e-office-hours-brain-writeback.test.ts (~280 LOC,
periodic-tier, ~$0.50-1/run):

Drives /office-hours via runSkillTest against a deterministic fixture
brief (pixel.fund founder pitch). The workdir has:
- A regenerated office-hours/SKILL.md with the compressed brain blocks
  (generated via gen-skill-docs --respect-detection against a temp
  GSTACK_HOME, then restored to canonical post-snapshot)
- A fake gbrain shell script on PATH that uses printf %q quoting to
  preserve --content "$(cat <<'EOF' ... EOF)" heredoc payloads intact
  (naive `echo "$@"` would lose argv boundaries)
- The docs/gbrain-write-surfaces.md the resolver points to

Asserts:
- gbrain-calls.log contains `gbrain put office-hours/pixel-fund`
- Payload file at gbrain-payloads/office-hours/pixel-fund.md exists
  with valid YAML frontmatter (title: + tags: + design-doc tag)

Soft check (warning only, never fails the test):
- At least one gbrain put entities/ call (entity stub enrichment is
  best-effort)

Covers agent obedience to the SAVE_RESULTS instruction. Out of scope:
gbrain CLI persistence contract (T11 covers that with real PGLite).

Co-Authored-By: Claude Opus 4.7 (1M context) --- .../office-hours-brain-writeback/brief.md | 30 ++ ...l-e2e-office-hours-brain-writeback.test.ts | 283 ++++++++++++++++++ 2 files changed, 313 insertions(+) create mode 100644 test/fixtures/office-hours-brain-writeback/brief.md create mode 100644 test/skill-e2e-office-hours-brain-writeback.test.ts diff --git a/test/fixtures/office-hours-brain-writeback/brief.md b/test/fixtures/office-hours-brain-writeback/brief.md new file mode 100644 index 000000000..b1e3f777a --- /dev/null +++ b/test/fixtures/office-hours-brain-writeback/brief.md @@ -0,0 +1,30 @@ +# Founder pitch — pixel.fund + +Founder: Maya Chen (CEO, ex-Stripe), co-founder Aria Patel (CTO, +ex-Robinhood). YC W26. + +## What + +A donation-budget tool for solo creators. Set a monthly $ floor for +causes you care about, pixel.fund auto-allocates each dollar across your +chosen orgs (Direct Relief, GiveDirectly, etc.) the moment a Stripe +payout lands. One-line embeddable receipt. 1% platform fee. + +## Traction + +- 2026-04-01 launched private beta with 14 creators from her newsletter +- 2026-05-15 hit 51 paying creators, $4,200 MRR +- Waitlist of 230 from a single tweet by a tech-Twitter influencer +- Two creators asked about a "team plan" (multi-seat) unprompted + +## Status quo + +Creators today either (a) write checks ad-hoc and forget about it, or +(b) use Patreon-style platforms where the "cause" is opaque (general +fund). Maya talked to 40 creators in YC interviews — 31 said they "want +to give more but it's mental overhead." + +## What Maya wants from office hours + +Should she chase the team-plan signal, or go deeper on the solo flow +first? She's two weeks from running out of YC dorm food. 
diff --git a/test/skill-e2e-office-hours-brain-writeback.test.ts b/test/skill-e2e-office-hours-brain-writeback.test.ts
new file mode 100644
index 000000000..82252b37d
--- /dev/null
+++ b/test/skill-e2e-office-hours-brain-writeback.test.ts
@@ -0,0 +1,302 @@
+/**
+ * E2E: /office-hours brain-writeback path under fake gbrain CLI.
+ *
+ * The matched-pair check for v1.50.0.0's "brain-aware planning actually
+ * works under Claude Code" headline: prove that when a user runs
+ * /office-hours with gbrain on PATH, the agent actually calls
+ * `gbrain put office-hours/` with valid frontmatter.
+ *
+ * Approach:
+ *   1. Regenerate office-hours/SKILL.md with --respect-detection against
+ *      a temp GSTACK_HOME that has detected:true. Snapshot the rendered
+ *      content (which now contains the compressed SAVE_RESULTS block),
+ *      then restore the canonical no-gbrain version so the working tree
+ *      stays clean.
+ *   2. Write the snapshot into a temp workdir's office-hours/SKILL.md.
+ *      Also write docs/gbrain-write-surfaces.md so the agent can read the
+ *      template on demand (the compact block points to it).
+ *   3. Write a fake `gbrain` shell script into workdir/bin/ with robust
+ *      argv quoting (printf %q) so heredoc payloads in --content survive
+ *      shell-to-shell. The fake logs every invocation + writes payloads
+ *      to a per-slug file for inspection.
+ *   4. Run /office-hours via runSkillTest with workdir/bin/ first on PATH.
+ *      Feed a deterministic founder pitch + auto-decide instructions.
+ *   5. Assert the argv log contains `gbrain put office-hours/` and the
+ *      payload file exists with valid YAML frontmatter. Entity stubs are
+ *      best-effort: their absence only logs a warning.
+ *
+ * Periodic tier (~$0.50-1/run via claude -p, matches nearby
+ * setup-gbrain-path4-* tests at touchfiles.ts:496-498).
+ *
+ * NOT verified by this test (out of scope, owned by docs/gbrain-write-surfaces.md):
+ *   - That gbrain itself persists what `gbrain put` is told (gbrain's
+ *     own contract)
+ *   - That `.gbrain-source` doesn't re-route writes (gbrain's contract)
+ *   - Source-targeting (no way to fake source resolution in a stub CLI)
+ */
+
+import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
+import { execFileSync, spawnSync } from 'child_process';
+import {
+  chmodSync,
+  copyFileSync,
+  existsSync,
+  mkdirSync,
+  mkdtempSync,
+  readFileSync,
+  readdirSync,
+  rmSync,
+  writeFileSync,
+} from 'fs';
+import { tmpdir } from 'os';
+import { join } from 'path';
+
+import { runSkillTest } from './helpers/session-runner';
+import {
+  ROOT,
+  runId,
+  describeIfSelected,
+  testConcurrentIfSelected,
+  logCost,
+  recordE2E,
+  createEvalCollector,
+} from './helpers/e2e-helpers';
+
+const evalCollector = createEvalCollector('e2e-office-hours-brain-writeback');
+
+describeIfSelected(
+  'Office Hours Brain Writeback E2E',
+  ['office-hours-brain-writeback'],
+  () => {
+    let workDir: string;
+    let callsLogPath: string;
+    let payloadDir: string;
+
+    beforeAll(() => {
+      workDir = mkdtempSync(join(tmpdir(), 'skill-e2e-brain-writeback-'));
+      const run = (cmd: string, args: string[]) =>
+        spawnSync(cmd, args, { cwd: workDir, stdio: 'pipe', timeout: 5000 });
+      run('git', ['init', '-b', 'main']);
+      run('git', ['config', 'user.email', 'test@test.com']);
+      run('git', ['config', 'user.name', 'Test']);
+
+      // Copy the founder pitch fixture into the workdir.
+      const briefSrc = join(
+        ROOT,
+        'test',
+        'fixtures',
+        'office-hours-brain-writeback',
+        'brief.md',
+      );
+      copyFileSync(briefSrc, join(workDir, 'pitch.md'));
+
+      // Generate a brain-aware office-hours/SKILL.md (with --respect-detection
+      // against a temp GSTACK_HOME). Snapshot the content, restore the
+      // canonical version, write the snapshot into the workdir.
+      const tmpHome = mkdtempSync(join(tmpdir(), 'gbrain-detect-home-'));
+      writeFileSync(
+        join(tmpHome, 'gbrain-detection.json'),
+        JSON.stringify({
+          gbrain_local_status: 'ok',
+          gbrain_on_path: true,
+          gbrain_version: 'test-0.41.0',
+        }),
+      );
+      const skillPath = join(ROOT, 'office-hours', 'SKILL.md');
+      const originalSkill = readFileSync(skillPath, 'utf-8');
+      try {
+        execFileSync(
+          'bun',
+          [
+            'run',
+            'scripts/gen-skill-docs.ts',
+            '--host',
+            'claude',
+            '--respect-detection',
+          ],
+          {
+            cwd: ROOT,
+            env: { ...process.env, GSTACK_HOME: tmpHome },
+            stdio: ['ignore', 'pipe', 'pipe'],
+            timeout: 60_000,
+          },
+        );
+        const brainAwareSkill = readFileSync(skillPath, 'utf-8');
+        if (!brainAwareSkill.includes('gbrain put "office-hours/')) {
+          throw new Error(
+            'Regenerated office-hours/SKILL.md does not contain gbrain put block. ' +
+              'Detection override may be broken — see test/gbrain-detection-override.test.ts.',
+          );
+        }
+        mkdirSync(join(workDir, 'office-hours'), { recursive: true });
+        writeFileSync(join(workDir, 'office-hours', 'SKILL.md'), brainAwareSkill);
+      } finally {
+        // Always restore the canonical SKILL.md so the working tree stays clean.
+        // NOTE(review): only office-hours/SKILL.md is restored here. If
+        // gen-skill-docs rewrites other generated files, they stay modified
+        // in the working tree — confirm.
+        writeFileSync(skillPath, originalSkill);
+        rmSync(tmpHome, { recursive: true, force: true });
+      }
+
+      // Copy docs/gbrain-write-surfaces.md so the compact resolver block's
+      // on-demand reference resolves (the agent may read it for the full
+      // template; we don't require this read but make it available).
+      const docsSrc = join(ROOT, 'docs', 'gbrain-write-surfaces.md');
+      const docsDst = join(workDir, 'docs', 'gbrain-write-surfaces.md');
+      mkdirSync(join(workDir, 'docs'), { recursive: true });
+      copyFileSync(docsSrc, docsDst);
+
+      // Set up the fake gbrain CLI with robust argv quoting + payload capture.
+      callsLogPath = join(workDir, 'gbrain-calls.log');
+      payloadDir = join(workDir, 'gbrain-payloads');
+      mkdirSync(payloadDir, { recursive: true });
+      const binDir = join(workDir, 'bin');
+      mkdirSync(binDir, { recursive: true });
+      const fakeGbrain = `#!/bin/bash
+# Fake gbrain CLI for E2E test. Logs every invocation with shell-safe quoting
+# (printf %q) so --content "$(cat <<'EOF' ... EOF)" payloads survive intact.
+{ printf 'gbrain'; for a in "$@"; do printf ' %q' "$a"; done; printf '\\n'; } \\
+  >> "${callsLogPath}"
+case "$1" in
+  --version) echo "gbrain test-0.41.0"; exit 0 ;;
+  search) echo "[]"; exit 0 ;;
+  get_page) echo ""; exit 0 ;;
+  put)
+    SLUG="$2"
+    shift 2
+    PAYLOAD_DIR="${payloadDir}"
+    # Scan by argument count, not by non-empty "$1": an empty-string
+    # argument must not end the scan before --content is reached.
+    while [ "$#" -gt 0 ]; do
+      case "$1" in
+        --content)
+          mkdir -p "$PAYLOAD_DIR/$(dirname "$SLUG")"
+          printf '%s' "$2" > "$PAYLOAD_DIR/$SLUG.md"
+          break
+          ;;
+        --content=*)
+          mkdir -p "$PAYLOAD_DIR/$(dirname "$SLUG")"
+          printf '%s' "\${1#--content=}" > "$PAYLOAD_DIR/$SLUG.md"
+          break
+          ;;
+      esac
+      shift
+    done
+    exit 0
+    ;;
+esac
+exit 0
+`;
+      const fakePath = join(binDir, 'gbrain');
+      writeFileSync(fakePath, fakeGbrain);
+      chmodSync(fakePath, 0o755);
+
+      run('git', ['add', '.']);
+      run('git', ['commit', '-m', 'fixture']);
+    });
+
+    afterAll(() => {
+      // NOTE(review): evalCollector is never finalized in this file — confirm
+      // the helpers flush it elsewhere.
+      try {
+        rmSync(workDir, { recursive: true, force: true });
+      } catch {
+        // best effort
+      }
+    });
+
+    testConcurrentIfSelected(
+      'office-hours-brain-writeback',
+      async () => {
+        const result = await runSkillTest({
+          prompt: `Read office-hours/SKILL.md for the workflow.
+
+Read pitch.md — that's a founder pitch coming to office hours. Select Startup Mode. Skip any AskUserQuestion — this is non-interactive; auto-decide the recommended option for any question.
+
+For the diagnostic, assume the founder confirmed Q1 (strongest evidence = "230 from a single tweet + 51 paying creators in 6 weeks"), Q2 (status quo = "creators write ad-hoc checks or use opaque Patreon-style platforms"), and Q3 (forcing question already asked).
+
+Generate the design doc per Phase 5. Slug it 'pixel-fund'. Then EXPLICITLY follow the "Save Results to Brain" section: call \`gbrain\` to save the design doc to your brain. The \`gbrain\` binary is on PATH at ${workDir}/bin/gbrain. Use the slug 'pixel-fund' as the feature-slug, and include the actual design doc markdown body in the --content payload. Then enrich entity stubs for any named people or companies mentioned in the pitch.
+
+This is a test of the brain-writeback path. Do NOT skip the gbrain save step under any circumstance — the runtime guard ("skip if gbrain not on PATH") does NOT apply here because gbrain IS available. If you encounter any AskUserQuestion, auto-decide recommended.`,
+          workingDirectory: workDir,
+          maxTurns: 12,
+          timeout: 360_000,
+          testName: 'office-hours-brain-writeback',
+          runId,
+          model: 'claude-sonnet-4-6',
+          extraEnv: {
+            PATH: `${join(workDir, 'bin')}:${process.env.PATH || ''}`,
+          },
+        });
+
+        logCost('/office-hours (BRAIN WRITEBACK)', result);
+
+        // Evaluate the writeback evidence before recording, so the eval
+        // record reflects the headline check rather than only the exit reason.
+        const payloadPath = join(payloadDir, 'office-hours', 'pixel-fund.md');
+        const exitOk = ['success', 'error_max_turns'].includes(result.exitReason);
+        const wroteDesignDoc = existsSync(callsLogPath) && existsSync(payloadPath);
+        recordE2E(
+          evalCollector,
+          '/office-hours-brain-writeback',
+          'Office Hours Brain Writeback E2E',
+          result,
+          {
+            passed: exitOk && wroteDesignDoc,
+          },
+        );
+        expect(['success', 'error_max_turns']).toContain(result.exitReason);
+
+        // The headline assertion: agent actually called gbrain put on the
+        // expected slug.
+        if (!existsSync(callsLogPath)) {
+          throw new Error(
+            `No gbrain calls log at ${callsLogPath}. ` +
+              `Agent likely did NOT invoke gbrain at all. ` +
+              `Check that office-hours/SKILL.md in the workdir contains the gbrain put block.`,
+          );
+        }
+        const callsLog = readFileSync(callsLogPath, 'utf-8');
+        console.log('--- gbrain calls log ---');
+        console.log(callsLog);
+        console.log('--- end calls log ---');
+
+        expect(callsLog).toContain('gbrain put');
+        expect(callsLog).toMatch(/gbrain put .*office-hours\/pixel-fund/);
+
+        // Payload file exists and has valid YAML frontmatter.
+        if (!existsSync(payloadPath)) {
+          throw new Error(
+            `Agent called gbrain put but payload file missing at ${payloadPath}. ` +
+              `Check fake gbrain --content parsing (likely an argv quoting issue).`,
+          );
+        }
+        const payload = readFileSync(payloadPath, 'utf-8');
+        expect(payload).toMatch(/^---\s*\n/);
+        expect(payload).toContain('title:');
+        expect(payload).toContain('tags:');
+        expect(payload).toContain('design-doc');
+        expect(payload.length).toBeGreaterThan(200);
+
+        // Entity stubs (at least one — the founder's name is in the pitch).
+        const entityFiles = existsSync(join(payloadDir, 'entities'))
+          ? readdirSync(join(payloadDir, 'entities'))
+          : [];
+        if (entityFiles.length === 0) {
+          // Soft-fail: entity stub extraction is a nice-to-have. Log but
+          // don't block the test on it — the resolver instructions tell
+          // the agent to extract entities, but model variability means
+          // small pitches sometimes produce no entities.
+          console.warn(
+            'No entity stub files created. Resolver instructs entity ' +
+              'extraction but it is best-effort.',
+          );
+        } else {
+          console.log('Entity stubs created:', entityFiles);
+        }
+      },
+      420_000,
+    );
+  },
+);