mirror of https://github.com/garrytan/gstack.git
284 lines
11 KiB
TypeScript
284 lines
11 KiB
TypeScript
/**
|
|
* E2E: /office-hours brain-writeback path under fake gbrain CLI.
|
|
*
|
|
* The matched-pair check for v1.50.0.0's "brain-aware planning actually
|
|
* works under Claude Code" headline: prove that when a user runs
|
|
* /office-hours with gbrain on PATH, the agent actually calls
|
|
* `gbrain put office-hours/<slug>` with valid frontmatter.
|
|
*
|
|
* Approach:
|
|
* 1. Regenerate office-hours/SKILL.md with --respect-detection against
|
|
* a temp GSTACK_HOME that has detected:true. Snapshot the rendered
|
|
* content (which now contains the compressed SAVE_RESULTS block),
|
|
* then restore the canonical no-gbrain version so the working tree
|
|
* stays clean.
|
|
* 2. Write the snapshot into a temp workdir's office-hours/SKILL.md.
|
|
* Also write docs/gbrain-write-surfaces.md so the agent can read the
|
|
* template on demand (the compact block points to it).
|
|
* 3. Write a fake `gbrain` shell script into workdir/bin/ with robust
|
|
* argv quoting (printf %q) so heredoc payloads in --content survive
|
|
* shell-to-shell. The fake logs every invocation + writes payloads
|
|
* to a per-slug file for inspection.
|
|
* 4. Run /office-hours via runSkillTest with workdir/bin/ first on PATH.
|
|
* Feed a deterministic founder pitch + auto-decide instructions.
|
|
* 5. Assert the argv log contains `gbrain put office-hours/<slug>`, the
|
|
* payload file exists with valid YAML frontmatter. Entity stubs are
* checked best-effort only: a warning is logged if none were created.
|
|
*
|
|
* Periodic tier (~$0.50-1/run via claude -p, matches nearby
|
|
* setup-gbrain-path4-* tests at touchfiles.ts:496-498).
|
|
*
|
|
* NOT verified by this test (out of scope, owned by docs/gbrain-write-surfaces.md):
|
|
* - That gbrain itself persists what `gbrain put` is told (gbrain's
|
|
* own contract)
|
|
* - That `.gbrain-source` doesn't re-route writes (gbrain's contract)
|
|
* - Source-targeting (no way to fake source resolution in a stub CLI)
|
|
*/
|
|
|
|
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
|
|
import { execFileSync, spawnSync } from 'child_process';
|
|
import {
|
|
chmodSync,
|
|
copyFileSync,
|
|
existsSync,
|
|
mkdirSync,
|
|
mkdtempSync,
|
|
readFileSync,
|
|
readdirSync,
|
|
rmSync,
|
|
writeFileSync,
|
|
} from 'fs';
|
|
import { tmpdir } from 'os';
|
|
import { join } from 'path';
|
|
|
|
import { runSkillTest } from './helpers/session-runner';
|
|
import {
|
|
ROOT,
|
|
runId,
|
|
describeIfSelected,
|
|
testConcurrentIfSelected,
|
|
logCost,
|
|
recordE2E,
|
|
createEvalCollector,
|
|
} from './helpers/e2e-helpers';
|
|
|
|
// Collector for this suite's E2E results; it is handed to recordE2E() inside
// the test body further down.
// NOTE(review): nothing in this file flushes or finalizes the collector —
// confirm whether ./helpers/e2e-helpers takes care of that automatically.
const evalCollector = createEvalCollector('e2e-office-hours-brain-writeback');
|
|
|
|
describeIfSelected(
  'Office Hours Brain Writeback E2E',
  ['office-hours-brain-writeback'],
  () => {
    // Temp git repo the agent runs in; created in beforeAll, removed in afterAll.
    let workDir: string;
    // File the fake gbrain appends one shell-quoted line to per invocation.
    let callsLogPath: string;
    // Directory where the fake gbrain writes each `put --content` payload as <slug>.md.
    let payloadDir: string;

    /**
     * Builds the fixture workdir:
     *   1. git-initialised temp dir containing the founder pitch (pitch.md),
     *   2. a brain-aware office-hours/SKILL.md rendered with --respect-detection,
     *   3. docs/gbrain-write-surfaces.md,
     *   4. a fake `gbrain` executable in bin/ that logs argv and captures payloads.
     */
    beforeAll(() => {
      workDir = mkdtempSync(join(tmpdir(), 'skill-e2e-brain-writeback-'));
      // Results of these git calls are not inspected; a failure is not reported here.
      const run = (cmd: string, args: string[]) =>
        spawnSync(cmd, args, { cwd: workDir, stdio: 'pipe', timeout: 5000 });
      run('git', ['init', '-b', 'main']);
      run('git', ['config', 'user.email', 'test@test.com']);
      run('git', ['config', 'user.name', 'Test']);

      // Copy the founder pitch fixture into the workdir.
      const briefSrc = join(
        ROOT,
        'test',
        'fixtures',
        'office-hours-brain-writeback',
        'brief.md',
      );
      copyFileSync(briefSrc, join(workDir, 'pitch.md'));

      // Generate a brain-aware office-hours/SKILL.md (with --respect-detection
      // against a temp GSTACK_HOME). Snapshot the content, restore the
      // canonical version, write the snapshot into the workdir.
      //
      // The canonical file is read BEFORE any temp state is created so that a
      // read failure leaves nothing behind to clean up.
      const skillPath = join(ROOT, 'office-hours', 'SKILL.md');
      const originalSkill = readFileSync(skillPath, 'utf-8');
      const tmpHome = mkdtempSync(join(tmpdir(), 'gbrain-detect-home-'));
      try {
        // Detection file that makes the generator treat gbrain as present.
        writeFileSync(
          join(tmpHome, 'gbrain-detection.json'),
          JSON.stringify({
            gbrain_local_status: 'ok',
            gbrain_on_path: true,
            gbrain_version: 'test-0.41.0',
          }),
        );
        // NOTE(review): the generator is invoked without a per-skill filter. If it
        // rewrites generated files other than office-hours/SKILL.md, those are
        // not restored below — confirm against scripts/gen-skill-docs.ts.
        execFileSync(
          'bun',
          [
            'run',
            'scripts/gen-skill-docs.ts',
            '--host',
            'claude',
            '--respect-detection',
          ],
          {
            cwd: ROOT,
            env: { ...process.env, GSTACK_HOME: tmpHome },
            stdio: ['ignore', 'pipe', 'pipe'],
            timeout: 60_000,
          },
        );
        const brainAwareSkill = readFileSync(skillPath, 'utf-8');
        if (!brainAwareSkill.includes('gbrain put "office-hours/')) {
          throw new Error(
            'Regenerated office-hours/SKILL.md does not contain gbrain put block. ' +
              'Detection override may be broken — see test/gbrain-detection-override.test.ts.',
          );
        }
        mkdirSync(join(workDir, 'office-hours'), { recursive: true });
        writeFileSync(join(workDir, 'office-hours', 'SKILL.md'), brainAwareSkill);
      } finally {
        // Always restore the canonical SKILL.md so the working tree stays clean,
        // and remove the temp home even if the restore itself throws.
        try {
          writeFileSync(skillPath, originalSkill);
        } finally {
          rmSync(tmpHome, { recursive: true, force: true });
        }
      }

      // Copy docs/gbrain-write-surfaces.md so the compact resolver block's
      // on-demand reference resolves (the agent may read it for the full
      // template; we don't require this read but make it available).
      const docsSrc = join(ROOT, 'docs', 'gbrain-write-surfaces.md');
      const docsDst = join(workDir, 'docs', 'gbrain-write-surfaces.md');
      mkdirSync(join(workDir, 'docs'), { recursive: true });
      copyFileSync(docsSrc, docsDst);

      // Set up the fake gbrain CLI with robust argv quoting + payload capture.
      callsLogPath = join(workDir, 'gbrain-calls.log');
      payloadDir = join(workDir, 'gbrain-payloads');
      mkdirSync(payloadDir, { recursive: true });
      const binDir = join(workDir, 'bin');
      mkdirSync(binDir, { recursive: true });
      // The `put` loop iterates on the argument COUNT ($#) rather than on "$1"
      // being non-empty, so an empty-string argument (e.g. --title "") does not
      // end the scan before --content is reached.
      const fakeGbrain = `#!/bin/bash
# Fake gbrain CLI for E2E test. Logs every invocation with shell-safe quoting
# (printf %q) so --content "$(cat <<'EOF' ... EOF)" payloads survive intact.
{ printf 'gbrain'; for a in "$@"; do printf ' %q' "$a"; done; printf '\\n'; } \\
  >> "${callsLogPath}"
case "$1" in
  --version) echo "gbrain test-0.41.0"; exit 0 ;;
  search) echo "[]"; exit 0 ;;
  get_page) echo ""; exit 0 ;;
  put)
    SLUG="$2"
    shift 2
    while [ $# -gt 0 ]; do
      if [ "$1" = "--content" ]; then
        PAYLOAD_DIR="${payloadDir}"
        mkdir -p "$PAYLOAD_DIR/$(dirname "$SLUG")"
        printf '%s' "$2" > "$PAYLOAD_DIR/$SLUG.md"
        break
      fi
      shift
    done
    exit 0
    ;;
esac
exit 0
`;
      const fakePath = join(binDir, 'gbrain');
      writeFileSync(fakePath, fakeGbrain);
      chmodSync(fakePath, 0o755);

      run('git', ['add', '.']);
      run('git', ['commit', '-m', 'fixture']);
    });

    // Remove the temp workdir; failures here must not fail the suite.
    afterAll(() => {
      try {
        rmSync(workDir, { recursive: true, force: true });
      } catch {
        // best effort
      }
    });

    testConcurrentIfSelected(
      'office-hours-brain-writeback',
      async () => {
        // Run the skill with workdir/bin first on PATH so `gbrain` resolves to the fake.
        const result = await runSkillTest({
          prompt: `Read office-hours/SKILL.md for the workflow.

Read pitch.md — that's a founder pitch coming to office hours. Select Startup Mode. Skip any AskUserQuestion — this is non-interactive; auto-decide the recommended option for any question.

For the diagnostic, assume the founder confirmed Q1 (strongest evidence = "230 from a single tweet + 51 paying creators in 6 weeks"), Q2 (status quo = "creators write ad-hoc checks or use opaque Patreon-style platforms"), and Q3 (forcing question already asked).

Generate the design doc per Phase 5. Slug it 'pixel-fund'. Then EXPLICITLY follow the "Save Results to Brain" section: call \`gbrain\` to save the design doc to your brain. The \`gbrain\` binary is on PATH at ${workDir}/bin/gbrain. Use the slug 'pixel-fund' as the feature-slug, and include the actual design doc markdown body in the --content payload. Then enrich entity stubs for any named people or companies mentioned in the pitch.

This is a test of the brain-writeback path. Do NOT skip the gbrain save step under any circumstance — the runtime guard ("skip if gbrain not on PATH") does NOT apply here because gbrain IS available. If you encounter any AskUserQuestion, auto-decide recommended.`,
          workingDirectory: workDir,
          maxTurns: 12,
          timeout: 360_000,
          testName: 'office-hours-brain-writeback',
          runId,
          model: 'claude-sonnet-4-6',
          extraEnv: {
            PATH: `${join(workDir, 'bin')}:${process.env.PATH || ''}`,
          },
        });

        logCost('/office-hours (BRAIN WRITEBACK)', result);
        recordE2E(
          evalCollector,
          '/office-hours-brain-writeback',
          'Office Hours Brain Writeback E2E',
          result,
          {
            passed: ['success', 'error_max_turns'].includes(result.exitReason),
          },
        );
        // Hitting the turn cap is tolerated: the writeback may already have happened.
        expect(['success', 'error_max_turns']).toContain(result.exitReason);

        // The headline assertion: agent actually called gbrain put on the
        // expected slug.
        if (!existsSync(callsLogPath)) {
          throw new Error(
            `No gbrain calls log at ${callsLogPath}. ` +
              `Agent likely did NOT invoke gbrain at all. ` +
              `Check that office-hours/SKILL.md in the workdir contains the gbrain put block.`,
          );
        }
        const callsLog = readFileSync(callsLogPath, 'utf-8');
        console.log('--- gbrain calls log ---');
        console.log(callsLog);
        console.log('--- end calls log ---');

        expect(callsLog).toContain('gbrain put');
        expect(callsLog).toMatch(/gbrain put .*office-hours\/pixel-fund/);

        // Payload file exists and has valid YAML frontmatter.
        const payloadPath = join(payloadDir, 'office-hours', 'pixel-fund.md');
        if (!existsSync(payloadPath)) {
          throw new Error(
            `Agent called gbrain put but payload file missing at ${payloadPath}. ` +
              `Check fake gbrain --content parsing (likely an argv quoting issue).`,
          );
        }
        const payload = readFileSync(payloadPath, 'utf-8');
        expect(payload).toMatch(/^---\s*\n/);
        expect(payload).toContain('title:');
        expect(payload).toContain('tags:');
        expect(payload).toContain('design-doc');
        expect(payload.length).toBeGreaterThan(200);

        // Entity stubs (at least one — the founder's name is in the pitch).
        const entityFiles = existsSync(join(payloadDir, 'entities'))
          ? readdirSync(join(payloadDir, 'entities'))
          : [];
        if (entityFiles.length === 0) {
          // Soft-fail: entity stub extraction is a nice-to-have. Log but
          // don't block the test on it — the resolver instructions tell
          // the agent to extract entities, but model variability means
          // small pitches sometimes produce no entities.
          console.warn(
            'No entity stub files created. Resolver instructs entity ' +
              'extraction but it is best-effort.',
          );
        } else {
          console.log('Entity stubs created:', entityFiles);
        }
      },
      // Outer test timeout; longer than the 360s runSkillTest timeout above.
      420_000,
    );
  },
);
|