mirror of https://github.com/garrytan/gstack.git
459 lines
16 KiB
TypeScript
459 lines
16 KiB
TypeScript
/**
|
|
* /plan-tune cathedral E2E (T16) — 5 scenarios, all gate tier per D12.
|
|
*
|
|
* Each scenario verifies that the cathedral's substrate works end-to-end
|
|
* against a real `claude -p` invocation. Unit tests in test/{question-log-hook,
|
|
* question-preference-hook, declared-annotation, distill-*}.test.ts cover
|
|
* deterministic plumbing; this file proves the agent obeys the hook
|
|
* contracts in a live session.
|
|
*
|
|
* Touchfile registration in test/helpers/touchfiles.ts:
|
|
* - plan-tune-hook-capture
|
|
* - plan-tune-enforcement
|
|
* - plan-tune-annotation
|
|
* - plan-tune-codex-import
|
|
* - plan-tune-dream-cycle
|
|
*
|
|
* Each scenario uses GSTACK_STATE_ROOT to isolate from the user's real
|
|
* ~/.gstack (per cathedral T1 + Codex D16 fix). Cost budget ~$3-4/scenario.
|
|
*/
|
|
|
|
import { beforeAll, afterAll, expect } from 'bun:test';
|
|
import {
|
|
ROOT,
|
|
describeIfSelected,
|
|
testConcurrentIfSelected,
|
|
copyDirSync,
|
|
createEvalCollector,
|
|
finalizeEvalCollector,
|
|
} from './helpers/e2e-helpers';
|
|
import { spawnSync } from 'child_process';
|
|
import * as fs from 'fs';
|
|
import * as path from 'path';
|
|
import * as os from 'os';
|
|
|
|
const collector = createEvalCollector('e2e-plan-tune-cathedral');
|
|
|
|
afterAll(() => {
|
|
finalizeEvalCollector(collector);
|
|
});
|
|
|
|
/** Scaffold a fixture project with the bins + scripts the cathedral needs. */
|
|
function scaffoldFixture(prefix: string): { workDir: string; stateRoot: string; slug: string } {
|
|
const workDir = fs.mkdtempSync(path.join(os.tmpdir(), prefix));
|
|
const stateRoot = path.join(workDir, '.gstack-state');
|
|
fs.mkdirSync(stateRoot, { recursive: true });
|
|
|
|
// git init so gstack-slug resolves a deterministic slug.
|
|
spawnSync('git', ['init', '-b', 'main'], { cwd: workDir, stdio: 'pipe' });
|
|
spawnSync('git', ['config', 'user.email', 't@t.com'], { cwd: workDir, stdio: 'pipe' });
|
|
spawnSync('git', ['config', 'user.name', 'T'], { cwd: workDir, stdio: 'pipe' });
|
|
fs.writeFileSync(path.join(workDir, 'README.md'), '# cathedral fixture\n');
|
|
spawnSync('git', ['add', '.'], { cwd: workDir, stdio: 'pipe' });
|
|
spawnSync('git', ['commit', '-m', 'init'], { cwd: workDir, stdio: 'pipe' });
|
|
|
|
// Copy bins.
|
|
const binDir = path.join(workDir, 'bin');
|
|
fs.mkdirSync(binDir, { recursive: true });
|
|
for (const script of [
|
|
'gstack-slug',
|
|
'gstack-config',
|
|
'gstack-paths',
|
|
'gstack-question-log',
|
|
'gstack-question-preference',
|
|
'gstack-developer-profile',
|
|
'gstack-codex-session-import',
|
|
'gstack-distill-free-text',
|
|
'gstack-distill-apply',
|
|
]) {
|
|
const src = path.join(ROOT, 'bin', script);
|
|
if (fs.existsSync(src)) {
|
|
fs.copyFileSync(src, path.join(binDir, script));
|
|
fs.chmodSync(path.join(binDir, script), 0o755);
|
|
}
|
|
}
|
|
|
|
// Copy scripts that the bins import.
|
|
const scriptsDir = path.join(workDir, 'scripts');
|
|
fs.mkdirSync(scriptsDir, { recursive: true });
|
|
for (const f of [
|
|
'question-registry.ts',
|
|
'psychographic-signals.ts',
|
|
'archetypes.ts',
|
|
'one-way-doors.ts',
|
|
'declared-annotation.ts',
|
|
]) {
|
|
const src = path.join(ROOT, 'scripts', f);
|
|
if (fs.existsSync(src)) fs.copyFileSync(src, path.join(scriptsDir, f));
|
|
}
|
|
|
|
// Copy hooks dir.
|
|
copyDirSync(path.join(ROOT, 'hosts', 'claude', 'hooks'), path.join(workDir, 'hosts', 'claude', 'hooks'));
|
|
|
|
const slug = path.basename(workDir).replace(/[^a-zA-Z0-9._-]/g, '');
|
|
return { workDir, stateRoot, slug };
|
|
}
|
|
|
|
function cleanupFixture(workDir: string): void {
|
|
try {
|
|
fs.rmSync(workDir, { recursive: true, force: true });
|
|
} catch {
|
|
// best-effort
|
|
}
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Scenario 1: Hook capture — PostToolUse hook writes to question-log.jsonl
|
|
// ---------------------------------------------------------------------------
|
|
|
|
describeIfSelected('PlanTune cathedral E2E: hook capture', ['plan-tune-hook-capture'], () => {
|
|
let fixture: ReturnType<typeof scaffoldFixture>;
|
|
|
|
beforeAll(() => {
|
|
fixture = scaffoldFixture('cathedral-cap-');
|
|
});
|
|
|
|
afterAll(() => {
|
|
cleanupFixture(fixture.workDir);
|
|
});
|
|
|
|
testConcurrentIfSelected('hook directly invoked → log fills', async () => {
|
|
// Direct hook invocation simulates Claude Code's PostToolUse delivery.
|
|
// E2E verifies the hook + bin chain works against real bins on disk
|
|
// (the unit test exercises this with mocks).
|
|
const hookPath = path.join(fixture.workDir, 'hosts', 'claude', 'hooks', 'question-log-hook');
|
|
const payload = {
|
|
session_id: 'cathedral-e2e-cap',
|
|
tool_name: 'AskUserQuestion',
|
|
tool_use_id: 'tu-cap-1',
|
|
tool_input: {
|
|
questions: [
|
|
{
|
|
question:
|
|
'D1 — Cathedral E2E capture <gstack-qid:ship-test-failure-triage>\nRecommendation: A',
|
|
options: ['A) Fix now (recommended)', 'B) Investigate'],
|
|
},
|
|
],
|
|
},
|
|
tool_response: { answers: [{ option_label: 'A) Fix now (recommended)' }] },
|
|
cwd: fixture.workDir,
|
|
};
|
|
const res = spawnSync(hookPath, [], {
|
|
env: {
|
|
...process.env,
|
|
GSTACK_STATE_ROOT: fixture.stateRoot,
|
|
GSTACK_QUESTION_LOG_NO_DERIVE: '1',
|
|
},
|
|
input: JSON.stringify(payload),
|
|
encoding: 'utf-8',
|
|
});
|
|
expect(res.status).toBe(0);
|
|
const logPath = path.join(fixture.stateRoot, 'projects', fixture.slug, 'question-log.jsonl');
|
|
expect(fs.existsSync(logPath)).toBe(true);
|
|
const lines = fs.readFileSync(logPath, 'utf-8').trim().split('\n');
|
|
expect(lines.length).toBeGreaterThanOrEqual(1);
|
|
const evt = JSON.parse(lines[0]);
|
|
expect(evt.source).toBe('hook');
|
|
expect(evt.question_id).toBe('ship-test-failure-triage');
|
|
});
|
|
});
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Scenario 2: Enforcement — never-ask preference + marker + 2-way → deny
|
|
// ---------------------------------------------------------------------------
|
|
|
|
describeIfSelected('PlanTune cathedral E2E: enforcement', ['plan-tune-enforcement'], () => {
|
|
let fixture: ReturnType<typeof scaffoldFixture>;
|
|
|
|
beforeAll(() => {
|
|
fixture = scaffoldFixture('cathedral-enf-');
|
|
fs.mkdirSync(path.join(fixture.stateRoot, 'projects', fixture.slug), { recursive: true });
|
|
fs.writeFileSync(
|
|
path.join(fixture.stateRoot, 'projects', fixture.slug, 'question-preferences.json'),
|
|
JSON.stringify({ 'ship-changelog-voice-polish': 'never-ask' }),
|
|
);
|
|
});
|
|
|
|
afterAll(() => {
|
|
cleanupFixture(fixture.workDir);
|
|
});
|
|
|
|
testConcurrentIfSelected('PreToolUse hook denies + logs auto-decided event', async () => {
|
|
const hookPath = path.join(
|
|
fixture.workDir,
|
|
'hosts',
|
|
'claude',
|
|
'hooks',
|
|
'question-preference-hook',
|
|
);
|
|
const payload = {
|
|
session_id: 'cathedral-e2e-enf',
|
|
tool_name: 'AskUserQuestion',
|
|
tool_use_id: 'tu-enf-1',
|
|
tool_input: {
|
|
questions: [
|
|
{
|
|
question:
|
|
'<gstack-qid:ship-changelog-voice-polish> Polish CHANGELOG entry?',
|
|
options: ['A) Accept (recommended)', 'B) Skip'],
|
|
},
|
|
],
|
|
},
|
|
cwd: fixture.workDir,
|
|
};
|
|
const res = spawnSync(hookPath, [], {
|
|
env: {
|
|
...process.env,
|
|
GSTACK_STATE_ROOT: fixture.stateRoot,
|
|
GSTACK_QUESTION_LOG_NO_DERIVE: '1',
|
|
},
|
|
input: JSON.stringify(payload),
|
|
encoding: 'utf-8',
|
|
});
|
|
expect(res.status).toBe(0);
|
|
const parsed = JSON.parse(res.stdout || '{}');
|
|
expect(parsed.hookSpecificOutput?.permissionDecision).toBe('deny');
|
|
expect(parsed.hookSpecificOutput?.permissionDecisionReason).toContain('Accept');
|
|
|
|
// Auto-decided event was logged.
|
|
const logPath = path.join(fixture.stateRoot, 'projects', fixture.slug, 'question-log.jsonl');
|
|
expect(fs.existsSync(logPath)).toBe(true);
|
|
const events = fs
|
|
.readFileSync(logPath, 'utf-8')
|
|
.trim()
|
|
.split('\n')
|
|
.filter(Boolean)
|
|
.map((l) => JSON.parse(l));
|
|
const auto = events.filter((e) => e.source === 'auto-decided');
|
|
expect(auto.length).toBe(1);
|
|
expect(auto[0].question_id).toBe('ship-changelog-voice-polish');
|
|
});
|
|
});
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Scenario 3: Annotation — declared profile injected via additionalContext
|
|
// ---------------------------------------------------------------------------
|
|
|
|
describeIfSelected('PlanTune cathedral E2E: annotation', ['plan-tune-annotation'], () => {
|
|
let fixture: ReturnType<typeof scaffoldFixture>;
|
|
|
|
beforeAll(() => {
|
|
fixture = scaffoldFixture('cathedral-ann-');
|
|
// Strong declared profile that should annotate any signal_key=detail-preference question.
|
|
fs.writeFileSync(
|
|
path.join(fixture.stateRoot, 'developer-profile.json'),
|
|
JSON.stringify({ declared: { detail_preference: 0.9 } }),
|
|
);
|
|
// Seed a memory nugget for the matching signal_key.
|
|
fs.writeFileSync(
|
|
path.join(fixture.stateRoot, 'free-text-memory.json'),
|
|
JSON.stringify({
|
|
nuggets: [
|
|
{
|
|
nugget: 'User prefers verbose explanations with tradeoffs',
|
|
applies_to_signal_keys: ['detail-preference'],
|
|
applied_at: new Date().toISOString(),
|
|
},
|
|
],
|
|
}),
|
|
);
|
|
});
|
|
|
|
afterAll(() => {
|
|
cleanupFixture(fixture.workDir);
|
|
});
|
|
|
|
testConcurrentIfSelected('PreToolUse hook surfaces memory nugget on defer', async () => {
|
|
const hookPath = path.join(
|
|
fixture.workDir,
|
|
'hosts',
|
|
'claude',
|
|
'hooks',
|
|
'question-preference-hook',
|
|
);
|
|
const payload = {
|
|
session_id: 'cathedral-e2e-ann',
|
|
tool_name: 'AskUserQuestion',
|
|
tool_use_id: 'tu-ann-1',
|
|
tool_input: {
|
|
questions: [
|
|
{
|
|
question: '<gstack-qid:ship-todos-reorganize> Reorganize TODOs?',
|
|
options: ['A) Accept (recommended)', 'B) Skip'],
|
|
},
|
|
],
|
|
},
|
|
cwd: fixture.workDir,
|
|
};
|
|
const res = spawnSync(hookPath, [], {
|
|
env: {
|
|
...process.env,
|
|
GSTACK_STATE_ROOT: fixture.stateRoot,
|
|
GSTACK_QUESTION_LOG_NO_DERIVE: '1',
|
|
},
|
|
input: JSON.stringify(payload),
|
|
encoding: 'utf-8',
|
|
});
|
|
expect(res.status).toBe(0);
|
|
const parsed = JSON.parse(res.stdout || '{}');
|
|
expect(parsed.hookSpecificOutput?.permissionDecision).toBe('defer');
|
|
expect(parsed.hookSpecificOutput?.additionalContext).toContain('verbose explanations');
|
|
});
|
|
});
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Scenario 4: Codex import — JSONL session → import bin → log fills
|
|
// ---------------------------------------------------------------------------
|
|
|
|
describeIfSelected('PlanTune cathedral E2E: codex import', ['plan-tune-codex-import'], () => {
|
|
let fixture: ReturnType<typeof scaffoldFixture>;
|
|
let sessionFile: string;
|
|
|
|
beforeAll(() => {
|
|
fixture = scaffoldFixture('cathedral-cdx-');
|
|
sessionFile = path.join(fixture.workDir, 'rollout-cathedral.jsonl');
|
|
const lines = [
|
|
JSON.stringify({
|
|
type: 'session_meta',
|
|
payload: { id: 'cathedral-sess-1', cwd: fixture.workDir },
|
|
}),
|
|
JSON.stringify({
|
|
timestamp: new Date().toISOString(),
|
|
type: 'event_msg',
|
|
payload: {
|
|
type: 'agent_message',
|
|
message:
|
|
'D1 — Cathedral import <gstack-qid:plan-eng-review-scope-reduce>\nRecommendation: A\nA) Reduce (recommended)\nB) Keep',
|
|
},
|
|
}),
|
|
JSON.stringify({
|
|
timestamp: new Date().toISOString(),
|
|
type: 'event_msg',
|
|
payload: { type: 'user_message', message: 'A' },
|
|
}),
|
|
];
|
|
fs.writeFileSync(sessionFile, lines.join('\n') + '\n');
|
|
});
|
|
|
|
afterAll(() => {
|
|
cleanupFixture(fixture.workDir);
|
|
});
|
|
|
|
testConcurrentIfSelected('importer extracts events with codex-import-marker source', async () => {
|
|
const bin = path.join(fixture.workDir, 'bin', 'gstack-codex-session-import');
|
|
const res = spawnSync(bin, [sessionFile], {
|
|
env: {
|
|
...process.env,
|
|
GSTACK_STATE_ROOT: fixture.stateRoot,
|
|
GSTACK_QUESTION_LOG_NO_DERIVE: '1',
|
|
},
|
|
encoding: 'utf-8',
|
|
cwd: fixture.workDir,
|
|
});
|
|
expect(res.status).toBe(0);
|
|
expect(res.stdout).toContain('IMPORTED: 1');
|
|
const logPath = path.join(fixture.stateRoot, 'projects', fixture.slug, 'question-log.jsonl');
|
|
expect(fs.existsSync(logPath)).toBe(true);
|
|
const events = fs
|
|
.readFileSync(logPath, 'utf-8')
|
|
.trim()
|
|
.split('\n')
|
|
.filter(Boolean)
|
|
.map((l) => JSON.parse(l));
|
|
expect(events.length).toBe(1);
|
|
expect(events[0].source).toBe('codex-import-marker');
|
|
expect(events[0].question_id).toBe('plan-eng-review-scope-reduce');
|
|
});
|
|
});
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Scenario 5: Dream cycle round-trip — capture → distill (mocked) → apply →
|
|
// re-fire → memory injection
|
|
// ---------------------------------------------------------------------------
|
|
|
|
describeIfSelected('PlanTune cathedral E2E: dream cycle', ['plan-tune-dream-cycle'], () => {
|
|
let fixture: ReturnType<typeof scaffoldFixture>;
|
|
|
|
beforeAll(() => {
|
|
fixture = scaffoldFixture('cathedral-dream-');
|
|
// Seed proposals file directly (the SDK call is exercised by the unit
|
|
// test; here we verify apply → re-fire round-trip on top of a known
|
|
// proposal shape).
|
|
fs.mkdirSync(path.join(fixture.stateRoot, 'projects', fixture.slug), { recursive: true });
|
|
fs.writeFileSync(
|
|
path.join(fixture.stateRoot, 'projects', fixture.slug, 'distillation-proposals.json'),
|
|
JSON.stringify({
|
|
generated_at: new Date().toISOString(),
|
|
source_event_count: 1,
|
|
proposals: [
|
|
{
|
|
kind: 'memory-nugget',
|
|
confidence: 0.95,
|
|
nugget: 'User wants every fix tested before shipping',
|
|
applies_to_signal_keys: ['test-discipline'],
|
|
source_quotes: ['always add tests for any fix'],
|
|
},
|
|
],
|
|
}),
|
|
);
|
|
});
|
|
|
|
afterAll(() => {
|
|
cleanupFixture(fixture.workDir);
|
|
});
|
|
|
|
testConcurrentIfSelected('apply → re-fire → memory injected via additionalContext', async () => {
|
|
// 1. Apply the proposal via gstack-distill-apply.
|
|
const applyBin = path.join(fixture.workDir, 'bin', 'gstack-distill-apply');
|
|
const applyRes = spawnSync(applyBin, ['--proposal', '0'], {
|
|
env: { ...process.env, GSTACK_STATE_ROOT: fixture.stateRoot },
|
|
encoding: 'utf-8',
|
|
cwd: fixture.workDir,
|
|
});
|
|
expect(applyRes.status).toBe(0);
|
|
|
|
// Memory file should now contain the nugget.
|
|
const memPath = path.join(fixture.stateRoot, 'free-text-memory.json');
|
|
expect(fs.existsSync(memPath)).toBe(true);
|
|
const mem = JSON.parse(fs.readFileSync(memPath, 'utf-8'));
|
|
expect(mem.nuggets.length).toBe(1);
|
|
|
|
// 2. Re-fire a question whose signal_key matches the nugget. PreToolUse
|
|
// hook should surface the nugget via additionalContext.
|
|
const hookPath = path.join(
|
|
fixture.workDir,
|
|
'hosts',
|
|
'claude',
|
|
'hooks',
|
|
'question-preference-hook',
|
|
);
|
|
const payload = {
|
|
session_id: 'cathedral-e2e-dream',
|
|
tool_name: 'AskUserQuestion',
|
|
tool_use_id: 'tu-dream-1',
|
|
tool_input: {
|
|
questions: [
|
|
{
|
|
question:
|
|
'<gstack-qid:plan-eng-review-test-gap> Add tests for this gap?',
|
|
options: ['A) Add (recommended)', 'B) Skip'],
|
|
},
|
|
],
|
|
},
|
|
cwd: fixture.workDir,
|
|
};
|
|
const hookRes = spawnSync(hookPath, [], {
|
|
env: {
|
|
...process.env,
|
|
GSTACK_STATE_ROOT: fixture.stateRoot,
|
|
GSTACK_QUESTION_LOG_NO_DERIVE: '1',
|
|
},
|
|
input: JSON.stringify(payload),
|
|
encoding: 'utf-8',
|
|
});
|
|
expect(hookRes.status).toBe(0);
|
|
const parsed = JSON.parse(hookRes.stdout || '{}');
|
|
expect(parsed.hookSpecificOutput?.additionalContext).toContain('User wants every fix tested');
|
|
});
|
|
});
|