mirror of https://github.com/garrytan/gstack.git
197 lines
7.6 KiB
TypeScript
197 lines
7.6 KiB
TypeScript
/**
 * Transcript section logger (v2 plan T10).
 *
 * Two jobs, both pure analysis over a SkillTestResult / NDJSON transcript:
 *
 *   1. extractSectionReads() — which `sections/*.md` files a run actually Read.
 *      Used by the sectioned world (post-carve) to verify the agent opened the
 *      chapters its situation required.
 *
 *   2. extractShipActions() — an observable ACTION fingerprint of a /ship run
 *      (ran tests, bumped VERSION, wrote CHANGELOG, created PR, ...). This works
 *      on BOTH the monolith and the sectioned skill, which is the whole point:
 *      capture a baseline on the current monolith ship FIRST, then assert the
 *      sectioned ship still performs the same actions. A section-read check alone
 *      can't catch "agent read the chapter but skipped the step"; the action
 *      fingerprint can.
 *
 * Why baseline-first (Codex outside-voice critique on the T9 plan): a logger
 * shipped in the same PR as the carve is post-failure telemetry unless it has a
 * pre-carve reference. writeShipBaseline() persists the monolith's action
 * fingerprint so compareShipActions() can flag a regression introduced by the
 * carve.
 *
 * Pure functions, no I/O except the explicit read/write baseline helpers. The
 * unit tests drive these with synthetic transcripts — no paid run needed to
 * validate the logic.
 */
|
|
|
|
import * as fs from 'fs';
|
|
import * as path from 'path';
|
|
import * as os from 'os';
|
|
|
|
/**
 * One tool invocation taken from a run transcript.
 *
 * Deliberately structural — only the fields this module needs from a
 * SkillTestResult — so callers can pass entries from a full SkillTestResult or
 * from a hand-built fixture in unit tests.
 */
export interface ToolCallLike {
  /** Tool name as recorded in the transcript (this module looks for 'Read', 'Bash', 'Write', 'Edit'). */
  tool: string;
  /** Raw tool input; kept as `unknown` and narrowed at each point of use. */
  input: unknown;
  /** Tool output text, when present. */
  output?: string;
}
|
|
/**
 * The slice of a run result that the extractors in this module consume.
 * Structural on purpose, so a full SkillTestResult and a minimal test fixture
 * are both acceptable.
 */
export interface TranscriptResultLike {
  /** Tool calls of the run; the extractors iterate them in array order. */
  toolCalls: ToolCallLike[];
  /** Optional run-level output text. */
  output?: string;
}
|
|
|
|
/**
 * Extract `file_path` from a tool-call input of unknown shape.
 *
 * @param input Raw tool input; may be anything, including null or a primitive.
 * @returns The `file_path` value when `input` is an object carrying a string
 *   under that key, otherwise null.
 */
function readFilePath(input: unknown): string | null {
  // Primitives, null and undefined cannot carry a file_path.
  if (typeof input !== 'object' || input === null) return null;
  const candidate = (input as Record<string, unknown>).file_path;
  return typeof candidate === 'string' ? candidate : null;
}
|
|
|
|
/**
 * Extract the `command` string from a Bash tool-call input of unknown shape.
 *
 * @param input Raw tool input; may be anything, including null or a primitive.
 * @returns The `command` value when `input` is an object carrying a string
 *   under that key, otherwise null.
 */
function bashCommand(input: unknown): string | null {
  const command =
    input !== null && typeof input === 'object'
      ? (input as Record<string, unknown>).command
      : undefined;
  return typeof command === 'string' ? command : null;
}
|
|
|
|
/**
 * List the `sections/<name>.md` files a run opened with the Read tool.
 *
 * Each hit is reduced to the section's basename (e.g. "version-bump.md"). The
 * match is made on the trailing `/sections/<file>.md` path segment, so a
 * relative, absolute, or prefixed install path all resolve to the same name.
 *
 * @param result Transcript whose `toolCalls` are scanned; non-Read calls and
 *   Read calls without a string `file_path` are ignored.
 * @returns Section basenames, deduped, in the order each was first Read.
 */
export function extractSectionReads(result: TranscriptResultLike): string[] {
  const sectionPath = /(?:^|\/)sections\/([A-Za-z0-9._-]+\.md)$/;
  // A Set iterates in insertion order, which gives dedupe and first-Read
  // ordering in one structure.
  const sections = new Set<string>();

  for (const { tool, input } of result.toolCalls) {
    if (tool !== 'Read') continue;
    const filePath = readFilePath(input);
    if (!filePath) continue;
    const hit = sectionPath.exec(filePath);
    if (hit) sections.add(hit[1]);
  }

  return [...sections];
}
|
|
|
|
/**
 * Canonical vocabulary of /ship actions.
 *
 * Actions are detected from the Bash commands the agent ran, plus a couple of
 * Write/Edit signals. The array order is the rough ship sequence and is the
 * order results are reported in; detection itself does not depend on order.
 *
 * Keep this list aligned with the ship skeleton's numbered steps: the
 * section-loading eval asserts that the sectioned ship still triggers the same
 * actions a monolith run did for the same fixture situation.
 */
export const SHIP_ACTIONS = [
  // git merge <base>
  'merged_base',
  // bun test / npm test / the project test cmd
  'ran_tests',
  // wrote VERSION / package.json version / ran gstack-version-bump
  'bumped_version',
  // edited CHANGELOG.md
  'wrote_changelog',
  // git commit
  'committed',
  // git push
  'pushed',
  // gh pr create / glab mr create
  'opened_pr',
] as const;

/** Union of the action names listed in SHIP_ACTIONS. */
export type ShipAction = (typeof SHIP_ACTIONS)[number];
|
|
|
|
/**
 * Bash-command detectors: one regex per ship action, tested against the full
 * command string. A single command line may match several entries (e.g. a
 * `git commit ... && git push` chain).
 *
 * `git merge` and `git commit` end in a `(?![\w-])` guard instead of a plain
 * `\b`. A word boundary also exists between "merge" and "-", so with `\b` the
 * read-only query `git merge-base ...` was fingerprinted as merging the base
 * branch, and `git commit-tree` / `git commit-graph` as committing. Baselines
 * captured while that false positive was live may need to be re-captured.
 */
const BASH_ACTION_PATTERNS: Array<{ action: ShipAction; re: RegExp }> = [
  { action: 'merged_base', re: /\bgit\s+merge(?![\w-])/ },
  { action: 'ran_tests', re: /\b(bun\s+test|npm\s+(run\s+)?test|yarn\s+test|pytest|go\s+test|cargo\s+test|rspec)\b/ },
  { action: 'bumped_version', re: /gstack-version-bump\b|gstack-next-version\b|>\s*VERSION\b|npm\s+version\b/ },
  // Any command that mentions the changelog file counts as a changelog signal.
  { action: 'wrote_changelog', re: /CHANGELOG\.md/ },
  { action: 'committed', re: /\bgit\s+commit(?![\w-])/ },
  { action: 'pushed', re: /\bgit\s+push\b/ },
  { action: 'opened_pr', re: /\bgh\s+pr\s+create\b|\bglab\s+mr\s+create\b/ },
];
|
|
|
|
/**
 * Build the observable action fingerprint of a ship run.
 *
 * Only what the agent DID is inspected — Bash commands matched against
 * BASH_ACTION_PATTERNS, plus Write/Edit calls whose `file_path` ends in
 * CHANGELOG.md or is a VERSION file — never which prose it loaded. That is why
 * the result is comparable between the monolith and the sectioned skill.
 *
 * @param result Transcript whose `toolCalls` are scanned; calls to other tools
 *   are ignored.
 * @returns The detected actions, deduped and in SHIP_ACTIONS order (not the
 *   order in which they happened during the run).
 */
export function extractShipActions(result: TranscriptResultLike): ShipAction[] {
  const observed = new Set<ShipAction>();

  for (const { tool, input } of result.toolCalls) {
    switch (tool) {
      case 'Bash': {
        const command = bashCommand(input);
        if (!command) break;
        // One command line can chain several steps, so try every pattern.
        for (const pattern of BASH_ACTION_PATTERNS) {
          if (pattern.re.test(command)) observed.add(pattern.action);
        }
        break;
      }
      case 'Write':
      case 'Edit': {
        const target = readFilePath(input);
        if (!target) break;
        if (/CHANGELOG\.md$/.test(target)) observed.add('wrote_changelog');
        if (/(?:^|\/)VERSION$/.test(target)) observed.add('bumped_version');
        break;
      }
      default:
        break;
    }
  }

  // Report in canonical order rather than discovery order.
  return SHIP_ACTIONS.filter((action) => observed.has(action));
}
|
|
|
|
/**
 * Reference record of one ship run, stored as JSON so a later run can be
 * compared against it.
 */
export interface ShipBaseline {
  /** Label for this capture. NOTE(review): not interpreted anywhere in this file — meaning is up to the caller. */
  tag: string;
  /** Id of the fixture/situation the baseline was captured for; also used to name the baseline file. */
  situation: string;
  /** Action fingerprint observed on the monolith ship. */
  actions: ShipAction[];
  /** Section reads observed: empty on the monolith, populated after the carve. */
  sectionReads: string[];
  /** When the baseline was captured. NOTE(review): presumably an ISO-8601 timestamp — format is not enforced here. */
  capturedAt: string;
}
|
|
|
|
/** Default baseline directory: `.gstack-dev/ship-baselines` under the user's home directory. */
const DEFAULT_BASELINE_DIR = path.join(os.homedir(), '.gstack-dev', 'ship-baselines');

/**
 * Resolve the file that holds the baseline for one situation.
 *
 * @param situation Fixture/situation id; becomes the file's base name.
 * @param dir Directory holding baselines; defaults to DEFAULT_BASELINE_DIR.
 * @returns `<dir>/<situation>.json`, joined with the platform path separator.
 */
export function baselinePath(situation: string, dir = DEFAULT_BASELINE_DIR): string {
  const fileName = situation + '.json';
  return path.join(dir, fileName);
}
|
|
|
|
/**
 * Persist a ship baseline (used once on the monolith, before the carve).
 *
 * The baseline is written as 2-space-indented JSON with a trailing newline,
 * replacing any file already stored for the same situation.
 *
 * @param baseline Record to store; `baseline.situation` determines the file name.
 * @param dir Directory holding baselines; defaults to DEFAULT_BASELINE_DIR.
 * @returns The path of the file that was written.
 * @throws Whatever `fs.mkdirSync` / `fs.writeFileSync` throw (e.g. on a
 *   permissions failure); errors are not swallowed.
 */
export function writeShipBaseline(baseline: ShipBaseline, dir = DEFAULT_BASELINE_DIR): string {
  const target = baselinePath(baseline.situation, dir);
  // Create the file's own parent, not just `dir`: a situation id containing a
  // path separator (e.g. "ship/clean") resolves to a nested folder, and the
  // write would otherwise fail with ENOENT.
  fs.mkdirSync(path.dirname(target), { recursive: true });
  fs.writeFileSync(target, JSON.stringify(baseline, null, 2) + '\n');
  return target;
}
|
|
|
|
/**
 * Read a previously-captured baseline.
 *
 * Best-effort by design: every failure mode is reported as "no baseline"
 * rather than thrown.
 *
 * @param situation Fixture/situation id the baseline was stored under.
 * @param dir Directory holding baselines; defaults to DEFAULT_BASELINE_DIR.
 * @returns The stored baseline, or null when the file is missing or unreadable,
 *   is not valid JSON, or does not contain an object with a string-array
 *   `actions` field.
 */
export function readShipBaseline(situation: string, dir = DEFAULT_BASELINE_DIR): ShipBaseline | null {
  let parsed: unknown;
  try {
    parsed = JSON.parse(fs.readFileSync(baselinePath(situation, dir), 'utf-8'));
  } catch {
    // Missing/unreadable file or malformed JSON.
    return null;
  }

  // JSON.parse can legally yield null, a primitive, or an array. Returning that
  // under a ShipBaseline cast would only defer the failure to the first
  // `baseline.actions` access, so check the one field consumers dereference.
  if (typeof parsed !== 'object' || parsed === null) return null;
  const { actions } = parsed as { actions?: unknown };
  if (!Array.isArray(actions) || !actions.every((entry) => typeof entry === 'string')) {
    return null;
  }
  return parsed as ShipBaseline;
}
|
|
|
|
/** Outcome of comparing a run's action fingerprint with a stored baseline. */
export interface ShipActionDiff {
  /** The regression set: actions the baseline performed but the current run did not. */
  missing: ShipAction[];
  /** Actions seen only in the current run; usually harmless. */
  added: ShipAction[];
  /** True exactly when `missing` is empty, i.e. no baseline action was dropped. */
  ok: boolean;
}
|
|
|
|
/**
 * Diff the current (sectioned) ship run against the monolith baseline.
 *
 * The carve regression this exists to catch is a dropped action: something the
 * monolith did that the sectioned ship no longer does.
 *
 * @param baseline Stored reference; only `baseline.actions` is consulted.
 * @param current Action fingerprint of the run under test.
 * @returns `missing` (in baseline order), `added` (in `current` order), and
 *   `ok`, which is true when nothing is missing. Extra actions do not affect `ok`.
 */
export function compareShipActions(baseline: ShipBaseline, current: ShipAction[]): ShipActionDiff {
  // Keep the entries of `list` that do not appear in `other`, preserving order.
  const without = (list: ShipAction[], other: ShipAction[]): ShipAction[] => {
    const exclude = new Set(other);
    return list.filter((action) => !exclude.has(action));
  };

  const missing = without(baseline.actions, current);
  const added = without(current, baseline.actions);
  const ok = missing.length === 0;
  return { missing, added, ok };
}
|