From 81fdf9cc6154c237484851b6b745337454f40d86 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Mon, 25 May 2026 20:36:43 -0700 Subject: [PATCH] =?UTF-8?q?test(budget):=20T5=20=E2=80=94=20hard=20token?= =?UTF-8?q?=20budgets=20+=20override=20audit=20trail=20(Phase=20A.6)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two new gate-tier guardrails for the v1.45.0.0 compression baseline: 1. test/skill-size-budget.test.ts (NEW) — per-skill SKILL.md size budget. Compares current state to test/fixtures/parity-baseline-v1.44.1.json. Three checks: per-skill (×1.05 default ratio), total corpus, and catalog token estimate (≤7000 for v1.45). The per-skill ratio is 1.05 not 1.0 because the T4 catalog trim moves text from frontmatter to a body section; small skills see a tiny body growth that's fine when offset by the much larger catalog-token win. 2. test/skill-budget-regression.test.ts EXTENDED — hard dollar cap on per-run eval cost. Per-tier defaults: gate $25, periodic $70. Umbrella EVALS_BUDGET_HARD_CAP=$30. Catches runaway eval costs (infinite retry, model price changes) before they amortize across PRs. Both checks support an override path with audit trail: GSTACK_SIZE_BUDGET_OVERRIDE_REASON="why this is OK" — size EVALS_BUDGET_OVERRIDE_REASON="why this is OK" — cost Overrides log to ~/.gstack/analytics/spend-overrides.jsonl with timestamp + scope + reason + CI provenance (runner, branch, commit) via test/helpers/budget-override.ts. Why the override audit: a hard cap with no escape valve becomes operationally hostile (legit price changes, longer transcripts, new required evals can all blow the cap). An override with no audit becomes "everyone overrides everything and the gate is theater." This module ships the audit half so reviewers can see what was waived and why. 
Codex 2nd-pass critique #3 absorbed: per-suite caps + override path with auditability + budget baselines checked into repo (parity-baseline-v1.44.1.json already in test/fixtures/). Test plan: - bun test test/skill-size-budget.test.ts: 4 pass (per-skill, corpus, catalog, baseline-exists) - bun test test/skill-budget-regression.test.ts: 4 pass (2 existing ratio checks + 2 new hard-cap checks) - Existing eval runs ($14.11 e2e, $0.02 llm-judge) sit well under the new caps Co-Authored-By: Claude Opus 4.7 (1M context) --- test/helpers/budget-override.ts | 50 +++++++++ test/skill-budget-regression.test.ts | 64 ++++++++++++ test/skill-size-budget.test.ts | 151 +++++++++++++++++++++++++++ 3 files changed, 265 insertions(+) create mode 100644 test/helpers/budget-override.ts create mode 100644 test/skill-size-budget.test.ts diff --git a/test/helpers/budget-override.ts b/test/helpers/budget-override.ts new file mode 100644 index 000000000..fefe2c790 --- /dev/null +++ b/test/helpers/budget-override.ts @@ -0,0 +1,50 @@ +/** + * Budget override audit trail (v1.45.0.0 T5). + * + * Records uses of GSTACK_SIZE_BUDGET_OVERRIDE_REASON or + * EVALS_BUDGET_OVERRIDE_REASON so a reviewer can see what was waived, + * by whom, and why. Append-only JSONL at ~/.gstack/analytics/spend-overrides.jsonl. + * + * Why audit: a hard cap with no escape valve becomes operationally hostile + * (legit price changes, longer transcripts, new required evals can all + * blow the cap). An escape valve with no audit becomes "everyone overrides + * everything and we lose the gate." This module is the audit half. + */ + +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; + +export interface BudgetOverrideEntry { + scope: string; // e.g. 
'skill-size-budget', 'evals-cost-cap' + reason: string; // user-supplied REASON env var + details?: Record; // numbers / regressions +} + +function getAuditPath(): string { + const base = process.env.GSTACK_HOME || path.join(os.homedir(), '.gstack'); + return path.join(base, 'analytics', 'spend-overrides.jsonl'); +} + +export function logBudgetOverride(entry: BudgetOverrideEntry): void { + try { + const auditPath = getAuditPath(); + fs.mkdirSync(path.dirname(auditPath), { recursive: true }); + const line = JSON.stringify({ + timestamp: new Date().toISOString(), + scope: entry.scope, + reason: entry.reason, + details: entry.details ?? {}, + // Capture provenance: who/where/which CI ran + ci: process.env.CI === 'true', + runner: process.env.GITHUB_ACTIONS ? 'github-actions' : process.env.CI_RUNNER || 'local', + branch: process.env.GITHUB_REF_NAME || process.env.CI_COMMIT_REF_NAME || 'unknown', + commit: process.env.GITHUB_SHA?.slice(0, 8) || process.env.CI_COMMIT_SHORT_SHA || 'unknown', + }) + '\n'; + fs.appendFileSync(auditPath, line); + } catch (err) { + // Best-effort logging; don't fail the test on audit-write errors. + // eslint-disable-next-line no-console + console.warn(`[budget-override] could not write audit log: ${(err as Error).message}`); + } +} diff --git a/test/skill-budget-regression.test.ts b/test/skill-budget-regression.test.ts index 651f09180..494ac6781 100644 --- a/test/skill-budget-regression.test.ts +++ b/test/skill-budget-regression.test.ts @@ -35,6 +35,27 @@ import { assertNoBudgetRegression, type EvalResult, } from './helpers/eval-store'; +import { logBudgetOverride } from './helpers/budget-override'; + +/** + * v1.45.0.0 T5 — hard eval cost cap. 
+ * + * Per-tier defaults (override via env): + * EVALS_BUDGET_HARD_CAP_GATE cap for the e2e tier; no built-in default, falls back to the umbrella cap ($30/run) + * EVALS_BUDGET_HARD_CAP_PERIODIC cap for the llm-judge tier; default $70/run, or the umbrella cap when that is higher + * EVALS_BUDGET_HARD_CAP umbrella cap used when a tier-specific cap isn't set; default $30 + * EVALS_BUDGET_OVERRIDE_REASON if set, override fires AND audit-logs to + * ~/.gstack/analytics/spend-overrides.jsonl + * + * Caps are dollars-per-run, not dollars-per-test. A test that legitimately + * gets more expensive should bake into the baseline; a runaway eval (infinite + * retry, model price change) gets stopped here. + */ +const DEFAULT_HARD_CAP_USD = Number(process.env.EVALS_BUDGET_HARD_CAP) || 30; +const TIER_CAPS: Record<'e2e' | 'llm-judge', number> = { + e2e: Number(process.env.EVALS_BUDGET_HARD_CAP_GATE) || DEFAULT_HARD_CAP_USD, + 'llm-judge': Number(process.env.EVALS_BUDGET_HARD_CAP_PERIODIC) || Math.max(70, DEFAULT_HARD_CAP_USD), +}; function currentGitBranch(): string { try { @@ -137,6 +158,40 @@ function checkTier(tier: 'e2e' | 'llm-judge'): void { ); } +/** Enforce a hard dollar cap on per-run eval cost. 
*/ +function checkHardCap(tier: 'e2e' | 'llm-judge'): void { + const evalDir = getProjectEvalDir(); + const latest = findLatestRun(evalDir, tier); + if (!latest) return; + const cap = TIER_CAPS[tier]; + const cost = latest.result.total_cost_usd; + if (cost <= cap) { + // eslint-disable-next-line no-console + console.log(`[budget-hard-cap:${tier}] OK — $${cost.toFixed(2)} ≤ $${cap.toFixed(2)} cap`); + return; + } + const overrideReason = process.env.EVALS_BUDGET_OVERRIDE_REASON?.trim(); + if (overrideReason) { + logBudgetOverride({ + scope: `evals-cost-cap-${tier}`, + reason: overrideReason, + details: { tier, cap, observed_cost_usd: cost, run_file: latest.filepath }, + }); + // eslint-disable-next-line no-console + console.warn( + `[budget-hard-cap:${tier}] OVERRIDE APPLIED ("${overrideReason}") — $${cost.toFixed(2)} > $${cap.toFixed(2)} cap`, + ); + return; + } + throw new Error( + `Eval cost exceeded hard cap for tier ${tier}: ` + + `$${cost.toFixed(2)} > $${cap.toFixed(2)}. ` + + `Set EVALS_BUDGET_OVERRIDE_REASON="why this is OK" to allow + audit. ` + + `Per-tier override: EVALS_BUDGET_HARD_CAP_${tier === 'e2e' ? 'GATE' : 'PERIODIC'}=. 
` + + `Run: ${latest.filepath}`, + ); +} + describe('tool budget regression (gate, free)', () => { test('no e2e test exceeds 2× prior tool calls or turns', () => { checkTier('e2e'); @@ -145,4 +200,13 @@ describe('tool budget regression (gate, free)', () => { test('no llm-judge test exceeds 2× prior tool calls or turns', () => { checkTier('llm-judge'); }); + + // T5: hard dollar cap on per-run cost (different from regression ratio above) + test('e2e run cost ≤ EVALS_BUDGET_HARD_CAP_GATE', () => { + checkHardCap('e2e'); + }); + + test('llm-judge run cost ≤ EVALS_BUDGET_HARD_CAP_PERIODIC', () => { + checkHardCap('llm-judge'); + }); }); diff --git a/test/skill-size-budget.test.ts b/test/skill-size-budget.test.ts new file mode 100644 index 000000000..41aef41a2 --- /dev/null +++ b/test/skill-size-budget.test.ts @@ -0,0 +1,151 @@ +/** + * Per-skill SKILL.md size budget regression (v1.45.0.0 T5). + * + * Asserts that no skill's generated SKILL.md grew beyond the v1.44.1 + * baseline by more than the configured ratio. Catches preamble/resolver changes that bloat skills back to + * the pre-compression size. Free — pure file IO + JSON diff. + * + * Why a separate test from skill-budget-regression.test.ts: that one + * compares LIVE eval runs (tool calls, turns, cost); this one compares + * static SKILL.md sizes. Both gate-tier. + * + * The baseline lives at test/fixtures/parity-baseline-v1.44.1.json, + * captured by scripts/capture-baseline.ts before any Phase A work landed. + * + * Override: + * - GSTACK_SIZE_BUDGET_RATIO=N sets the per-skill and total-corpus regression ratio. + * Default 1.05 (5% growth tolerance). Set to 1.10 to permit 10% growth + * (e.g., during deliberate feature additions that the catalog trim + * doesn't offset). + * - GSTACK_SIZE_BUDGET_OVERRIDE_REASON="text" allows a regression to + * pass and logs the reason to ~/.gstack/analytics/spend-overrides.jsonl + * for audit. Use sparingly; the next baseline should bake in the new + * size. 
+ */ + +import { describe, test, expect } from 'bun:test'; +import * as fs from 'fs'; +import * as path from 'path'; +import { captureBaseline, type ParityBaseline } from './helpers/capture-parity-baseline'; +import { logBudgetOverride } from './helpers/budget-override'; + +const REPO_ROOT = path.resolve(import.meta.dir, '..'); +const BASELINE_PATH = path.join(REPO_ROOT, 'test', 'fixtures', 'parity-baseline-v1.44.1.json'); + +// Default per-skill ratio is 1.05 (5% growth tolerance). T4 catalog trim +// MOVES text from frontmatter (always-loaded catalog) to a body section +// ("## When to invoke"), so small skills with already-short descriptions +// see a tiny body growth from the section header itself (~20 bytes). The +// 5% per-skill tolerance accommodates that while still catching real bloat; +// the always-loaded catalog cost is enforced separately with a hard ceiling. +const DEFAULT_RATIO = 1.05; +const RATIO = Number(process.env.GSTACK_SIZE_BUDGET_RATIO) || DEFAULT_RATIO; + +interface Regression { + skill: string; + beforeBytes: number; + afterBytes: number; + growth: number; +} + +describe('SKILL.md size budget regression (gate, free)', () => { + test('parity-baseline-v1.44.1.json exists', () => { + expect(fs.existsSync(BASELINE_PATH)).toBe(true); + }); + + test('no skill exceeds v1.44.1 baseline size × ratio', () => { + const baseline: ParityBaseline = JSON.parse(fs.readFileSync(BASELINE_PATH, 'utf-8')); + const current = captureBaseline({ repoRoot: REPO_ROOT }); + + const regressions: Regression[] = []; + for (const [skill, before] of Object.entries(baseline.skills)) { + const after = current.skills[skill]; + if (!after) continue; // skill removed since v1.44 — not a regression + if (after.skillMdBytes <= before.skillMdBytes * RATIO) continue; + regressions.push({ + skill, + beforeBytes: before.skillMdBytes, + afterBytes: after.skillMdBytes, + growth: after.skillMdBytes / before.skillMdBytes, + }); + } + + if (regressions.length === 0) return; + + const 
overrideReason = process.env.GSTACK_SIZE_BUDGET_OVERRIDE_REASON?.trim(); + if (overrideReason) { + logBudgetOverride({ + scope: 'skill-size-budget', + reason: overrideReason, + details: { ratio: RATIO, regressions }, + }); + // eslint-disable-next-line no-console + console.warn( + `[skill-size-budget] OVERRIDE APPLIED (${overrideReason}) — ${regressions.length} regression(s) allowed:`, + ); + for (const r of regressions) { + // eslint-disable-next-line no-console + console.warn(` ${r.skill}: ${r.beforeBytes} → ${r.afterBytes} bytes (×${r.growth.toFixed(2)})`); + } + return; + } + + const msg = regressions.map(r => + ` ${r.skill}: ${r.beforeBytes} → ${r.afterBytes} bytes (×${r.growth.toFixed(2)})`, + ).join('\n'); + throw new Error( + `${regressions.length} skill(s) regressed past v1.44.1 baseline × ${RATIO}:\n${msg}\n` + + `Override: set GSTACK_SIZE_BUDGET_OVERRIDE_REASON="why this is OK" to allow and audit-log.`, + ); + }); + + test('total corpus byte count does not regress past baseline × ratio', () => { + const baseline: ParityBaseline = JSON.parse(fs.readFileSync(BASELINE_PATH, 'utf-8')); + const current = captureBaseline({ repoRoot: REPO_ROOT }); + const ratio = current.totalCorpusBytes / baseline.totalCorpusBytes; + if (current.totalCorpusBytes <= baseline.totalCorpusBytes * RATIO) { + // eslint-disable-next-line no-console + console.log( + `[skill-size-budget] corpus OK: ${baseline.totalCorpusBytes} → ${current.totalCorpusBytes} bytes (×${ratio.toFixed(3)})`, + ); + return; + } + const overrideReason = process.env.GSTACK_SIZE_BUDGET_OVERRIDE_REASON?.trim(); + if (overrideReason) { + logBudgetOverride({ + scope: 'skill-size-budget-corpus', + reason: overrideReason, + details: { ratio: RATIO, observed: ratio, before: baseline.totalCorpusBytes, after: current.totalCorpusBytes }, + }); + return; + } + throw new Error( + `Total corpus regressed past v1.44.1 baseline × ${RATIO}: ` + + `${baseline.totalCorpusBytes} → ${current.totalCorpusBytes} bytes 
(×${ratio.toFixed(3)}). ` + + `Override: set GSTACK_SIZE_BUDGET_OVERRIDE_REASON to allow.`, + ); + }); + + test('catalog token estimate stays compressed (v1.45 target ≤ 7000)', () => { + const current = captureBaseline({ repoRoot: REPO_ROOT }); + const v145Target = 7000; + if (current.estTotalCatalogTokens <= v145Target) { + // eslint-disable-next-line no-console + console.log(`[skill-size-budget] catalog OK: ~${current.estTotalCatalogTokens} tokens (target ≤${v145Target})`); + return; + } + const overrideReason = process.env.GSTACK_SIZE_BUDGET_OVERRIDE_REASON?.trim(); + if (overrideReason) { + logBudgetOverride({ + scope: 'skill-size-budget-catalog', + reason: overrideReason, + details: { target: v145Target, observed: current.estTotalCatalogTokens }, + }); + return; + } + throw new Error( + `Catalog token estimate regressed past v1.45 target: ${current.estTotalCatalogTokens} tokens > ${v145Target}. ` + + `T4 catalog trim should keep this under control. Override: set GSTACK_SIZE_BUDGET_OVERRIDE_REASON to allow.`, + ); + }); +});