diff --git a/test/helpers/budget-override.ts b/test/helpers/budget-override.ts
new file mode 100644
index 000000000..fefe2c790
--- /dev/null
+++ b/test/helpers/budget-override.ts
@@ -0,0 +1,65 @@
+/**
+ * Budget override audit trail (v1.45.0.0 T5).
+ *
+ * Records uses of GSTACK_SIZE_BUDGET_OVERRIDE_REASON or
+ * EVALS_BUDGET_OVERRIDE_REASON so a reviewer can see what was waived,
+ * where it ran (CI flag, runner, branch, commit), and why. Append-only
+ * JSONL at ~/.gstack/analytics/spend-overrides.jsonl.
+ *
+ * Why audit: a hard cap with no escape valve becomes operationally hostile
+ * (legit price changes, longer transcripts, new required evals can all
+ * blow the cap). An escape valve with no audit becomes "everyone overrides
+ * everything and we lose the gate." This module is the audit half.
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+
+/** One applied budget override, as reported by the test that waived the check. */
+export interface BudgetOverrideEntry {
+  scope: string; // e.g. 'skill-size-budget', 'evals-cost-cap'
+  reason: string; // user-supplied REASON env var
+  details?: Record<string, unknown>; // numbers / regressions
+}
+
+/** Audit file location: under $GSTACK_HOME if set, otherwise under ~/.gstack. */
+function getAuditPath(): string {
+  const base = process.env.GSTACK_HOME || path.join(os.homedir(), '.gstack');
+  return path.join(base, 'analytics', 'spend-overrides.jsonl');
+}
+
+/**
+ * Appends one JSON line describing an applied budget override to the audit log,
+ * creating the parent directory if needed.
+ *
+ * Best-effort: errors raised while creating the directory, serializing, or
+ * appending are caught and reported via console.warn instead of propagating,
+ * so a failed audit write does not fail the calling test.
+ *
+ * @param entry scope, reason, and optional details of the waived check
+ */
+export function logBudgetOverride(entry: BudgetOverrideEntry): void {
+  try {
+    const auditPath = getAuditPath();
+    fs.mkdirSync(path.dirname(auditPath), { recursive: true });
+    const line = JSON.stringify({
+      timestamp: new Date().toISOString(),
+      scope: entry.scope,
+      reason: entry.reason,
+      details: entry.details ?? {},
+      // Capture provenance: CI flag, runner, branch, commit
+      ci: process.env.CI === 'true',
+      runner: process.env.GITHUB_ACTIONS ? 'github-actions' : process.env.CI_RUNNER || 'local',
+      branch: process.env.GITHUB_REF_NAME || process.env.CI_COMMIT_REF_NAME || 'unknown',
+      commit: process.env.GITHUB_SHA?.slice(0, 8) || process.env.CI_COMMIT_SHORT_SHA || 'unknown',
+    }) + '\n';
+    fs.appendFileSync(auditPath, line);
+  } catch (err) {
+    // Best-effort logging; don't fail the test on audit-write errors.
+    // A thrown value is not guaranteed to be an Error, so narrow before reading .message.
+    const message = err instanceof Error ? err.message : String(err);
+    // eslint-disable-next-line no-console
+    console.warn(`[budget-override] could not write audit log: ${message}`);
+  }
+}
diff --git a/test/skill-budget-regression.test.ts b/test/skill-budget-regression.test.ts
index 651f09180..494ac6781 100644
--- a/test/skill-budget-regression.test.ts
+++ b/test/skill-budget-regression.test.ts
@@ -35,6 +35,29 @@ import {
   assertNoBudgetRegression,
   type EvalResult,
 } from './helpers/eval-store';
+import { logBudgetOverride } from './helpers/budget-override';
+
+/**
+ * v1.45.0.0 T5 — hard eval cost cap.
+ *
+ * Caps (override via env):
+ *   EVALS_BUDGET_HARD_CAP           umbrella cap; default $30/run
+ *   EVALS_BUDGET_HARD_CAP_GATE      e2e cap; defaults to the umbrella cap
+ *   EVALS_BUDGET_HARD_CAP_PERIODIC  llm-judge cap; defaults to max($70, umbrella cap)
+ *   EVALS_BUDGET_OVERRIDE_REASON    if set, override fires AND audit-logs to
+ *                                   ~/.gstack/analytics/spend-overrides.jsonl
+ *
+ * An unset, non-numeric, or zero value falls back to its default.
+ *
+ * Caps are dollars-per-run, not dollars-per-test. A test that legitimately
+ * gets more expensive should bake into the baseline; a runaway eval (infinite
+ * retry, model price change) gets stopped here.
+ */
+const DEFAULT_HARD_CAP_USD = Number(process.env.EVALS_BUDGET_HARD_CAP) || 30;
+const TIER_CAPS: Record<'e2e' | 'llm-judge', number> = {
+  e2e: Number(process.env.EVALS_BUDGET_HARD_CAP_GATE) || DEFAULT_HARD_CAP_USD,
+  'llm-judge': Number(process.env.EVALS_BUDGET_HARD_CAP_PERIODIC) || Math.max(70, DEFAULT_HARD_CAP_USD),
+};
 
 function currentGitBranch(): string {
   try {
@@ -137,6 +160,49 @@ function checkTier(tier: 'e2e' | 'llm-judge'): void {
   );
 }
 
+/**
+ * Enforce a hard dollar cap on per-run eval cost.
+ *
+ * Reads total_cost_usd from the run returned by findLatestRun for `tier` and
+ * compares it with TIER_CAPS[tier]; returns silently when no run is found.
+ * When the cost is over the cap, a non-blank EVALS_BUDGET_OVERRIDE_REASON
+ * audit-logs the override and downgrades the failure to a warning.
+ *
+ * @throws Error when the cost exceeds the cap and no override reason is set
+ */
+function checkHardCap(tier: 'e2e' | 'llm-judge'): void {
+  const evalDir = getProjectEvalDir();
+  const latest = findLatestRun(evalDir, tier);
+  if (!latest) return;
+  const cap = TIER_CAPS[tier];
+  const cost = latest.result.total_cost_usd;
+  if (cost <= cap) {
+    // eslint-disable-next-line no-console
+    console.log(`[budget-hard-cap:${tier}] OK — $${cost.toFixed(2)} ≤ $${cap.toFixed(2)} cap`);
+    return;
+  }
+  const overrideReason = process.env.EVALS_BUDGET_OVERRIDE_REASON?.trim();
+  if (overrideReason) {
+    logBudgetOverride({
+      scope: `evals-cost-cap-${tier}`,
+      reason: overrideReason,
+      details: { tier, cap, observed_cost_usd: cost, run_file: latest.filepath },
+    });
+    // eslint-disable-next-line no-console
+    console.warn(
+      `[budget-hard-cap:${tier}] OVERRIDE APPLIED ("${overrideReason}") — $${cost.toFixed(2)} > $${cap.toFixed(2)} cap`,
+    );
+    return;
+  }
+  throw new Error(
+    `Eval cost exceeded hard cap for tier ${tier}: ` +
+      `$${cost.toFixed(2)} > $${cap.toFixed(2)}. ` +
+      `Set EVALS_BUDGET_OVERRIDE_REASON="why this is OK" to allow + audit. ` +
+      `Per-tier override: EVALS_BUDGET_HARD_CAP_${tier === 'e2e' ? 'GATE' : 'PERIODIC'}=<usd>. ` +
+      `Run: ${latest.filepath}`,
+  );
+}
+
 describe('tool budget regression (gate, free)', () => {
   test('no e2e test exceeds 2× prior tool calls or turns', () => {
     checkTier('e2e');
@@ -145,4 +211,13 @@ describe('tool budget regression (gate, free)', () => {
   test('no llm-judge test exceeds 2× prior tool calls or turns', () => {
     checkTier('llm-judge');
   });
+
+  // T5: hard dollar cap on per-run cost (different from regression ratio above)
+  test('e2e run cost ≤ EVALS_BUDGET_HARD_CAP_GATE', () => {
+    checkHardCap('e2e');
+  });
+
+  test('llm-judge run cost ≤ EVALS_BUDGET_HARD_CAP_PERIODIC', () => {
+    checkHardCap('llm-judge');
+  });
 });
diff --git a/test/skill-size-budget.test.ts b/test/skill-size-budget.test.ts
new file mode 100644
index 000000000..41aef41a2
--- /dev/null
+++ b/test/skill-size-budget.test.ts
@@ -0,0 +1,167 @@
+/**
+ * Per-skill SKILL.md size budget regression (v1.45.0.0 T5).
+ *
+ * Asserts that no skill's generated SKILL.md grew beyond the v1.44.1
+ * baseline × the allowed ratio. Catches preamble/resolver changes that
+ * bloat skills back to the pre-compression size. Free — pure file IO +
+ * JSON diff.
+ *
+ * Why a separate test from skill-budget-regression.test.ts: that one
+ * compares LIVE eval runs (tool calls, turns, cost); this one compares
+ * static SKILL.md sizes. Both gate-tier.
+ *
+ * The baseline lives at test/fixtures/parity-baseline-v1.44.1.json,
+ * captured by scripts/capture-baseline.ts before any Phase A work landed.
+ *
+ * Override:
+ *   - GSTACK_SIZE_BUDGET_RATIO=<number> changes the regression ratio used by
+ *     the per-skill and total-corpus checks. Default 1.05 (5% growth
+ *     tolerance). Set to 1.0 to allow no growth, or to 1.10 to permit 10%
+ *     growth (e.g., during deliberate feature additions that the catalog
+ *     trim doesn't offset).
+ *   - GSTACK_SIZE_BUDGET_OVERRIDE_REASON="text" allows a regression to
+ *     pass and logs the reason to ~/.gstack/analytics/spend-overrides.jsonl
+ *     for audit. Use sparingly; the next baseline should bake in the new
+ *     size.
+ */
+
+import { describe, test, expect } from 'bun:test';
+import * as fs from 'fs';
+import * as path from 'path';
+import { captureBaseline, type ParityBaseline } from './helpers/capture-parity-baseline';
+import { logBudgetOverride } from './helpers/budget-override';
+
+const REPO_ROOT = path.resolve(import.meta.dir, '..');
+const BASELINE_PATH = path.join(REPO_ROOT, 'test', 'fixtures', 'parity-baseline-v1.44.1.json');
+
+// Default per-skill ratio is 1.05 (5% growth tolerance). T4 catalog trim
+// MOVES text from frontmatter (always-loaded catalog) to a body section
+// ("## When to invoke"), so small skills with already-short descriptions
+// see a tiny body growth from the section header itself (~20 bytes). The
+// 5% per-skill tolerance accommodates that while still catching real bloat;
+// the always-loaded catalog cost is enforced separately with a hard ceiling.
+const DEFAULT_RATIO = 1.05;
+const RATIO = Number(process.env.GSTACK_SIZE_BUDGET_RATIO) || DEFAULT_RATIO;
+
+/** One skill whose SKILL.md exceeds its baseline size × RATIO. */
+interface Regression {
+  skill: string;
+  beforeBytes: number;
+  afterBytes: number;
+  growth: number;
+}
+
+/** Renders one regression as a report line; shared by the warning and failure output. */
+function formatRegression(r: Regression): string {
+  return ` ${r.skill}: ${r.beforeBytes} → ${r.afterBytes} bytes (×${r.growth.toFixed(2)})`;
+}
+
+describe('SKILL.md size budget regression (gate, free)', () => {
+  test('parity-baseline-v1.44.1.json exists', () => {
+    expect(fs.existsSync(BASELINE_PATH)).toBe(true);
+  });
+
+  test('no skill exceeds v1.44.1 baseline size × ratio', () => {
+    const baseline: ParityBaseline = JSON.parse(fs.readFileSync(BASELINE_PATH, 'utf-8'));
+    const current = captureBaseline({ repoRoot: REPO_ROOT });
+
+    const regressions: Regression[] = [];
+    for (const [skill, before] of Object.entries(baseline.skills)) {
+      const after = current.skills[skill];
+      if (!after) continue; // skill removed since v1.44 — not a regression
+      if (after.skillMdBytes <= before.skillMdBytes * RATIO) continue;
+      regressions.push({
+        skill,
+        beforeBytes: before.skillMdBytes,
+        afterBytes: after.skillMdBytes,
+        growth: after.skillMdBytes / before.skillMdBytes,
+      });
+    }
+
+    if (regressions.length === 0) return;
+
+    const overrideReason = process.env.GSTACK_SIZE_BUDGET_OVERRIDE_REASON?.trim();
+    if (overrideReason) {
+      logBudgetOverride({
+        scope: 'skill-size-budget',
+        reason: overrideReason,
+        details: { ratio: RATIO, regressions },
+      });
+      // eslint-disable-next-line no-console
+      console.warn(
+        `[skill-size-budget] OVERRIDE APPLIED (${overrideReason}) — ${regressions.length} regression(s) allowed:`,
+      );
+      for (const r of regressions) {
+        // eslint-disable-next-line no-console
+        console.warn(formatRegression(r));
+      }
+      return;
+    }
+
+    const msg = regressions.map(formatRegression).join('\n');
+    throw new Error(
+      `${regressions.length} skill(s) regressed past v1.44.1 baseline × ${RATIO}:\n${msg}\n` +
+        `Override: set GSTACK_SIZE_BUDGET_OVERRIDE_REASON="why this is OK" to allow and audit-log.`,
+    );
+  });
+
+  test('total corpus byte count does not regress past baseline × ratio', () => {
+    const baseline: ParityBaseline = JSON.parse(fs.readFileSync(BASELINE_PATH, 'utf-8'));
+    const current = captureBaseline({ repoRoot: REPO_ROOT });
+    const ratio = current.totalCorpusBytes / baseline.totalCorpusBytes;
+    if (current.totalCorpusBytes <= baseline.totalCorpusBytes * RATIO) {
+      // eslint-disable-next-line no-console
+      console.log(
+        `[skill-size-budget] corpus OK: ${baseline.totalCorpusBytes} → ${current.totalCorpusBytes} bytes (×${ratio.toFixed(3)})`,
+      );
+      return;
+    }
+    const overrideReason = process.env.GSTACK_SIZE_BUDGET_OVERRIDE_REASON?.trim();
+    if (overrideReason) {
+      logBudgetOverride({
+        scope: 'skill-size-budget-corpus',
+        reason: overrideReason,
+        details: { ratio: RATIO, observed: ratio, before: baseline.totalCorpusBytes, after: current.totalCorpusBytes },
+      });
+      // Surface the waiver in test output, matching the per-skill override path.
+      // eslint-disable-next-line no-console
+      console.warn(
+        `[skill-size-budget] OVERRIDE APPLIED (${overrideReason}) — corpus ${baseline.totalCorpusBytes} → ${current.totalCorpusBytes} bytes (×${ratio.toFixed(3)})`,
+      );
+      return;
+    }
+    throw new Error(
+      `Total corpus regressed past v1.44.1 baseline × ${RATIO}: ` +
+        `${baseline.totalCorpusBytes} → ${current.totalCorpusBytes} bytes (×${ratio.toFixed(3)}). ` +
+        `Override: set GSTACK_SIZE_BUDGET_OVERRIDE_REASON to allow.`,
+    );
+  });
+
+  test('catalog token estimate stays compressed (v1.45 target ≤ 7000)', () => {
+    const current = captureBaseline({ repoRoot: REPO_ROOT });
+    const v145Target = 7000;
+    if (current.estTotalCatalogTokens <= v145Target) {
+      // eslint-disable-next-line no-console
+      console.log(`[skill-size-budget] catalog OK: ~${current.estTotalCatalogTokens} tokens (target ≤${v145Target})`);
+      return;
+    }
+    const overrideReason = process.env.GSTACK_SIZE_BUDGET_OVERRIDE_REASON?.trim();
+    if (overrideReason) {
+      logBudgetOverride({
+        scope: 'skill-size-budget-catalog',
+        reason: overrideReason,
+        details: { target: v145Target, observed: current.estTotalCatalogTokens },
+      });
+      // Surface the waiver in test output, matching the per-skill override path.
+      // eslint-disable-next-line no-console
+      console.warn(
+        `[skill-size-budget] OVERRIDE APPLIED (${overrideReason}) — catalog ~${current.estTotalCatalogTokens} tokens > target ${v145Target}`,
+      );
+      return;
+    }
+    throw new Error(
+      `Catalog token estimate regressed past v1.45 target: ${current.estTotalCatalogTokens} tokens > ${v145Target}. ` +
+        `T4 catalog trim should keep this under control. Override: set GSTACK_SIZE_BUDGET_OVERRIDE_REASON to allow.`,
+    );
+  });
+});