diff --git a/test/gen-skill-docs-idempotency.test.ts b/test/gen-skill-docs-idempotency.test.ts new file mode 100644 index 000000000..489a6af1c --- /dev/null +++ b/test/gen-skill-docs-idempotency.test.ts @@ -0,0 +1,110 @@ +/** + * Idempotency test for gen-skill-docs (regression for v1.45.0.0 timestamp flap). + * + * Running `bun run gen:skill-docs` twice in a row must produce a no-op on + * the second run: every output file is byte-identical to itself. Without + * this gate, CI freshness checks flap whenever someone introduces a + * timestamp, a random seed, or any other non-deterministic field into a + * generated artifact. + * + * v1.45.0.0 shipped with a `generated_at` ISO timestamp in + * scripts/proactive-suggestions.json that updated every run. CI freshness + * checks failed because the committed file's timestamp never matched the + * latest gen. Fixed in 43e18af4 — this test pins the contract going forward. + * + * The test pays a small cost (~2 gen-skill-docs invocations, ~3s total) but + * catches a class of bugs that's invisible until CI fails. + */ + +import { describe, test, expect } from 'bun:test'; +import { spawnSync } from 'child_process'; +import * as fs from 'fs'; +import * as path from 'path'; + +const REPO_ROOT = path.resolve(import.meta.dir, '..'); + +/** Files that gen-skill-docs writes and that must be byte-stable across runs. */ +const STABLE_OUTPUTS = [ + 'scripts/proactive-suggestions.json', + 'SKILL.md', + 'ship/SKILL.md', + 'plan-ceo-review/SKILL.md', + 'office-hours/SKILL.md', + 'gstack/llms.txt', +]; + +function runGen(): { exitCode: number; stderr: string } { + const result = spawnSync('bun', ['run', 'gen:skill-docs'], { + cwd: REPO_ROOT, + stdio: ['ignore', 'pipe', 'pipe'], + timeout: 60_000, + }); + return { + exitCode: result.status ?? -1, + stderr: result.stderr?.toString() ?? '', + }; +} + +function snapshot(): Map { + const m = new Map(); + for (const rel of STABLE_OUTPUTS) { + const full = path.join(REPO_ROOT, rel); + if (fs.existsSync(full)) { + m.set(rel, fs.readFileSync(full, 'utf-8')); + } + } + return m; +} + +describe('gen-skill-docs idempotency', () => { + test('two consecutive runs produce byte-identical outputs (no flapping fields)', () => { + const firstRun = runGen(); + expect(firstRun.exitCode).toBe(0); + + const after1 = snapshot(); + expect(after1.size).toBeGreaterThan(0); + + const secondRun = runGen(); + expect(secondRun.exitCode).toBe(0); + + const after2 = snapshot(); + + // Compare each stable output byte-for-byte. + const flapping: string[] = []; + for (const [file, before] of after1.entries()) { + const now = after2.get(file); + if (now !== before) flapping.push(file); + } + + if (flapping.length > 0) { + throw new Error( + `${flapping.length} file(s) changed between two consecutive gen-skill-docs runs (flapping):\n` + + flapping.map(f => ` - ${f}`).join('\n') + + `\nLikely cause: a non-deterministic field (timestamp, random ID, ` + + `filesystem-iteration order) leaked into the generated output. CI freshness ` + + `checks (git diff --exit-code) will fail unpredictably until this is fixed.`, + ); + } + }, 180_000); // ~2 min budget for two gen runs + + test('--dry-run after a fresh gen reports zero stale files', () => { + // Pre-condition: working tree gen must be fresh (idempotency test above ran first). + // If a contributor introduces a non-deterministic field, this dry-run reports STALE. + const result = spawnSync('bun', ['run', 'gen:skill-docs', '--dry-run'], { + cwd: REPO_ROOT, + stdio: ['ignore', 'pipe', 'pipe'], + timeout: 60_000, + }); + expect(result.status).toBe(0); + const stdout = result.stdout?.toString() ?? ''; + // STALE: prefix means a file would change. Count them. + const staleLines = stdout.split('\n').filter(l => l.startsWith('STALE:')); + if (staleLines.length > 0) { + throw new Error( + `--dry-run reports ${staleLines.length} stale file(s) after a fresh gen:\n` + + staleLines.map(l => ` ${l}`).join('\n') + + `\nRun \`bun run gen:skill-docs\` and commit the result.`, + ); + } + }, 90_000); +}); diff --git a/test/helpers/budget-override.test.ts b/test/helpers/budget-override.test.ts new file mode 100644 index 000000000..e420c3892 --- /dev/null +++ b/test/helpers/budget-override.test.ts @@ -0,0 +1,116 @@ +/** + * Unit tests for budget-override audit logger. + * + * The audit trail is the only check on `EVALS_BUDGET_OVERRIDE_REASON` and + * `GSTACK_SIZE_BUDGET_OVERRIDE_REASON` — if the logger silently drops events, + * overrides become invisible and the budget gates are theater. These tests + * pin the contract: every override produces exactly one JSONL line with + * timestamp + scope + reason + CI provenance. + */ + +import { describe, test, expect, beforeEach } from 'bun:test'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { logBudgetOverride } from './budget-override'; + +const TMP_HOME = fs.mkdtempSync(path.join(os.tmpdir(), 'budget-override-test-')); +process.env.GSTACK_HOME = TMP_HOME; +const AUDIT_PATH = path.join(TMP_HOME, 'analytics', 'spend-overrides.jsonl'); + +describe('logBudgetOverride', () => { + beforeEach(() => { + // Start each test with a clean audit file + try { fs.unlinkSync(AUDIT_PATH); } catch { /* doesn't exist */ } + }); + + test('writes one JSONL line per call with required fields', () => { + logBudgetOverride({ + scope: 'evals-cost-cap-e2e', + reason: 'model price went up, will rebase the cap next sprint', + details: { tier: 'e2e', cap: 25, observed_cost_usd: 31.4 }, + }); + + expect(fs.existsSync(AUDIT_PATH)).toBe(true); + const lines = fs.readFileSync(AUDIT_PATH, 'utf-8').split('\n').filter(Boolean); + expect(lines.length).toBe(1); + const entry = JSON.parse(lines[0]!); + expect(entry.scope).toBe('evals-cost-cap-e2e'); + expect(entry.reason).toBe('model price went up, will rebase the cap next sprint'); + expect(entry.details).toEqual({ tier: 'e2e', cap: 25, observed_cost_usd: 31.4 }); + expect(typeof entry.timestamp).toBe('string'); + expect(entry.timestamp).toMatch(/^\d{4}-\d{2}-\d{2}T/); + }); + + test('captures CI provenance when CI env is set', () => { + process.env.CI = 'true'; + process.env.GITHUB_ACTIONS = 'true'; + process.env.GITHUB_REF_NAME = 'feature/x'; + process.env.GITHUB_SHA = 'deadbeefcafe1234'; + + logBudgetOverride({ scope: 'skill-size-budget', reason: 'big diff bake-in' }); + + const entry = JSON.parse(fs.readFileSync(AUDIT_PATH, 'utf-8').trim()); + expect(entry.ci).toBe(true); + expect(entry.runner).toBe('github-actions'); + expect(entry.branch).toBe('feature/x'); + expect(entry.commit).toBe('deadbeef'); + + delete process.env.CI; + delete process.env.GITHUB_ACTIONS; + delete process.env.GITHUB_REF_NAME; + delete process.env.GITHUB_SHA; + }); + + test('defaults provenance to local when CI is unset', () => { + delete process.env.CI; + delete process.env.GITHUB_ACTIONS; + delete process.env.GITHUB_REF_NAME; + delete process.env.GITHUB_SHA; + delete process.env.CI_RUNNER; + delete process.env.CI_COMMIT_REF_NAME; + delete process.env.CI_COMMIT_SHORT_SHA; + + logBudgetOverride({ scope: 'skill-size-budget-corpus', reason: 'local dev test' }); + + const entry = JSON.parse(fs.readFileSync(AUDIT_PATH, 'utf-8').trim()); + expect(entry.ci).toBe(false); + expect(entry.runner).toBe('local'); + expect(entry.branch).toBe('unknown'); + expect(entry.commit).toBe('unknown'); + }); + + test('append-only: multiple calls produce multiple lines', () => { + logBudgetOverride({ scope: 's1', reason: 'r1' }); + logBudgetOverride({ scope: 's2', reason: 'r2' }); + logBudgetOverride({ scope: 's3', reason: 'r3' }); + + const lines = fs.readFileSync(AUDIT_PATH, 'utf-8').split('\n').filter(Boolean); + expect(lines.length).toBe(3); + const scopes = lines.map(l => JSON.parse(l).scope); + expect(scopes).toEqual(['s1', 's2', 's3']); + }); + + test('omits details key when entry.details is absent (uses empty object)', () => { + logBudgetOverride({ scope: 'plain', reason: 'no details' }); + const entry = JSON.parse(fs.readFileSync(AUDIT_PATH, 'utf-8').trim()); + expect(entry.details).toEqual({}); + }); + + test('never throws even when audit directory is missing — creates it', () => { + // Remove the analytics dir to force mkdir + try { fs.rmSync(path.join(TMP_HOME, 'analytics'), { recursive: true, force: true }); } catch { /* */ } + expect(() => logBudgetOverride({ scope: 'recreate', reason: 'test' })).not.toThrow(); + expect(fs.existsSync(AUDIT_PATH)).toBe(true); + }); + + test('survives an unwritable audit path (logs warning, does not throw)', () => { + // Point GSTACK_HOME at a path inside a file (illegal directory location) + const originalHome = process.env.GSTACK_HOME; + const bogusFile = path.join(TMP_HOME, 'not-a-dir.txt'); + fs.writeFileSync(bogusFile, 'just a file'); + process.env.GSTACK_HOME = bogusFile; + expect(() => logBudgetOverride({ scope: 'unwritable', reason: 'fs error path' })).not.toThrow(); + process.env.GSTACK_HOME = originalHome; + }); +}); diff --git a/test/terse-build.test.ts b/test/terse-build.test.ts new file mode 100644 index 000000000..ca531aeae --- /dev/null +++ b/test/terse-build.test.ts @@ -0,0 +1,151 @@ +/** + * Unit tests for the terse-build flag (v1.46.0.0 T3). + * + * `--explain-level=terse` makes the gen-skill-docs pipeline drop 4 preamble + * sections at gen time. Default builds keep them. Without these tests, a + * refactor that breaks the explainLevel threading silently regresses one + * of the opt-in compression paths — the runtime EXPLAIN_LEVEL: terse runtime + * gate still works, so users wouldn't notice immediately. + * + * Pure-function tests against the resolvers — fast, free, no subprocess. + */ + +import { describe, test, expect } from 'bun:test'; +import type { TemplateContext } from '../scripts/resolvers/types'; +import { generateWritingStyle } from '../scripts/resolvers/preamble/generate-writing-style'; +import { generateCompletenessSection } from '../scripts/resolvers/preamble/generate-completeness-section'; +import { generateConfusionProtocol } from '../scripts/resolvers/preamble/generate-confusion-protocol'; +import { generateContextHealth } from '../scripts/resolvers/preamble/generate-context-health'; +import { generatePreamble } from '../scripts/resolvers/preamble'; + +function makeCtx(explainLevel?: 'default' | 'terse', tier: number = 4): TemplateContext { + return { + skillName: 'test-skill', + tmplPath: '/tmp/test/SKILL.md.tmpl', + host: 'claude', + paths: { + skillRoot: '~/.claude/skills/gstack', + localSkillRoot: '.claude/skills', + binDir: '~/.claude/skills/gstack/bin', + browseDir: '~/.claude/skills/gstack/browse/dist', + designDir: '~/.claude/skills/gstack/design/dist', + makePdfDir: '~/.claude/skills/gstack/make-pdf/dist', + }, + preambleTier: tier, + explainLevel, + }; +} + +describe('terse build — per-resolver behavior', () => { + describe('generateWritingStyle', () => { + test('default: emits full section with jargon-list pointer', () => { + const out = generateWritingStyle(makeCtx('default')); + expect(out).toContain('## Writing Style'); + expect(out).toContain('jargon-list.json'); + expect(out).toContain('Curated jargon list'); + expect(out).toContain('outcome'); + }); + + test('terse: emits one-line terse directive only', () => { + const out = generateWritingStyle(makeCtx('terse')); + expect(out).toContain('## Writing Style'); + expect(out).toContain('Terse mode (build-time)'); + // Negative: NONE of the default-mode prose + expect(out).not.toContain('jargon-list.json'); + expect(out).not.toContain('Curated jargon list'); + expect(out).not.toContain('Frame questions in outcome terms'); + }); + + test('terse is meaningfully shorter than default', () => { + const fullLen = generateWritingStyle(makeCtx('default')).length; + const terseLen = generateWritingStyle(makeCtx('terse')).length; + expect(terseLen).toBeLessThan(fullLen / 3); + }); + }); + + describe('generateCompletenessSection', () => { + test('default: emits full section with Boil-the-Lake prose', () => { + const out = generateCompletenessSection(makeCtx('default')); + expect(out).toContain('## Completeness Principle'); + expect(out).toContain('Boil the Lake'); + }); + + test('terse: returns empty string', () => { + expect(generateCompletenessSection(makeCtx('terse'))).toBe(''); + }); + + test('no ctx arg: defaults to non-terse (back-compat with old callers)', () => { + const out = generateCompletenessSection(); + expect(out).toContain('## Completeness Principle'); + }); + }); + + describe('generateConfusionProtocol', () => { + test('default: emits full section', () => { + const out = generateConfusionProtocol(makeCtx('default')); + expect(out).toContain('## Confusion Protocol'); + expect(out).toContain('high-stakes ambiguity'); + }); + + test('terse: returns empty string', () => { + expect(generateConfusionProtocol(makeCtx('terse'))).toBe(''); + }); + + test('no ctx arg: defaults to non-terse', () => { + expect(generateConfusionProtocol()).toContain('## Confusion Protocol'); + }); + }); + + describe('generateContextHealth', () => { + test('default: emits full section', () => { + const out = generateContextHealth(makeCtx('default')); + expect(out).toContain('## Context Health'); + expect(out).toContain('PROGRESS'); + }); + + test('terse: returns empty string', () => { + expect(generateContextHealth(makeCtx('terse'))).toBe(''); + }); + }); +}); + +describe('terse build — generatePreamble integration', () => { + test('default tier-2 preamble includes all 4 terse-gated sections', () => { + const out = generatePreamble(makeCtx('default', 2)); + expect(out).toContain('## Writing Style'); + expect(out).toContain('## Completeness Principle'); + expect(out).toContain('## Confusion Protocol'); + expect(out).toContain('## Context Health'); + }); + + test('terse tier-2 preamble drops 3 of 4 sections + collapses Writing Style', () => { + const out = generatePreamble(makeCtx('terse', 2)); + // Writing Style heading still present (collapsed to one line) + expect(out).toContain('## Writing Style'); + expect(out).toContain('Terse mode (build-time)'); + // Three sections dropped entirely + expect(out).not.toContain('## Completeness Principle'); + expect(out).not.toContain('## Confusion Protocol'); + expect(out).not.toContain('## Context Health'); + }); + + test('terse preamble is measurably smaller', () => { + const defaultLen = generatePreamble(makeCtx('default', 2)).length; + const terseLen = generatePreamble(makeCtx('terse', 2)).length; + // Saving roughly 2-4 KB across the 4 sections; assert at least 1 KB saved. + expect(defaultLen - terseLen).toBeGreaterThan(1024); + }); + + test('terse preamble at tier 1 is identical to default (terse only affects tier-2+ sections)', () => { + // Tier 1 doesn't include the 4 terse-gated sections in the first place. + const defaultT1 = generatePreamble(makeCtx('default', 1)); + const terseT1 = generatePreamble(makeCtx('terse', 1)); + expect(terseT1).toBe(defaultT1); + }); + + test('explainLevel undefined behaves as default', () => { + const undefinedOut = generatePreamble(makeCtx(undefined, 2)); + const defaultOut = generatePreamble(makeCtx('default', 2)); + expect(undefinedOut).toBe(defaultOut); + }); +});