mirror of https://github.com/garrytan/gstack.git
test(coverage): fill three remaining v1.46.0.0 test gaps
Three untested surfaces from the v1.46.0.0 work. All three would have caught real bugs we shipped (and fixed) on this branch. 1. test/helpers/budget-override.test.ts — 7 tests pin the audit-trail contract for EVALS_BUDGET_OVERRIDE_REASON and GSTACK_SIZE_BUDGET_OVERRIDE_REASON. Without this, the audit logger could silently drop events and overrides become invisible. Tests cover: required fields per JSONL line, CI provenance capture (CI/GITHUB_ACTIONS/branch/commit), local-runner defaults, append-only behavior, missing-directory recovery, and unwritable- path resilience (logs warning instead of throwing). 2. test/terse-build.test.ts — 16 tests pin --explain-level=terse behavior across the 4 gated resolvers and the composed preamble. Default vs terse vs undefined-ctx all asserted. Without this, a refactor that breaks the explainLevel threading silently regresses the opt-in compression path; the runtime EXPLAIN_LEVEL: terse gate still works so users wouldn't notice. Tier-1 invariant pinned (terse-only-affects-tier-2+). 3. test/gen-skill-docs-idempotency.test.ts — 2 tests catch the class of bug behind the v1.45.0.0 timestamp flap. Two consecutive gen-skill-docs runs must produce byte-identical outputs across STABLE_OUTPUTS (proactive-suggestions.json, SKILL.md, ship/SKILL.md, plan-ceo-review/SKILL.md, office-hours/SKILL.md, gstack/llms.txt). --dry-run reports zero stale files after a fresh gen. CI freshness regressions surface as test failures BEFORE a PR is opened. Test plan: - bun test test/helpers/budget-override.test.ts: 7 pass - bun test test/terse-build.test.ts: 16 pass - bun test test/gen-skill-docs-idempotency.test.ts: 2 pass - Full focused suite (15 test files): 1179 pass, 0 fail (+45 new tests vs the pre-fill baseline of 1134) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
c88825dba0
commit
8b94e6d993
|
|
@ -0,0 +1,110 @@
|
|||
/**
|
||||
* Idempotency test for gen-skill-docs (regression for v1.45.0.0 timestamp flap).
|
||||
*
|
||||
* Running `bun run gen:skill-docs` twice in a row must produce a no-op on
|
||||
* the second run: every output file is byte-identical to itself. Without
|
||||
* this gate, CI freshness checks flap whenever someone introduces a
|
||||
* timestamp, a random seed, or any other non-deterministic field into a
|
||||
* generated artifact.
|
||||
*
|
||||
* v1.45.0.0 shipped with a `generated_at` ISO timestamp in
|
||||
* scripts/proactive-suggestions.json that updated every run. CI freshness
|
||||
* checks failed because the committed file's timestamp never matched the
|
||||
* latest gen. Fixed in 43e18af4 — this test pins the contract going forward.
|
||||
*
|
||||
* The test pays a small cost (~2 gen-skill-docs invocations, ~3s total) but
|
||||
* catches a class of bugs that's invisible until CI fails.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import { spawnSync } from 'child_process';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
|
||||
const REPO_ROOT = path.resolve(import.meta.dir, '..');
|
||||
|
||||
/** Files that gen-skill-docs writes and that must be byte-stable across runs. */
|
||||
const STABLE_OUTPUTS = [
|
||||
'scripts/proactive-suggestions.json',
|
||||
'SKILL.md',
|
||||
'ship/SKILL.md',
|
||||
'plan-ceo-review/SKILL.md',
|
||||
'office-hours/SKILL.md',
|
||||
'gstack/llms.txt',
|
||||
];
|
||||
|
||||
function runGen(): { exitCode: number; stderr: string } {
|
||||
const result = spawnSync('bun', ['run', 'gen:skill-docs'], {
|
||||
cwd: REPO_ROOT,
|
||||
stdio: ['ignore', 'pipe', 'pipe'],
|
||||
timeout: 60_000,
|
||||
});
|
||||
return {
|
||||
exitCode: result.status ?? -1,
|
||||
stderr: result.stderr?.toString() ?? '',
|
||||
};
|
||||
}
|
||||
|
||||
function snapshot(): Map<string, string> {
|
||||
const m = new Map<string, string>();
|
||||
for (const rel of STABLE_OUTPUTS) {
|
||||
const full = path.join(REPO_ROOT, rel);
|
||||
if (fs.existsSync(full)) {
|
||||
m.set(rel, fs.readFileSync(full, 'utf-8'));
|
||||
}
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
describe('gen-skill-docs idempotency', () => {
|
||||
test('two consecutive runs produce byte-identical outputs (no flapping fields)', () => {
|
||||
const firstRun = runGen();
|
||||
expect(firstRun.exitCode).toBe(0);
|
||||
|
||||
const after1 = snapshot();
|
||||
expect(after1.size).toBeGreaterThan(0);
|
||||
|
||||
const secondRun = runGen();
|
||||
expect(secondRun.exitCode).toBe(0);
|
||||
|
||||
const after2 = snapshot();
|
||||
|
||||
// Compare each stable output byte-for-byte.
|
||||
const flapping: string[] = [];
|
||||
for (const [file, before] of after1.entries()) {
|
||||
const now = after2.get(file);
|
||||
if (now !== before) flapping.push(file);
|
||||
}
|
||||
|
||||
if (flapping.length > 0) {
|
||||
throw new Error(
|
||||
`${flapping.length} file(s) changed between two consecutive gen-skill-docs runs (flapping):\n` +
|
||||
flapping.map(f => ` - ${f}`).join('\n') +
|
||||
`\nLikely cause: a non-deterministic field (timestamp, random ID, ` +
|
||||
`filesystem-iteration order) leaked into the generated output. CI freshness ` +
|
||||
`checks (git diff --exit-code) will fail unpredictably until this is fixed.`,
|
||||
);
|
||||
}
|
||||
}, 180_000); // ~2 min budget for two gen runs
|
||||
|
||||
test('--dry-run after a fresh gen reports zero stale files', () => {
|
||||
// Pre-condition: working tree gen must be fresh (idempotency test above ran first).
|
||||
// If a contributor introduces a non-deterministic field, this dry-run reports STALE.
|
||||
const result = spawnSync('bun', ['run', 'gen:skill-docs', '--dry-run'], {
|
||||
cwd: REPO_ROOT,
|
||||
stdio: ['ignore', 'pipe', 'pipe'],
|
||||
timeout: 60_000,
|
||||
});
|
||||
expect(result.status).toBe(0);
|
||||
const stdout = result.stdout?.toString() ?? '';
|
||||
// STALE: prefix means a file would change. Count them.
|
||||
const staleLines = stdout.split('\n').filter(l => l.startsWith('STALE:'));
|
||||
if (staleLines.length > 0) {
|
||||
throw new Error(
|
||||
`--dry-run reports ${staleLines.length} stale file(s) after a fresh gen:\n` +
|
||||
staleLines.map(l => ` ${l}`).join('\n') +
|
||||
`\nRun \`bun run gen:skill-docs\` and commit the result.`,
|
||||
);
|
||||
}
|
||||
}, 90_000);
|
||||
});
|
||||
|
|
@ -0,0 +1,116 @@
|
|||
/**
|
||||
* Unit tests for budget-override audit logger.
|
||||
*
|
||||
* The audit trail is the only check on `EVALS_BUDGET_OVERRIDE_REASON` and
|
||||
* `GSTACK_SIZE_BUDGET_OVERRIDE_REASON` — if the logger silently drops events,
|
||||
* overrides become invisible and the budget gates are theater. These tests
|
||||
* pin the contract: every override produces exactly one JSONL line with
|
||||
* timestamp + scope + reason + CI provenance.
|
||||
*/
|
||||
|
||||
import { describe, test, expect, beforeEach } from 'bun:test';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
import { logBudgetOverride } from './budget-override';
|
||||
|
||||
const TMP_HOME = fs.mkdtempSync(path.join(os.tmpdir(), 'budget-override-test-'));
|
||||
process.env.GSTACK_HOME = TMP_HOME;
|
||||
const AUDIT_PATH = path.join(TMP_HOME, 'analytics', 'spend-overrides.jsonl');
|
||||
|
||||
describe('logBudgetOverride', () => {
|
||||
beforeEach(() => {
|
||||
// Start each test with a clean audit file
|
||||
try { fs.unlinkSync(AUDIT_PATH); } catch { /* doesn't exist */ }
|
||||
});
|
||||
|
||||
test('writes one JSONL line per call with required fields', () => {
|
||||
logBudgetOverride({
|
||||
scope: 'evals-cost-cap-e2e',
|
||||
reason: 'model price went up, will rebase the cap next sprint',
|
||||
details: { tier: 'e2e', cap: 25, observed_cost_usd: 31.4 },
|
||||
});
|
||||
|
||||
expect(fs.existsSync(AUDIT_PATH)).toBe(true);
|
||||
const lines = fs.readFileSync(AUDIT_PATH, 'utf-8').split('\n').filter(Boolean);
|
||||
expect(lines.length).toBe(1);
|
||||
const entry = JSON.parse(lines[0]!);
|
||||
expect(entry.scope).toBe('evals-cost-cap-e2e');
|
||||
expect(entry.reason).toBe('model price went up, will rebase the cap next sprint');
|
||||
expect(entry.details).toEqual({ tier: 'e2e', cap: 25, observed_cost_usd: 31.4 });
|
||||
expect(typeof entry.timestamp).toBe('string');
|
||||
expect(entry.timestamp).toMatch(/^\d{4}-\d{2}-\d{2}T/);
|
||||
});
|
||||
|
||||
test('captures CI provenance when CI env is set', () => {
|
||||
process.env.CI = 'true';
|
||||
process.env.GITHUB_ACTIONS = 'true';
|
||||
process.env.GITHUB_REF_NAME = 'feature/x';
|
||||
process.env.GITHUB_SHA = 'deadbeefcafe1234';
|
||||
|
||||
logBudgetOverride({ scope: 'skill-size-budget', reason: 'big diff bake-in' });
|
||||
|
||||
const entry = JSON.parse(fs.readFileSync(AUDIT_PATH, 'utf-8').trim());
|
||||
expect(entry.ci).toBe(true);
|
||||
expect(entry.runner).toBe('github-actions');
|
||||
expect(entry.branch).toBe('feature/x');
|
||||
expect(entry.commit).toBe('deadbeef');
|
||||
|
||||
delete process.env.CI;
|
||||
delete process.env.GITHUB_ACTIONS;
|
||||
delete process.env.GITHUB_REF_NAME;
|
||||
delete process.env.GITHUB_SHA;
|
||||
});
|
||||
|
||||
test('defaults provenance to local when CI is unset', () => {
|
||||
delete process.env.CI;
|
||||
delete process.env.GITHUB_ACTIONS;
|
||||
delete process.env.GITHUB_REF_NAME;
|
||||
delete process.env.GITHUB_SHA;
|
||||
delete process.env.CI_RUNNER;
|
||||
delete process.env.CI_COMMIT_REF_NAME;
|
||||
delete process.env.CI_COMMIT_SHORT_SHA;
|
||||
|
||||
logBudgetOverride({ scope: 'skill-size-budget-corpus', reason: 'local dev test' });
|
||||
|
||||
const entry = JSON.parse(fs.readFileSync(AUDIT_PATH, 'utf-8').trim());
|
||||
expect(entry.ci).toBe(false);
|
||||
expect(entry.runner).toBe('local');
|
||||
expect(entry.branch).toBe('unknown');
|
||||
expect(entry.commit).toBe('unknown');
|
||||
});
|
||||
|
||||
test('append-only: multiple calls produce multiple lines', () => {
|
||||
logBudgetOverride({ scope: 's1', reason: 'r1' });
|
||||
logBudgetOverride({ scope: 's2', reason: 'r2' });
|
||||
logBudgetOverride({ scope: 's3', reason: 'r3' });
|
||||
|
||||
const lines = fs.readFileSync(AUDIT_PATH, 'utf-8').split('\n').filter(Boolean);
|
||||
expect(lines.length).toBe(3);
|
||||
const scopes = lines.map(l => JSON.parse(l).scope);
|
||||
expect(scopes).toEqual(['s1', 's2', 's3']);
|
||||
});
|
||||
|
||||
test('omits details key when entry.details is absent (uses empty object)', () => {
|
||||
logBudgetOverride({ scope: 'plain', reason: 'no details' });
|
||||
const entry = JSON.parse(fs.readFileSync(AUDIT_PATH, 'utf-8').trim());
|
||||
expect(entry.details).toEqual({});
|
||||
});
|
||||
|
||||
test('never throws even when audit directory is missing — creates it', () => {
|
||||
// Remove the analytics dir to force mkdir
|
||||
try { fs.rmSync(path.join(TMP_HOME, 'analytics'), { recursive: true, force: true }); } catch { /* */ }
|
||||
expect(() => logBudgetOverride({ scope: 'recreate', reason: 'test' })).not.toThrow();
|
||||
expect(fs.existsSync(AUDIT_PATH)).toBe(true);
|
||||
});
|
||||
|
||||
test('survives an unwritable audit path (logs warning, does not throw)', () => {
|
||||
// Point GSTACK_HOME at a path inside a file (illegal directory location)
|
||||
const originalHome = process.env.GSTACK_HOME;
|
||||
const bogusFile = path.join(TMP_HOME, 'not-a-dir.txt');
|
||||
fs.writeFileSync(bogusFile, 'just a file');
|
||||
process.env.GSTACK_HOME = bogusFile;
|
||||
expect(() => logBudgetOverride({ scope: 'unwritable', reason: 'fs error path' })).not.toThrow();
|
||||
process.env.GSTACK_HOME = originalHome;
|
||||
});
|
||||
});
|
||||
|
|
@ -0,0 +1,151 @@
|
|||
/**
|
||||
* Unit tests for the terse-build flag (v1.46.0.0 T3).
|
||||
*
|
||||
* `--explain-level=terse` makes the gen-skill-docs pipeline drop 4 preamble
|
||||
* sections at gen time. Default builds keep them. Without these tests, a
|
||||
* refactor that breaks the explainLevel threading silently regresses one
|
||||
* of the opt-in compression paths — the runtime EXPLAIN_LEVEL: terse runtime
|
||||
* gate still works, so users wouldn't notice immediately.
|
||||
*
|
||||
* Pure-function tests against the resolvers — fast, free, no subprocess.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import type { TemplateContext } from '../scripts/resolvers/types';
|
||||
import { generateWritingStyle } from '../scripts/resolvers/preamble/generate-writing-style';
|
||||
import { generateCompletenessSection } from '../scripts/resolvers/preamble/generate-completeness-section';
|
||||
import { generateConfusionProtocol } from '../scripts/resolvers/preamble/generate-confusion-protocol';
|
||||
import { generateContextHealth } from '../scripts/resolvers/preamble/generate-context-health';
|
||||
import { generatePreamble } from '../scripts/resolvers/preamble';
|
||||
|
||||
function makeCtx(explainLevel?: 'default' | 'terse', tier: number = 4): TemplateContext {
|
||||
return {
|
||||
skillName: 'test-skill',
|
||||
tmplPath: '/tmp/test/SKILL.md.tmpl',
|
||||
host: 'claude',
|
||||
paths: {
|
||||
skillRoot: '~/.claude/skills/gstack',
|
||||
localSkillRoot: '.claude/skills',
|
||||
binDir: '~/.claude/skills/gstack/bin',
|
||||
browseDir: '~/.claude/skills/gstack/browse/dist',
|
||||
designDir: '~/.claude/skills/gstack/design/dist',
|
||||
makePdfDir: '~/.claude/skills/gstack/make-pdf/dist',
|
||||
},
|
||||
preambleTier: tier,
|
||||
explainLevel,
|
||||
};
|
||||
}
|
||||
|
||||
describe('terse build — per-resolver behavior', () => {
|
||||
describe('generateWritingStyle', () => {
|
||||
test('default: emits full section with jargon-list pointer', () => {
|
||||
const out = generateWritingStyle(makeCtx('default'));
|
||||
expect(out).toContain('## Writing Style');
|
||||
expect(out).toContain('jargon-list.json');
|
||||
expect(out).toContain('Curated jargon list');
|
||||
expect(out).toContain('outcome');
|
||||
});
|
||||
|
||||
test('terse: emits one-line terse directive only', () => {
|
||||
const out = generateWritingStyle(makeCtx('terse'));
|
||||
expect(out).toContain('## Writing Style');
|
||||
expect(out).toContain('Terse mode (build-time)');
|
||||
// Negative: NONE of the default-mode prose
|
||||
expect(out).not.toContain('jargon-list.json');
|
||||
expect(out).not.toContain('Curated jargon list');
|
||||
expect(out).not.toContain('Frame questions in outcome terms');
|
||||
});
|
||||
|
||||
test('terse is meaningfully shorter than default', () => {
|
||||
const fullLen = generateWritingStyle(makeCtx('default')).length;
|
||||
const terseLen = generateWritingStyle(makeCtx('terse')).length;
|
||||
expect(terseLen).toBeLessThan(fullLen / 3);
|
||||
});
|
||||
});
|
||||
|
||||
describe('generateCompletenessSection', () => {
|
||||
test('default: emits full section with Boil-the-Lake prose', () => {
|
||||
const out = generateCompletenessSection(makeCtx('default'));
|
||||
expect(out).toContain('## Completeness Principle');
|
||||
expect(out).toContain('Boil the Lake');
|
||||
});
|
||||
|
||||
test('terse: returns empty string', () => {
|
||||
expect(generateCompletenessSection(makeCtx('terse'))).toBe('');
|
||||
});
|
||||
|
||||
test('no ctx arg: defaults to non-terse (back-compat with old callers)', () => {
|
||||
const out = generateCompletenessSection();
|
||||
expect(out).toContain('## Completeness Principle');
|
||||
});
|
||||
});
|
||||
|
||||
describe('generateConfusionProtocol', () => {
|
||||
test('default: emits full section', () => {
|
||||
const out = generateConfusionProtocol(makeCtx('default'));
|
||||
expect(out).toContain('## Confusion Protocol');
|
||||
expect(out).toContain('high-stakes ambiguity');
|
||||
});
|
||||
|
||||
test('terse: returns empty string', () => {
|
||||
expect(generateConfusionProtocol(makeCtx('terse'))).toBe('');
|
||||
});
|
||||
|
||||
test('no ctx arg: defaults to non-terse', () => {
|
||||
expect(generateConfusionProtocol()).toContain('## Confusion Protocol');
|
||||
});
|
||||
});
|
||||
|
||||
describe('generateContextHealth', () => {
|
||||
test('default: emits full section', () => {
|
||||
const out = generateContextHealth(makeCtx('default'));
|
||||
expect(out).toContain('## Context Health');
|
||||
expect(out).toContain('PROGRESS');
|
||||
});
|
||||
|
||||
test('terse: returns empty string', () => {
|
||||
expect(generateContextHealth(makeCtx('terse'))).toBe('');
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe('terse build — generatePreamble integration', () => {
|
||||
test('default tier-2 preamble includes all 4 terse-gated sections', () => {
|
||||
const out = generatePreamble(makeCtx('default', 2));
|
||||
expect(out).toContain('## Writing Style');
|
||||
expect(out).toContain('## Completeness Principle');
|
||||
expect(out).toContain('## Confusion Protocol');
|
||||
expect(out).toContain('## Context Health');
|
||||
});
|
||||
|
||||
test('terse tier-2 preamble drops 3 of 4 sections + collapses Writing Style', () => {
|
||||
const out = generatePreamble(makeCtx('terse', 2));
|
||||
// Writing Style heading still present (collapsed to one line)
|
||||
expect(out).toContain('## Writing Style');
|
||||
expect(out).toContain('Terse mode (build-time)');
|
||||
// Three sections dropped entirely
|
||||
expect(out).not.toContain('## Completeness Principle');
|
||||
expect(out).not.toContain('## Confusion Protocol');
|
||||
expect(out).not.toContain('## Context Health');
|
||||
});
|
||||
|
||||
test('terse preamble is measurably smaller', () => {
|
||||
const defaultLen = generatePreamble(makeCtx('default', 2)).length;
|
||||
const terseLen = generatePreamble(makeCtx('terse', 2)).length;
|
||||
// Saving roughly 2-4 KB across the 4 sections; assert at least 1 KB saved.
|
||||
expect(defaultLen - terseLen).toBeGreaterThan(1024);
|
||||
});
|
||||
|
||||
test('terse preamble at tier 1 is identical to default (terse only affects tier-2+ sections)', () => {
|
||||
// Tier 1 doesn't include the 4 terse-gated sections in the first place.
|
||||
const defaultT1 = generatePreamble(makeCtx('default', 1));
|
||||
const terseT1 = generatePreamble(makeCtx('terse', 1));
|
||||
expect(terseT1).toBe(defaultT1);
|
||||
});
|
||||
|
||||
test('explainLevel undefined behaves as default', () => {
|
||||
const undefinedOut = generatePreamble(makeCtx(undefined, 2));
|
||||
const defaultOut = generatePreamble(makeCtx('default', 2));
|
||||
expect(undefinedOut).toBe(defaultOut);
|
||||
});
|
||||
});
|
||||
Loading…
Reference in New Issue