From e274e5ec8232f89a13fc3057189bd0ffbacc6a31 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Mon, 25 May 2026 20:29:47 -0700 Subject: [PATCH] =?UTF-8?q?test(parity):=20T0a=20=E2=80=94=20capture=20v1.?= =?UTF-8?q?44.1=20baseline=20+=20capture=20helper=20+=20diff=20utility?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cathedral parity-eval suite primitive. captureBaseline() walks every top-level SKILL.md and records bytes, lines, estimated tokens, frontmatter description length, and eval coverage. diffBaselines() reports per-skill delta + total corpus delta + catalog tokens delta. Locks the v1.44.1 reference snapshot at test/fixtures/parity-baseline-v1.44.1.json. After Phase A+B+C land, scripts/capture-baseline.ts --tag v1.45.0.0 produces a comparable snapshot; diff supplies the real numbers the v2 CHANGELOG quotes. Never invent baseline numbers; ship them only if they came from a real run. v1.44.1 numbers captured this commit: - 51 skills - 2,847 KB total corpus - ~9,319 catalog tokens (sum of description bytes / 4) - top 3: ship 160 KB, plan-ceo-review 128 KB, office-hours 108 KB Test plan: - bun test test/helpers/capture-parity-baseline.test.ts passes 4/4 - The baseline JSON file is committed so reviewers can audit v1→v2 numbers Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/capture-baseline.ts | 54 ++ test/fixtures/parity-baseline-v1.44.1.json | 623 +++++++++++++++++++ test/helpers/capture-parity-baseline.test.ts | 90 +++ test/helpers/capture-parity-baseline.ts | 231 +++++++ 4 files changed, 998 insertions(+) create mode 100644 scripts/capture-baseline.ts create mode 100644 test/fixtures/parity-baseline-v1.44.1.json create mode 100644 test/helpers/capture-parity-baseline.test.ts create mode 100644 test/helpers/capture-parity-baseline.ts diff --git a/scripts/capture-baseline.ts b/scripts/capture-baseline.ts new file mode 100644 index 000000000..fa6c7ad33 --- /dev/null +++ b/scripts/capture-baseline.ts @@ -0,0 
#!/usr/bin/env bun
/**
 * CLI for capturing a parity baseline snapshot.
 *
 * Usage:
 *   bun run scripts/capture-baseline.ts                     # default path
 *   bun run scripts/capture-baseline.ts --tag v1.44.1       # tag the snapshot
 *   bun run scripts/capture-baseline.ts --out path/to/baseline.json
 *
 * The default output path is test/fixtures/parity-baseline-<tag>.json,
 * or test/fixtures/parity-baseline-current.json when no tag is given.
 */

import * as fs from 'fs';
import * as path from 'path';
import { captureBaseline } from '../test/helpers/capture-parity-baseline';

// Parent of the directory that contains this script.
const ROOT = path.resolve(import.meta.dir, '..');

/**
 * Read the value that follows a `--flag` on the command line.
 *
 * @param name Flag to look for, including its leading dashes (e.g. `--tag`).
 * @returns The token after the flag, or `undefined` when the flag is absent.
 *
 * A flag that is present but has no value (end of argv, or immediately
 * followed by another `--flag`) is a usage error: the process prints a
 * message and exits with status 1 instead of silently writing a snapshot
 * under an unintended tag or path.
 */
function arg(name: string): string | undefined {
  const i = process.argv.indexOf(name);
  if (i === -1) return undefined;
  const value = process.argv[i + 1];
  if (value === undefined || value.startsWith('--')) {
    console.error(`Missing value for ${name}`);
    process.exit(1);
  }
  return value;
}

const tag = arg('--tag');
const outOverride = arg('--out');
const defaultOut = path.join(
  ROOT,
  'test',
  'fixtures',
  `parity-baseline-${tag ?? 'current'}.json`,
);
// An explicit --out wins; it is resolved against the current working directory.
const outPath = outOverride ? path.resolve(outOverride) : defaultOut;

const baseline = captureBaseline({ repoRoot: ROOT, tag });

// Create the target directory on demand so --out may point at a new folder.
fs.mkdirSync(path.dirname(outPath), { recursive: true });
fs.writeFileSync(outPath, JSON.stringify(baseline, null, 2) + '\n');

// Human-readable summary of what was just written.
const totalKB = Math.round(baseline.totalCorpusBytes / 1024);
const top3 = baseline.topHeaviest.slice(0, 3);
console.log(`Parity baseline captured: ${outPath}`);
console.log(` tag: ${baseline.tag}`);
console.log(` commit: ${baseline.capturedFromCommit}`);
console.log(` branch: ${baseline.capturedFromBranch}`);
console.log(` skills: ${baseline.totalSkills}`);
console.log(` total corpus: ${totalKB} KB`);
console.log(` catalog tokens: ~${baseline.estTotalCatalogTokens}`);
console.log(` top 3 heaviest:`);
for (const s of top3) {
  const kb = Math.round(s.skillMdBytes / 1024);
  console.log(` ${s.skill.padEnd(28)} ${kb} KB (${s.skillMdLines} lines, ~${s.estTokens} tokens)`);
}
b/test/fixtures/parity-baseline-v1.44.1.json new file mode 100644 index 000000000..2e6d0c6f9 --- /dev/null +++ b/test/fixtures/parity-baseline-v1.44.1.json @@ -0,0 +1,623 @@ +{ + "tag": "v1.44.1", + "capturedAt": "2026-05-26T03:29:32.568Z", + "capturedFromCommit": "74bc8054", + "capturedFromBranch": "garrytan/slim-skill-tokens", + "totalSkills": 51, + "totalCorpusBytes": 2915151, + "estTotalCatalogTokens": 9319, + "topHeaviest": [ + { + "skill": "ship", + "skillMdBytes": 163553, + "skillMdLines": 3094, + "estTokens": 40888, + "tmplBytes": 48869, + "descriptionLen": 557, + "hasGateEval": true, + "hasPeriodicEval": true + }, + { + "skill": "plan-ceo-review", + "skillMdBytes": 130891, + "skillMdLines": 2224, + "estTokens": 32723, + "tmplBytes": 63393, + "descriptionLen": 1326, + "hasGateEval": true, + "hasPeriodicEval": true + }, + { + "skill": "office-hours", + "skillMdBytes": 111088, + "skillMdLines": 2090, + "estTokens": 27772, + "tmplBytes": 55466, + "descriptionLen": 1579, + "hasGateEval": true, + "hasPeriodicEval": false + }, + { + "skill": "plan-design-review", + "skillMdBytes": 105592, + "skillMdLines": 1944, + "estTokens": 26398, + "tmplBytes": 28624, + "descriptionLen": 568, + "hasGateEval": true, + "hasPeriodicEval": true + }, + { + "skill": "plan-devex-review", + "skillMdBytes": 104571, + "skillMdLines": 2145, + "estTokens": 26143, + "tmplBytes": 35680, + "descriptionLen": 886, + "hasGateEval": true, + "hasPeriodicEval": true + }, + { + "skill": "plan-eng-review", + "skillMdBytes": 101409, + "skillMdLines": 1788, + "estTokens": 25352, + "tmplBytes": 26234, + "descriptionLen": 743, + "hasGateEval": true, + "hasPeriodicEval": true + }, + { + "skill": "design-review", + "skillMdBytes": 94055, + "skillMdLines": 1960, + "estTokens": 23514, + "tmplBytes": 11674, + "descriptionLen": 709, + "hasGateEval": true, + "hasPeriodicEval": false + }, + { + "skill": "review", + "skillMdBytes": 92443, + "skillMdLines": 1789, + "estTokens": 23111, + "tmplBytes": 14099, + 
"descriptionLen": 512, + "hasGateEval": true, + "hasPeriodicEval": false + }, + { + "skill": "land-and-deploy", + "skillMdBytes": 90281, + "skillMdLines": 1883, + "estTokens": 22570, + "tmplBytes": 48624, + "descriptionLen": 378, + "hasGateEval": true, + "hasPeriodicEval": false + }, + { + "skill": "autoplan", + "skillMdBytes": 89274, + "skillMdLines": 1811, + "estTokens": 22319, + "tmplBytes": 45271, + "descriptionLen": 857, + "hasGateEval": true, + "hasPeriodicEval": true + } + ], + "skills": { + "autoplan": { + "skill": "autoplan", + "skillMdBytes": 89274, + "skillMdLines": 1811, + "estTokens": 22319, + "tmplBytes": 45271, + "descriptionLen": 857, + "hasGateEval": true, + "hasPeriodicEval": true + }, + "benchmark": { + "skill": "benchmark", + "skillMdBytes": 32537, + "skillMdLines": 728, + "estTokens": 8134, + "tmplBytes": 9378, + "descriptionLen": 549, + "hasGateEval": true, + "hasPeriodicEval": false + }, + "benchmark-models": { + "skill": "benchmark-models", + "skillMdBytes": 28606, + "skillMdLines": 603, + "estTokens": 7152, + "tmplBytes": 6631, + "descriptionLen": 740, + "hasGateEval": false, + "hasPeriodicEval": false + }, + "browse": { + "skill": "browse", + "skillMdBytes": 47290, + "skillMdLines": 911, + "estTokens": 11823, + "tmplBytes": 10805, + "descriptionLen": 612, + "hasGateEval": true, + "hasPeriodicEval": false + }, + "canary": { + "skill": "canary", + "skillMdBytes": 45502, + "skillMdLines": 1017, + "estTokens": 11376, + "tmplBytes": 8033, + "descriptionLen": 477, + "hasGateEval": true, + "hasPeriodicEval": false + }, + "careful": { + "skill": "careful", + "skillMdBytes": 2531, + "skillMdLines": 64, + "estTokens": 633, + "tmplBytes": 2435, + "descriptionLen": 625, + "hasGateEval": false, + "hasPeriodicEval": false + }, + "codex": { + "skill": "codex", + "skillMdBytes": 78018, + "skillMdLines": 1545, + "estTokens": 19505, + "tmplBytes": 34143, + "descriptionLen": 626, + "hasGateEval": true, + "hasPeriodicEval": false + }, + "context-restore": { + 
"skill": "context-restore", + "skillMdBytes": 39894, + "skillMdLines": 875, + "estTokens": 9974, + "tmplBytes": 5255, + "descriptionLen": 636, + "hasGateEval": true, + "hasPeriodicEval": false + }, + "context-save": { + "skill": "context-save", + "skillMdBytes": 44091, + "skillMdLines": 994, + "estTokens": 11023, + "tmplBytes": 9293, + "descriptionLen": 562, + "hasGateEval": true, + "hasPeriodicEval": false + }, + "cso": { + "skill": "cso", + "skillMdBytes": 75797, + "skillMdLines": 1477, + "estTokens": 18949, + "tmplBytes": 35158, + "descriptionLen": 774, + "hasGateEval": true, + "hasPeriodicEval": false + }, + "design-consultation": { + "skill": "design-consultation", + "skillMdBytes": 76963, + "skillMdLines": 1578, + "estTokens": 19241, + "tmplBytes": 25899, + "descriptionLen": 1201, + "hasGateEval": true, + "hasPeriodicEval": false + }, + "design-html": { + "skill": "design-html", + "skillMdBytes": 64951, + "skillMdLines": 1476, + "estTokens": 16238, + "tmplBytes": 22567, + "descriptionLen": 870, + "hasGateEval": false, + "hasPeriodicEval": false + }, + "design-review": { + "skill": "design-review", + "skillMdBytes": 94055, + "skillMdLines": 1960, + "estTokens": 23514, + "tmplBytes": 11674, + "descriptionLen": 709, + "hasGateEval": true, + "hasPeriodicEval": false + }, + "design-shotgun": { + "skill": "design-shotgun", + "skillMdBytes": 60571, + "skillMdLines": 1327, + "estTokens": 15143, + "tmplBytes": 13331, + "descriptionLen": 1057, + "hasGateEval": false, + "hasPeriodicEval": false + }, + "devex-review": { + "skill": "devex-review", + "skillMdBytes": 62815, + "skillMdLines": 1259, + "estTokens": 15704, + "tmplBytes": 7984, + "descriptionLen": 827, + "hasGateEval": false, + "hasPeriodicEval": false + }, + "document-generate": { + "skill": "document-generate", + "skillMdBytes": 51386, + "skillMdLines": 1204, + "estTokens": 12847, + "tmplBytes": 15093, + "descriptionLen": 671, + "hasGateEval": false, + "hasPeriodicEval": false + }, + "document-release": { + 
"skill": "document-release", + "skillMdBytes": 56652, + "skillMdLines": 1262, + "estTokens": 14163, + "tmplBytes": 20362, + "descriptionLen": 707, + "hasGateEval": true, + "hasPeriodicEval": false + }, + "freeze": { + "skill": "freeze", + "skillMdBytes": 3134, + "skillMdLines": 88, + "estTokens": 784, + "tmplBytes": 3038, + "descriptionLen": 761, + "hasGateEval": false, + "hasPeriodicEval": false + }, + "gstack-upgrade": { + "skill": "gstack-upgrade", + "skillMdBytes": 10794, + "skillMdLines": 280, + "estTokens": 2699, + "tmplBytes": 10667, + "descriptionLen": 439, + "hasGateEval": true, + "hasPeriodicEval": false + }, + "guard": { + "skill": "guard", + "skillMdBytes": 3277, + "skillMdLines": 88, + "estTokens": 819, + "tmplBytes": 3181, + "descriptionLen": 968, + "hasGateEval": false, + "hasPeriodicEval": false + }, + "health": { + "skill": "health", + "skillMdBytes": 46313, + "skillMdLines": 1041, + "estTokens": 11578, + "tmplBytes": 11617, + "descriptionLen": 463, + "hasGateEval": true, + "hasPeriodicEval": false + }, + "investigate": { + "skill": "investigate", + "skillMdBytes": 48810, + "skillMdLines": 1039, + "estTokens": 12203, + "tmplBytes": 11561, + "descriptionLen": 1811, + "hasGateEval": true, + "hasPeriodicEval": false + }, + "ios-clean": { + "skill": "ios-clean", + "skillMdBytes": 39447, + "skillMdLines": 840, + "estTokens": 9862, + "tmplBytes": 3851, + "descriptionLen": 761, + "hasGateEval": false, + "hasPeriodicEval": false + }, + "ios-design-review": { + "skill": "ios-design-review", + "skillMdBytes": 40037, + "skillMdLines": 841, + "estTokens": 10009, + "tmplBytes": 4417, + "descriptionLen": 836, + "hasGateEval": false, + "hasPeriodicEval": false + }, + "ios-fix": { + "skill": "ios-fix", + "skillMdBytes": 39164, + "skillMdLines": 837, + "estTokens": 9791, + "tmplBytes": 3574, + "descriptionLen": 767, + "hasGateEval": false, + "hasPeriodicEval": false + }, + "ios-qa": { + "skill": "ios-qa", + "skillMdBytes": 45677, + "skillMdLines": 957, + 
"estTokens": 11419, + "tmplBytes": 10090, + "descriptionLen": 875, + "hasGateEval": true, + "hasPeriodicEval": false + }, + "ios-sync": { + "skill": "ios-sync", + "skillMdBytes": 39137, + "skillMdLines": 831, + "estTokens": 9784, + "tmplBytes": 3544, + "descriptionLen": 727, + "hasGateEval": false, + "hasPeriodicEval": false + }, + "land-and-deploy": { + "skill": "land-and-deploy", + "skillMdBytes": 90281, + "skillMdLines": 1883, + "estTokens": 22570, + "tmplBytes": 48624, + "descriptionLen": 378, + "hasGateEval": true, + "hasPeriodicEval": false + }, + "landing-report": { + "skill": "landing-report", + "skillMdBytes": 42382, + "skillMdLines": 901, + "estTokens": 10596, + "tmplBytes": 6806, + "descriptionLen": 512, + "hasGateEval": false, + "hasPeriodicEval": false + }, + "learn": { + "skill": "learn", + "skillMdBytes": 40119, + "skillMdLines": 918, + "estTokens": 10030, + "tmplBytes": 5594, + "descriptionLen": 460, + "hasGateEval": true, + "hasPeriodicEval": false + }, + "make-pdf": { + "skill": "make-pdf", + "skillMdBytes": 28721, + "skillMdLines": 644, + "estTokens": 7180, + "tmplBytes": 5106, + "descriptionLen": 698, + "hasGateEval": false, + "hasPeriodicEval": false + }, + "office-hours": { + "skill": "office-hours", + "skillMdBytes": 111088, + "skillMdLines": 2090, + "estTokens": 27772, + "tmplBytes": 55466, + "descriptionLen": 1579, + "hasGateEval": true, + "hasPeriodicEval": false + }, + "open-gstack-browser": { + "skill": "open-gstack-browser", + "skillMdBytes": 44529, + "skillMdLines": 981, + "estTokens": 11132, + "tmplBytes": 7702, + "descriptionLen": 586, + "hasGateEval": false, + "hasPeriodicEval": false + }, + "pair-agent": { + "skill": "pair-agent", + "skillMdBytes": 45339, + "skillMdLines": 1036, + "estTokens": 11335, + "tmplBytes": 8548, + "descriptionLen": 709, + "hasGateEval": false, + "hasPeriodicEval": false + }, + "plan-ceo-review": { + "skill": "plan-ceo-review", + "skillMdBytes": 130891, + "skillMdLines": 2224, + "estTokens": 32723, + 
"tmplBytes": 63393, + "descriptionLen": 1326, + "hasGateEval": true, + "hasPeriodicEval": true + }, + "plan-design-review": { + "skill": "plan-design-review", + "skillMdBytes": 105592, + "skillMdLines": 1944, + "estTokens": 26398, + "tmplBytes": 28624, + "descriptionLen": 568, + "hasGateEval": true, + "hasPeriodicEval": true + }, + "plan-devex-review": { + "skill": "plan-devex-review", + "skillMdBytes": 104571, + "skillMdLines": 2145, + "estTokens": 26143, + "tmplBytes": 35680, + "descriptionLen": 886, + "hasGateEval": true, + "hasPeriodicEval": true + }, + "plan-eng-review": { + "skill": "plan-eng-review", + "skillMdBytes": 101409, + "skillMdLines": 1788, + "estTokens": 25352, + "tmplBytes": 26234, + "descriptionLen": 743, + "hasGateEval": true, + "hasPeriodicEval": true + }, + "plan-tune": { + "skill": "plan-tune", + "skillMdBytes": 50123, + "skillMdLines": 1105, + "estTokens": 12531, + "tmplBytes": 15586, + "descriptionLen": 997, + "hasGateEval": true, + "hasPeriodicEval": false + }, + "qa": { + "skill": "qa", + "skillMdBytes": 72267, + "skillMdLines": 1648, + "estTokens": 18067, + "tmplBytes": 12701, + "descriptionLen": 814, + "hasGateEval": true, + "hasPeriodicEval": false + }, + "qa-only": { + "skill": "qa-only", + "skillMdBytes": 54819, + "skillMdLines": 1220, + "estTokens": 13705, + "tmplBytes": 3851, + "descriptionLen": 605, + "hasGateEval": true, + "hasPeriodicEval": false + }, + "retro": { + "skill": "retro", + "skillMdBytes": 81286, + "skillMdLines": 1777, + "estTokens": 20322, + "tmplBytes": 42427, + "descriptionLen": 979, + "hasGateEval": true, + "hasPeriodicEval": false + }, + "review": { + "skill": "review", + "skillMdBytes": 92443, + "skillMdLines": 1789, + "estTokens": 23111, + "tmplBytes": 14099, + "descriptionLen": 512, + "hasGateEval": true, + "hasPeriodicEval": false + }, + "scrape": { + "skill": "scrape", + "skillMdBytes": 42040, + "skillMdLines": 914, + "estTokens": 10510, + "tmplBytes": 5220, + "descriptionLen": 519, + "hasGateEval": true, 
+ "hasPeriodicEval": false + }, + "setup-browser-cookies": { + "skill": "setup-browser-cookies", + "skillMdBytes": 25886, + "skillMdLines": 577, + "estTokens": 6472, + "tmplBytes": 2724, + "descriptionLen": 433, + "hasGateEval": false, + "hasPeriodicEval": false + }, + "setup-deploy": { + "skill": "setup-deploy", + "skillMdBytes": 42326, + "skillMdLines": 946, + "estTokens": 10582, + "tmplBytes": 7780, + "descriptionLen": 564, + "hasGateEval": true, + "hasPeriodicEval": false + }, + "setup-gbrain": { + "skill": "setup-gbrain", + "skillMdBytes": 76791, + "skillMdLines": 1733, + "estTokens": 19198, + "tmplBytes": 42245, + "descriptionLen": 512, + "hasGateEval": true, + "hasPeriodicEval": false + }, + "ship": { + "skill": "ship", + "skillMdBytes": 163553, + "skillMdLines": 3094, + "estTokens": 40888, + "tmplBytes": 48869, + "descriptionLen": 557, + "hasGateEval": true, + "hasPeriodicEval": true + }, + "skillify": { + "skill": "skillify", + "skillMdBytes": 51935, + "skillMdLines": 1196, + "estTokens": 12984, + "tmplBytes": 15107, + "descriptionLen": 571, + "hasGateEval": true, + "hasPeriodicEval": false + }, + "sync-gbrain": { + "skill": "sync-gbrain", + "skillMdBytes": 48555, + "skillMdLines": 1057, + "estTokens": 12139, + "tmplBytes": 13996, + "descriptionLen": 510, + "hasGateEval": false, + "hasPeriodicEval": false + }, + "unfreeze": { + "skill": "unfreeze", + "skillMdBytes": 1482, + "skillMdLines": 46, + "estTokens": 371, + "tmplBytes": 1386, + "descriptionLen": 350, + "hasGateEval": false, + "hasPeriodicEval": false + } + } +} diff --git a/test/helpers/capture-parity-baseline.test.ts b/test/helpers/capture-parity-baseline.test.ts new file mode 100644 index 000000000..ba00c84cc --- /dev/null +++ b/test/helpers/capture-parity-baseline.test.ts @@ -0,0 +1,90 @@ +/** + * Unit tests for parity baseline capture. + * + * Free. Reads the live repo state via captureBaseline() and asserts + * shape + invariants, not specific numbers (which drift release-over-release). 
/**
 * Unit tests for parity baseline capture.
 *
 * Free. Reads the live repo state via captureBaseline() and asserts
 * shape + invariants, not specific numbers (which drift release-over-release).
 */

import { describe, test, expect } from 'bun:test';
import * as fs from 'fs';
import * as path from 'path';
import { captureBaseline, diffBaselines, type ParityBaseline } from './capture-parity-baseline';

const REPO_ROOT = path.resolve(import.meta.dir, '..', '..');

type SkillEntry = ParityBaseline['skills'][string];

/** Build a synthetic per-skill entry; only `skill` and `skillMdBytes` matter to diffBaselines. */
function makeEntry(skill: string, skillMdBytes: number): SkillEntry {
  return {
    skill,
    skillMdBytes,
    skillMdLines: 10,
    estTokens: Math.round(skillMdBytes / 4),
    tmplBytes: null,
    descriptionLen: 50,
    hasGateEval: false,
    hasPeriodicEval: false,
  };
}

/** Build a synthetic baseline whose corpus total is the sum of its entries. */
function makeBaseline(tag: string, entries: SkillEntry[], estTotalCatalogTokens: number): ParityBaseline {
  const skills: ParityBaseline['skills'] = {};
  let totalCorpusBytes = 0;
  for (const entry of entries) {
    skills[entry.skill] = entry;
    totalCorpusBytes += entry.skillMdBytes;
  }
  return {
    tag,
    capturedAt: '2026-01-01T00:00:00Z',
    capturedFromCommit: 'abc',
    capturedFromBranch: 'main',
    totalSkills: entries.length,
    totalCorpusBytes,
    estTotalCatalogTokens,
    topHeaviest: [],
    skills,
  };
}

describe('capture-parity-baseline', () => {
  test('produces a shaped baseline for the current repo', () => {
    const baseline = captureBaseline({ repoRoot: REPO_ROOT, tag: 'unit-test' });
    expect(baseline.tag).toBe('unit-test');
    expect(baseline.totalSkills).toBeGreaterThan(20);
    expect(baseline.totalCorpusBytes).toBeGreaterThan(100_000);
    expect(baseline.topHeaviest.length).toBeGreaterThan(0);
    expect(baseline.topHeaviest.length).toBeLessThanOrEqual(10);
    // Sort invariant: every entry is at least as heavy as the one after it.
    let previous = Number.POSITIVE_INFINITY;
    for (const entry of baseline.topHeaviest) {
      expect(entry.skillMdBytes).toBeGreaterThan(0);
      expect(entry.skillMdBytes).toBeLessThanOrEqual(previous);
      previous = entry.skillMdBytes;
    }
  });

  test('each skill entry has byte + line + token estimates', () => {
    const baseline = captureBaseline({ repoRoot: REPO_ROOT });
    for (const skill of Object.values(baseline.skills)) {
      expect(skill.skillMdBytes).toBeGreaterThan(0);
      expect(skill.skillMdLines).toBeGreaterThan(0);
      expect(skill.estTokens).toBeGreaterThan(0);
      // ~4 chars/token heuristic: the integer estimate may differ from
      // bytes / 4 by rounding only.
      expect(Math.abs(skill.estTokens - skill.skillMdBytes / 4)).toBeLessThanOrEqual(0.5);
    }
  });

  test('diffBaselines returns expected deltas', () => {
    const before = makeBaseline('before', [makeEntry('foo', 600), makeEntry('bar', 400)], 100);
    const after = makeBaseline('after', [makeEntry('foo', 400), makeEntry('bar', 300)], 60);
    const diff = diffBaselines(before, after);
    expect(diff.totalCorpusDelta).toBe(-300);
    expect(diff.totalCorpusDeltaPct).toBeCloseTo(-30, 1);
    expect(diff.catalogTokensDelta).toBe(-40);
    expect(diff.catalogTokensDeltaPct).toBeCloseTo(-40, 1);
    expect(diff.perSkill.length).toBe(2);
    // Sorted by abs delta descending
    const [first, second] = diff.perSkill;
    expect(first?.skill).toBe('foo');
    expect(first?.deltaBytes).toBe(-200);
    expect(second?.skill).toBe('bar');
  });

  test('diffBaselines covers skills added or removed between snapshots', () => {
    const before = makeBaseline('before', [makeEntry('gone', 500), makeEntry('kept', 100)], 10);
    const after = makeBaseline('after', [makeEntry('kept', 100), makeEntry('fresh', 200)], 10);
    const diff = diffBaselines(before, after);
    const bySkill = new Map(diff.perSkill.map((row) => [row.skill, row]));
    expect(diff.perSkill.length).toBe(3);
    // A removed skill counts as shrinking to zero bytes.
    expect(bySkill.get('gone')?.afterBytes).toBe(0);
    expect(bySkill.get('gone')?.deltaBytes).toBe(-500);
    expect(bySkill.get('gone')?.deltaPct).toBeCloseTo(-100, 1);
    // An added skill has no "before" size to divide by, so its pct is reported as 0.
    expect(bySkill.get('fresh')?.beforeBytes).toBe(0);
    expect(bySkill.get('fresh')?.deltaBytes).toBe(200);
    expect(bySkill.get('fresh')?.deltaPct).toBe(0);
    expect(bySkill.get('kept')?.deltaBytes).toBe(0);
  });

  test('v1.44.1 baseline file exists with expected shape', () => {
    const baselinePath = path.join(REPO_ROOT, 'test', 'fixtures', 'parity-baseline-v1.44.1.json');
    expect(fs.existsSync(baselinePath)).toBe(true);
    const baseline = JSON.parse(fs.readFileSync(baselinePath, 'utf-8')) as ParityBaseline;
    expect(baseline.tag).toBe('v1.44.1');
    expect(baseline.totalSkills).toBeGreaterThan(40);
    // The per-skill map and the headline count must describe the same corpus.
    expect(Object.keys(baseline.skills).length).toBe(baseline.totalSkills);
    // Document the v1.44.1 snapshot as the v1→v2 baseline reference.
    // Compression in v1.45+ should drop totalCorpusBytes; this assertion
    // anchors the "v1 was XX MB" claim in the CHANGELOG to a real file.
    expect(baseline.totalCorpusBytes).toBeGreaterThan(2_000_000);
  });
});
/**
 * Parity baseline capture — cathedral parity-eval suite primitive.
 *
 * Snapshots the current state of every top-level SKILL.md: byte count, line
 * count, estimated token count, frontmatter description length, eval
 * coverage. The output JSON is the v1.44 baseline that v2 must beat on
 * compression AND match (or exceed) on parity.
 *
 * The numbers quoted in the v2.0.0.0 CHANGELOG numbers table are read
 * from a baseline JSON captured by this script. Never invent baseline
 * numbers; ship them only if they came from a real captureBaseline() run.
 *
 * Usage:
 *   bun run scripts/capture-baseline.ts                  # write default path
 *   bun run scripts/capture-baseline.ts --out PATH       # write custom path
 *   bun run scripts/capture-baseline.ts --tag v1.44.1    # tag the snapshot
 */

import * as fs from 'fs';
import * as path from 'path';
import { execSync } from 'child_process';

/** Measurements recorded for one top-level skill directory. */
export interface SkillBaselineEntry {
  /** Directory name of the skill; also its key in `ParityBaseline.skills`. */
  skill: string;
  /** UTF-8 byte length of the skill's SKILL.md. */
  skillMdBytes: number;
  /** Line count of SKILL.md. */
  skillMdLines: number;
  /** Estimated tokens via the ~4 chars/token heuristic. */
  estTokens: number;
  /** Byte length of SKILL.md.tmpl, or null when no .tmpl exists (vendored or non-Claude). */
  tmplBytes: number | null;
  /** Bytes in the frontmatter description field. */
  descriptionLen: number;
  /** Whether a gate-tier E2E test was found for the skill. */
  hasGateEval: boolean;
  /** Whether a periodic-tier E2E test was found for the skill. */
  hasPeriodicEval: boolean;
}

/** A full snapshot: capture metadata, corpus totals, and one entry per skill. */
export interface ParityBaseline {
  /** Label for the snapshot (e.g. a release tag). */
  tag: string;
  /** Timestamp string for when the snapshot was taken. */
  capturedAt: string;
  /** Commit the snapshot was taken from. */
  capturedFromCommit: string;
  /** Branch the snapshot was taken from. */
  capturedFromBranch: string;
  /** Number of skills captured. */
  totalSkills: number;
  /** Sum of `skillMdBytes` over all skills. */
  totalCorpusBytes: number;
  /** Sum of all description lengths / 4. */
  estTotalCatalogTokens: number;
  /** Heaviest skills, sorted descending by `skillMdBytes`. */
  topHeaviest: SkillBaselineEntry[];
  /** Per-skill entries keyed by skill directory name. */
  skills: Record<string, SkillBaselineEntry>;
}

/** Inputs to `captureBaseline()`. */
export interface CaptureOptions {
  /** Directory whose top-level sub-directories are scanned for SKILL.md. */
  repoRoot: string;
  /** Optional label stored in the snapshot. */
  tag?: string;
}
// `description:` with nothing after it, or only a YAML block-scalar header
// (`|`, `>`, optionally with a chomping/indent indicator such as `|-` or `>-`).
const DESCRIPTION_BLOCK_HEADER = /^description:\s*(?:[|>][+\-0-9]{0,2})?\s*$/;
// `description: some text` on a single line.
const DESCRIPTION_INLINE = /^description:\s+/;
// A new top-level key at column 0. Hyphenated keys (`allowed-tools:`) and keys
// with nothing after the colon (`hooks:`) count too.
const TOP_LEVEL_KEY = /^[A-Za-z0-9_][\w.-]*:(?:\s|$)/;

/**
 * Extract the frontmatter `description` from the text of a SKILL.md file.
 *
 * The file must start with a `---` line and contain a later line starting
 * with `---`; otherwise there is no frontmatter and the result is empty.
 * Both inline (`description: text`) and block (`description: |` followed by
 * indented lines) forms are read. Continuation lines are trimmed and joined
 * with newlines.
 *
 * The description ends at the next top-level key. That check recognises
 * hyphenated keys and keys with an empty value, so entries such as
 * `allowed-tools:` are not counted as description text. Baselines captured
 * before this matcher may overstate `descriptionLen` for files where such a
 * key followed the description.
 *
 * @param content Full text of a SKILL.md file (LF or CRLF line endings).
 * @returns The description text, or an empty string if none is found.
 */
function extractDescription(content: string): string {
  // Treat CRLF files the same as LF files.
  const text = content.replace(/\r\n/g, '\n');
  if (!text.startsWith('---\n')) return '';
  const fmEnd = text.indexOf('\n---', 4);
  if (fmEnd === -1) return '';

  const descLines: string[] = [];
  let inDescription = false;
  for (const line of text.slice(4, fmEnd).split('\n')) {
    if (DESCRIPTION_BLOCK_HEADER.test(line)) {
      inDescription = true;
      continue;
    }
    if (DESCRIPTION_INLINE.test(line)) {
      descLines.push(line.replace(DESCRIPTION_INLINE, ''));
      inDescription = true;
      continue;
    }
    if (!inDescription) continue;
    if (TOP_LEVEL_KEY.test(line)) break;
    descLines.push(line.trim());
  }
  return descLines.join('\n').trim();
}

/**
 * Estimate a token count from a byte count using 4 chars/token.
 * Crude but matches existing budget-regression usage.
 *
 * @param bytes Size of the text in bytes.
 * @returns `bytes / 4`, rounded to the nearest integer.
 */
function estimateTokens(bytes: number): number {
  return Math.round(bytes / 4);
}

// Top-level directories that are never treated as skills, even if they hold a SKILL.md.
const NON_SKILL_DIRS = new Set(['node_modules', 'docs']);

/**
 * Find which top-level directories contain a SKILL.md (the skills we capture).
 *
 * Hidden directories (leading `.`), `node_modules` and `docs` are skipped.
 *
 * @param repoRoot Directory whose immediate children are inspected.
 * @returns Matching directory names, sorted.
 */
function discoverSkillDirs(repoRoot: string): string[] {
  return fs
    .readdirSync(repoRoot, { withFileTypes: true })
    .filter((e) => e.isDirectory() && !e.name.startsWith('.') && !NON_SKILL_DIRS.has(e.name))
    .map((e) => e.name)
    .filter((name) => fs.existsSync(path.join(repoRoot, name, 'SKILL.md')))
    .sort();
}
/** Escape regex metacharacters so `text` is matched literally inside a RegExp. */
function escapeRegExp(text: string): string {
  return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

// File-name fragments that mark an E2E test as periodic-tier rather than gate-tier.
const PERIODIC_FILE_MARKERS = ['chain', 'multi', 'idempotency', 'finding-floor'];

/**
 * Check whether each skill has E2E gate / periodic eval coverage by scanning test/.
 *
 * Only files directly inside `<repoRoot>/test` named `skill-e2e-*.test.ts` are
 * read. A file counts as covering a skill when its text contains either
 * `/<skill>` (not followed by a further word character or hyphen, so `/qa`
 * does not match inside `/qa-only`) or the skill name wrapped in single,
 * double, or backtick quotes. Skill names are escaped before being placed in
 * the pattern.
 *
 * Tier inference is crude: a file whose name contains one of
 * PERIODIC_FILE_MARKERS is classified periodic; every other match is gate.
 *
 * @param repoRoot Repository root containing the `test` directory.
 * @param skills Skill names to look for.
 * @returns The sets of skill names with gate and with periodic coverage.
 *          Both are empty when `<repoRoot>/test` does not exist.
 */
function discoverEvalCoverage(repoRoot: string, skills: string[]): {
  gate: Set<string>;
  periodic: Set<string>;
} {
  const gate = new Set<string>();
  const periodic = new Set<string>();
  const testDir = path.join(repoRoot, 'test');
  if (!fs.existsSync(testDir)) return { gate, periodic };

  // Build one matcher per skill up front instead of once per (file, skill) pair.
  const quote = '[\'"`]';
  const matchers = skills.map((skill) => {
    const name = escapeRegExp(skill);
    return {
      skill,
      pattern: new RegExp('/' + name + '(?![\\w-])|' + quote + name + quote),
    };
  });

  const testFiles = fs
    .readdirSync(testDir)
    .filter((f) => f.startsWith('skill-e2e-') && f.endsWith('.test.ts'));
  for (const file of testFiles) {
    const content = fs.readFileSync(path.join(testDir, file), 'utf-8');
    const bucket = PERIODIC_FILE_MARKERS.some((marker) => file.includes(marker)) ? periodic : gate;
    for (const { skill, pattern } of matchers) {
      if (pattern.test(content)) bucket.add(skill);
    }
  }
  return { gate, periodic };
}

/**
 * Read the short HEAD commit hash and current branch name from git.
 *
 * @param repoRoot Directory in which the git commands are run.
 * @returns `{ commit, branch }`; both are `'unknown'` if either git command
 *          fails (for example when git is unavailable or the directory is
 *          not a repository).
 */
function getGitInfo(repoRoot: string): { commit: string; branch: string } {
  // git's stderr is discarded so a failed lookup does not print to the caller's terminal.
  const run = (command: string): string =>
    execSync(command, { cwd: repoRoot, encoding: 'utf-8', stdio: ['ignore', 'pipe', 'ignore'] }).trim();
  try {
    const commit = run('git rev-parse --short HEAD');
    const branch = run('git rev-parse --abbrev-ref HEAD');
    return { commit, branch };
  } catch {
    return { commit: 'unknown', branch: 'unknown' };
  }
}

/**
 * Capture a parity baseline for every top-level skill under `opts.repoRoot`.
 *
 * For each discovered skill directory this reads SKILL.md (and SKILL.md.tmpl
 * when present) and records byte size, line count, estimated tokens,
 * description length and eval coverage. It also records corpus totals, the
 * ten heaviest skills and git metadata.
 *
 * @param opts `repoRoot` to scan and an optional `tag`; the tag defaults to `'untagged'`.
 * @returns The assembled snapshot, stamped with the current time.
 */
export function captureBaseline(opts: CaptureOptions): ParityBaseline {
  const { repoRoot, tag } = opts;
  const skillDirs = discoverSkillDirs(repoRoot);
  const evalCoverage = discoverEvalCoverage(repoRoot, skillDirs);
  const skills: Record<string, SkillBaselineEntry> = {};
  let totalCorpusBytes = 0;
  let totalDescriptionBytes = 0;

  for (const dir of skillDirs) {
    const skillMdPath = path.join(repoRoot, dir, 'SKILL.md');
    const tmplPath = path.join(repoRoot, dir, 'SKILL.md.tmpl');
    const content = fs.readFileSync(skillMdPath, 'utf-8');
    const bytes = Buffer.byteLength(content, 'utf-8');
    const descriptionLen = Buffer.byteLength(extractDescription(content), 'utf-8');
    const tmplBytes = fs.existsSync(tmplPath)
      ? Buffer.byteLength(fs.readFileSync(tmplPath, 'utf-8'), 'utf-8')
      : null;

    skills[dir] = {
      skill: dir,
      skillMdBytes: bytes,
      // Number of '\n'-separated segments, so a file ending in a newline
      // counts one more than `wc -l`. Kept as-is so new snapshots stay
      // comparable with already-committed baselines.
      skillMdLines: content.split('\n').length,
      estTokens: estimateTokens(bytes),
      tmplBytes,
      descriptionLen,
      hasGateEval: evalCoverage.gate.has(dir),
      hasPeriodicEval: evalCoverage.periodic.has(dir),
    };
    totalCorpusBytes += bytes;
    totalDescriptionBytes += descriptionLen;
  }

  // Object.values() returns a fresh array, so sorting it leaves `skills` untouched.
  const topHeaviest = Object.values(skills)
    .sort((a, b) => b.skillMdBytes - a.skillMdBytes)
    .slice(0, 10);
  const git = getGitInfo(repoRoot);
  return {
    tag: tag ?? 'untagged',
    capturedAt: new Date().toISOString(),
    capturedFromCommit: git.commit,
    capturedFromBranch: git.branch,
    totalSkills: skillDirs.length,
    totalCorpusBytes,
    estTotalCatalogTokens: estimateTokens(totalDescriptionBytes),
    topHeaviest,
    skills,
  };
}

/** Result of comparing two baselines; every delta is `after - before`. */
export interface BaselineDiff {
  /** Change in total corpus bytes. */
  totalCorpusDelta: number;
  /** `totalCorpusDelta` as a percentage of the before value (0 when before is 0). */
  totalCorpusDeltaPct: number;
  /** Change in estimated catalog tokens. */
  catalogTokensDelta: number;
  /** `catalogTokensDelta` as a percentage of the before value (0 when before is 0). */
  catalogTokensDeltaPct: number;
  /** One row per skill present in either baseline, largest absolute byte change first. */
  perSkill: Array<{
    skill: string;
    beforeBytes: number;
    afterBytes: number;
    deltaBytes: number;
    deltaPct: number;
  }>;
}

/** Percentage change of `delta` relative to `base`; 0 when there is no base to divide by. */
function percentOf(delta: number, base: number): number {
  return base ? (delta / base) * 100 : 0;
}

/**
 * Diff two baselines; useful for v2 vs v1.44 deltas.
 *
 * A skill missing from one side is treated as 0 bytes on that side, so a
 * removed skill shows a -100% change and an added skill shows 0% (there is no
 * before size to divide by).
 *
 * @param before The reference snapshot.
 * @param after The snapshot being compared against it.
 * @returns Corpus-level and per-skill deltas.
 */
export function diffBaselines(before: ParityBaseline, after: ParityBaseline): BaselineDiff {
  const totalCorpusDelta = after.totalCorpusBytes - before.totalCorpusBytes;
  const catalogTokensDelta = after.estTotalCatalogTokens - before.estTotalCatalogTokens;

  const skillNames = new Set([...Object.keys(before.skills), ...Object.keys(after.skills)]);
  const perSkill: BaselineDiff['perSkill'] = [...skillNames].map((skill) => {
    const beforeBytes = before.skills[skill]?.skillMdBytes ?? 0;
    const afterBytes = after.skills[skill]?.skillMdBytes ?? 0;
    const deltaBytes = afterBytes - beforeBytes;
    return { skill, beforeBytes, afterBytes, deltaBytes, deltaPct: percentOf(deltaBytes, beforeBytes) };
  });
  perSkill.sort((x, y) => Math.abs(y.deltaBytes) - Math.abs(x.deltaBytes));

  return {
    totalCorpusDelta,
    totalCorpusDeltaPct: percentOf(totalCorpusDelta, before.totalCorpusBytes),
    catalogTokensDelta,
    catalogTokensDeltaPct: percentOf(catalogTokensDelta, before.estTotalCatalogTokens),
    perSkill,
  };
}