mirror of https://github.com/garrytan/gstack.git
test(parity): T0a — capture v1.44.1 baseline + capture helper + diff utility
Cathedral parity-eval suite primitive. captureBaseline() walks every top-level SKILL.md and records bytes, lines, estimated tokens, frontmatter description length, and eval coverage. diffBaselines() reports per-skill delta + total corpus delta + catalog tokens delta. Locks the v1.44.1 reference snapshot at test/fixtures/parity-baseline-v1.44.1.json. After Phase A+B+C land, scripts/capture-baseline.ts --tag v1.45.0.0 produces a comparable snapshot; diff supplies the real numbers the v2 CHANGELOG quotes. Never invent baseline numbers; ship them only if they came from a real run. v1.44.1 numbers captured this commit: - 51 skills - 2,847 KB total corpus - ~9,319 catalog tokens (sum of description bytes / 4) - top 3: ship 160 KB, plan-ceo-review 128 KB, office-hours 108 KB Test plan: - bun test test/helpers/capture-parity-baseline.test.ts passes 4/4 - The baseline JSON file is committed so reviewers can audit v1→v2 numbers Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
74bc80545f
commit
e274e5ec82
|
|
@ -0,0 +1,54 @@
|
|||
#!/usr/bin/env bun
|
||||
/**
|
||||
* CLI for capturing a parity baseline snapshot.
|
||||
*
|
||||
* Usage:
|
||||
* bun run scripts/capture-baseline.ts # default path
|
||||
* bun run scripts/capture-baseline.ts --tag v1.44.1 # tag the snapshot
|
||||
* bun run scripts/capture-baseline.ts --out path/to/baseline.json
|
||||
*
|
||||
* The default output path is test/fixtures/parity-baseline-<tag>.json,
|
||||
* or test/fixtures/parity-baseline-current.json when no tag is given.
|
||||
*/
|
||||
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import { captureBaseline } from '../test/helpers/capture-parity-baseline';
|
||||
|
||||
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
|
||||
/**
 * Look up the value of a command-line flag in `process.argv`.
 *
 * Both `--flag value` and `--flag=value` are accepted. The first occurrence
 * of the flag (in either form) wins.
 *
 * @param name Flag name including its leading dashes, e.g. `--tag`.
 * @returns The flag's value, or `undefined` when the flag is absent, has an
 *   empty value, or is immediately followed by another `--flag`.
 */
function arg(name: string): string | undefined {
  const argv = process.argv;
  const inlinePrefix = `${name}=`;
  for (let i = 0; i < argv.length; i++) {
    const token = argv[i];
    if (token === undefined) continue;
    if (token.startsWith(inlinePrefix)) {
      const inlineValue = token.slice(inlinePrefix.length);
      return inlineValue === '' ? undefined : inlineValue;
    }
    if (token === name) {
      const next = argv[i + 1];
      // A following token that is itself a flag means this flag got no value;
      // without this check `--tag --out x.json` would yield the tag "--out".
      return next !== undefined && !next.startsWith('--') ? next : undefined;
    }
  }
  return undefined;
}
|
||||
|
||||
// Resolve CLI options. An explicit --out wins; otherwise the fixture file
// name is derived from the tag ("current" when no tag was given).
const tag = arg('--tag');
const outOverride = arg('--out');
const fixtureName = `parity-baseline-${tag ?? 'current'}.json`;
const defaultOut = path.join(ROOT, 'test', 'fixtures', fixtureName);
const outPath = outOverride ? path.resolve(outOverride) : defaultOut;

// Capture the snapshot and persist it as pretty-printed JSON with a trailing newline.
const baseline = captureBaseline({ repoRoot: ROOT, tag });
const serialized = `${JSON.stringify(baseline, null, 2)}\n`;
fs.mkdirSync(path.dirname(outPath), { recursive: true });
fs.writeFileSync(outPath, serialized);

// Print a human-readable summary of what was just written.
const toKB = (bytes: number): number => Math.round(bytes / 1024);
const summaryLines = [
  `Parity baseline captured: ${outPath}`,
  ` tag: ${baseline.tag}`,
  ` commit: ${baseline.capturedFromCommit}`,
  ` branch: ${baseline.capturedFromBranch}`,
  ` skills: ${baseline.totalSkills}`,
  ` total corpus: ${toKB(baseline.totalCorpusBytes)} KB`,
  ` catalog tokens: ~${baseline.estTotalCatalogTokens}`,
  ` top 3 heaviest:`,
  ...baseline.topHeaviest
    .slice(0, 3)
    .map(s => ` ${s.skill.padEnd(28)} ${toKB(s.skillMdBytes)} KB (${s.skillMdLines} lines, ~${s.estTokens} tokens)`),
];
for (const line of summaryLines) {
  console.log(line);
}
|
||||
|
|
@ -0,0 +1,623 @@
|
|||
{
|
||||
"tag": "v1.44.1",
|
||||
"capturedAt": "2026-05-26T03:29:32.568Z",
|
||||
"capturedFromCommit": "74bc8054",
|
||||
"capturedFromBranch": "garrytan/slim-skill-tokens",
|
||||
"totalSkills": 51,
|
||||
"totalCorpusBytes": 2915151,
|
||||
"estTotalCatalogTokens": 9319,
|
||||
"topHeaviest": [
|
||||
{
|
||||
"skill": "ship",
|
||||
"skillMdBytes": 163553,
|
||||
"skillMdLines": 3094,
|
||||
"estTokens": 40888,
|
||||
"tmplBytes": 48869,
|
||||
"descriptionLen": 557,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": true
|
||||
},
|
||||
{
|
||||
"skill": "plan-ceo-review",
|
||||
"skillMdBytes": 130891,
|
||||
"skillMdLines": 2224,
|
||||
"estTokens": 32723,
|
||||
"tmplBytes": 63393,
|
||||
"descriptionLen": 1326,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": true
|
||||
},
|
||||
{
|
||||
"skill": "office-hours",
|
||||
"skillMdBytes": 111088,
|
||||
"skillMdLines": 2090,
|
||||
"estTokens": 27772,
|
||||
"tmplBytes": 55466,
|
||||
"descriptionLen": 1579,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
{
|
||||
"skill": "plan-design-review",
|
||||
"skillMdBytes": 105592,
|
||||
"skillMdLines": 1944,
|
||||
"estTokens": 26398,
|
||||
"tmplBytes": 28624,
|
||||
"descriptionLen": 568,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": true
|
||||
},
|
||||
{
|
||||
"skill": "plan-devex-review",
|
||||
"skillMdBytes": 104571,
|
||||
"skillMdLines": 2145,
|
||||
"estTokens": 26143,
|
||||
"tmplBytes": 35680,
|
||||
"descriptionLen": 886,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": true
|
||||
},
|
||||
{
|
||||
"skill": "plan-eng-review",
|
||||
"skillMdBytes": 101409,
|
||||
"skillMdLines": 1788,
|
||||
"estTokens": 25352,
|
||||
"tmplBytes": 26234,
|
||||
"descriptionLen": 743,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": true
|
||||
},
|
||||
{
|
||||
"skill": "design-review",
|
||||
"skillMdBytes": 94055,
|
||||
"skillMdLines": 1960,
|
||||
"estTokens": 23514,
|
||||
"tmplBytes": 11674,
|
||||
"descriptionLen": 709,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
{
|
||||
"skill": "review",
|
||||
"skillMdBytes": 92443,
|
||||
"skillMdLines": 1789,
|
||||
"estTokens": 23111,
|
||||
"tmplBytes": 14099,
|
||||
"descriptionLen": 512,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
{
|
||||
"skill": "land-and-deploy",
|
||||
"skillMdBytes": 90281,
|
||||
"skillMdLines": 1883,
|
||||
"estTokens": 22570,
|
||||
"tmplBytes": 48624,
|
||||
"descriptionLen": 378,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
{
|
||||
"skill": "autoplan",
|
||||
"skillMdBytes": 89274,
|
||||
"skillMdLines": 1811,
|
||||
"estTokens": 22319,
|
||||
"tmplBytes": 45271,
|
||||
"descriptionLen": 857,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": true
|
||||
}
|
||||
],
|
||||
"skills": {
|
||||
"autoplan": {
|
||||
"skill": "autoplan",
|
||||
"skillMdBytes": 89274,
|
||||
"skillMdLines": 1811,
|
||||
"estTokens": 22319,
|
||||
"tmplBytes": 45271,
|
||||
"descriptionLen": 857,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": true
|
||||
},
|
||||
"benchmark": {
|
||||
"skill": "benchmark",
|
||||
"skillMdBytes": 32537,
|
||||
"skillMdLines": 728,
|
||||
"estTokens": 8134,
|
||||
"tmplBytes": 9378,
|
||||
"descriptionLen": 549,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"benchmark-models": {
|
||||
"skill": "benchmark-models",
|
||||
"skillMdBytes": 28606,
|
||||
"skillMdLines": 603,
|
||||
"estTokens": 7152,
|
||||
"tmplBytes": 6631,
|
||||
"descriptionLen": 740,
|
||||
"hasGateEval": false,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"browse": {
|
||||
"skill": "browse",
|
||||
"skillMdBytes": 47290,
|
||||
"skillMdLines": 911,
|
||||
"estTokens": 11823,
|
||||
"tmplBytes": 10805,
|
||||
"descriptionLen": 612,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"canary": {
|
||||
"skill": "canary",
|
||||
"skillMdBytes": 45502,
|
||||
"skillMdLines": 1017,
|
||||
"estTokens": 11376,
|
||||
"tmplBytes": 8033,
|
||||
"descriptionLen": 477,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"careful": {
|
||||
"skill": "careful",
|
||||
"skillMdBytes": 2531,
|
||||
"skillMdLines": 64,
|
||||
"estTokens": 633,
|
||||
"tmplBytes": 2435,
|
||||
"descriptionLen": 625,
|
||||
"hasGateEval": false,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"codex": {
|
||||
"skill": "codex",
|
||||
"skillMdBytes": 78018,
|
||||
"skillMdLines": 1545,
|
||||
"estTokens": 19505,
|
||||
"tmplBytes": 34143,
|
||||
"descriptionLen": 626,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"context-restore": {
|
||||
"skill": "context-restore",
|
||||
"skillMdBytes": 39894,
|
||||
"skillMdLines": 875,
|
||||
"estTokens": 9974,
|
||||
"tmplBytes": 5255,
|
||||
"descriptionLen": 636,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"context-save": {
|
||||
"skill": "context-save",
|
||||
"skillMdBytes": 44091,
|
||||
"skillMdLines": 994,
|
||||
"estTokens": 11023,
|
||||
"tmplBytes": 9293,
|
||||
"descriptionLen": 562,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"cso": {
|
||||
"skill": "cso",
|
||||
"skillMdBytes": 75797,
|
||||
"skillMdLines": 1477,
|
||||
"estTokens": 18949,
|
||||
"tmplBytes": 35158,
|
||||
"descriptionLen": 774,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"design-consultation": {
|
||||
"skill": "design-consultation",
|
||||
"skillMdBytes": 76963,
|
||||
"skillMdLines": 1578,
|
||||
"estTokens": 19241,
|
||||
"tmplBytes": 25899,
|
||||
"descriptionLen": 1201,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"design-html": {
|
||||
"skill": "design-html",
|
||||
"skillMdBytes": 64951,
|
||||
"skillMdLines": 1476,
|
||||
"estTokens": 16238,
|
||||
"tmplBytes": 22567,
|
||||
"descriptionLen": 870,
|
||||
"hasGateEval": false,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"design-review": {
|
||||
"skill": "design-review",
|
||||
"skillMdBytes": 94055,
|
||||
"skillMdLines": 1960,
|
||||
"estTokens": 23514,
|
||||
"tmplBytes": 11674,
|
||||
"descriptionLen": 709,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"design-shotgun": {
|
||||
"skill": "design-shotgun",
|
||||
"skillMdBytes": 60571,
|
||||
"skillMdLines": 1327,
|
||||
"estTokens": 15143,
|
||||
"tmplBytes": 13331,
|
||||
"descriptionLen": 1057,
|
||||
"hasGateEval": false,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"devex-review": {
|
||||
"skill": "devex-review",
|
||||
"skillMdBytes": 62815,
|
||||
"skillMdLines": 1259,
|
||||
"estTokens": 15704,
|
||||
"tmplBytes": 7984,
|
||||
"descriptionLen": 827,
|
||||
"hasGateEval": false,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"document-generate": {
|
||||
"skill": "document-generate",
|
||||
"skillMdBytes": 51386,
|
||||
"skillMdLines": 1204,
|
||||
"estTokens": 12847,
|
||||
"tmplBytes": 15093,
|
||||
"descriptionLen": 671,
|
||||
"hasGateEval": false,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"document-release": {
|
||||
"skill": "document-release",
|
||||
"skillMdBytes": 56652,
|
||||
"skillMdLines": 1262,
|
||||
"estTokens": 14163,
|
||||
"tmplBytes": 20362,
|
||||
"descriptionLen": 707,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"freeze": {
|
||||
"skill": "freeze",
|
||||
"skillMdBytes": 3134,
|
||||
"skillMdLines": 88,
|
||||
"estTokens": 784,
|
||||
"tmplBytes": 3038,
|
||||
"descriptionLen": 761,
|
||||
"hasGateEval": false,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"gstack-upgrade": {
|
||||
"skill": "gstack-upgrade",
|
||||
"skillMdBytes": 10794,
|
||||
"skillMdLines": 280,
|
||||
"estTokens": 2699,
|
||||
"tmplBytes": 10667,
|
||||
"descriptionLen": 439,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"guard": {
|
||||
"skill": "guard",
|
||||
"skillMdBytes": 3277,
|
||||
"skillMdLines": 88,
|
||||
"estTokens": 819,
|
||||
"tmplBytes": 3181,
|
||||
"descriptionLen": 968,
|
||||
"hasGateEval": false,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"health": {
|
||||
"skill": "health",
|
||||
"skillMdBytes": 46313,
|
||||
"skillMdLines": 1041,
|
||||
"estTokens": 11578,
|
||||
"tmplBytes": 11617,
|
||||
"descriptionLen": 463,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"investigate": {
|
||||
"skill": "investigate",
|
||||
"skillMdBytes": 48810,
|
||||
"skillMdLines": 1039,
|
||||
"estTokens": 12203,
|
||||
"tmplBytes": 11561,
|
||||
"descriptionLen": 1811,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"ios-clean": {
|
||||
"skill": "ios-clean",
|
||||
"skillMdBytes": 39447,
|
||||
"skillMdLines": 840,
|
||||
"estTokens": 9862,
|
||||
"tmplBytes": 3851,
|
||||
"descriptionLen": 761,
|
||||
"hasGateEval": false,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"ios-design-review": {
|
||||
"skill": "ios-design-review",
|
||||
"skillMdBytes": 40037,
|
||||
"skillMdLines": 841,
|
||||
"estTokens": 10009,
|
||||
"tmplBytes": 4417,
|
||||
"descriptionLen": 836,
|
||||
"hasGateEval": false,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"ios-fix": {
|
||||
"skill": "ios-fix",
|
||||
"skillMdBytes": 39164,
|
||||
"skillMdLines": 837,
|
||||
"estTokens": 9791,
|
||||
"tmplBytes": 3574,
|
||||
"descriptionLen": 767,
|
||||
"hasGateEval": false,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"ios-qa": {
|
||||
"skill": "ios-qa",
|
||||
"skillMdBytes": 45677,
|
||||
"skillMdLines": 957,
|
||||
"estTokens": 11419,
|
||||
"tmplBytes": 10090,
|
||||
"descriptionLen": 875,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"ios-sync": {
|
||||
"skill": "ios-sync",
|
||||
"skillMdBytes": 39137,
|
||||
"skillMdLines": 831,
|
||||
"estTokens": 9784,
|
||||
"tmplBytes": 3544,
|
||||
"descriptionLen": 727,
|
||||
"hasGateEval": false,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"land-and-deploy": {
|
||||
"skill": "land-and-deploy",
|
||||
"skillMdBytes": 90281,
|
||||
"skillMdLines": 1883,
|
||||
"estTokens": 22570,
|
||||
"tmplBytes": 48624,
|
||||
"descriptionLen": 378,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"landing-report": {
|
||||
"skill": "landing-report",
|
||||
"skillMdBytes": 42382,
|
||||
"skillMdLines": 901,
|
||||
"estTokens": 10596,
|
||||
"tmplBytes": 6806,
|
||||
"descriptionLen": 512,
|
||||
"hasGateEval": false,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"learn": {
|
||||
"skill": "learn",
|
||||
"skillMdBytes": 40119,
|
||||
"skillMdLines": 918,
|
||||
"estTokens": 10030,
|
||||
"tmplBytes": 5594,
|
||||
"descriptionLen": 460,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"make-pdf": {
|
||||
"skill": "make-pdf",
|
||||
"skillMdBytes": 28721,
|
||||
"skillMdLines": 644,
|
||||
"estTokens": 7180,
|
||||
"tmplBytes": 5106,
|
||||
"descriptionLen": 698,
|
||||
"hasGateEval": false,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"office-hours": {
|
||||
"skill": "office-hours",
|
||||
"skillMdBytes": 111088,
|
||||
"skillMdLines": 2090,
|
||||
"estTokens": 27772,
|
||||
"tmplBytes": 55466,
|
||||
"descriptionLen": 1579,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"open-gstack-browser": {
|
||||
"skill": "open-gstack-browser",
|
||||
"skillMdBytes": 44529,
|
||||
"skillMdLines": 981,
|
||||
"estTokens": 11132,
|
||||
"tmplBytes": 7702,
|
||||
"descriptionLen": 586,
|
||||
"hasGateEval": false,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"pair-agent": {
|
||||
"skill": "pair-agent",
|
||||
"skillMdBytes": 45339,
|
||||
"skillMdLines": 1036,
|
||||
"estTokens": 11335,
|
||||
"tmplBytes": 8548,
|
||||
"descriptionLen": 709,
|
||||
"hasGateEval": false,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"plan-ceo-review": {
|
||||
"skill": "plan-ceo-review",
|
||||
"skillMdBytes": 130891,
|
||||
"skillMdLines": 2224,
|
||||
"estTokens": 32723,
|
||||
"tmplBytes": 63393,
|
||||
"descriptionLen": 1326,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": true
|
||||
},
|
||||
"plan-design-review": {
|
||||
"skill": "plan-design-review",
|
||||
"skillMdBytes": 105592,
|
||||
"skillMdLines": 1944,
|
||||
"estTokens": 26398,
|
||||
"tmplBytes": 28624,
|
||||
"descriptionLen": 568,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": true
|
||||
},
|
||||
"plan-devex-review": {
|
||||
"skill": "plan-devex-review",
|
||||
"skillMdBytes": 104571,
|
||||
"skillMdLines": 2145,
|
||||
"estTokens": 26143,
|
||||
"tmplBytes": 35680,
|
||||
"descriptionLen": 886,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": true
|
||||
},
|
||||
"plan-eng-review": {
|
||||
"skill": "plan-eng-review",
|
||||
"skillMdBytes": 101409,
|
||||
"skillMdLines": 1788,
|
||||
"estTokens": 25352,
|
||||
"tmplBytes": 26234,
|
||||
"descriptionLen": 743,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": true
|
||||
},
|
||||
"plan-tune": {
|
||||
"skill": "plan-tune",
|
||||
"skillMdBytes": 50123,
|
||||
"skillMdLines": 1105,
|
||||
"estTokens": 12531,
|
||||
"tmplBytes": 15586,
|
||||
"descriptionLen": 997,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"qa": {
|
||||
"skill": "qa",
|
||||
"skillMdBytes": 72267,
|
||||
"skillMdLines": 1648,
|
||||
"estTokens": 18067,
|
||||
"tmplBytes": 12701,
|
||||
"descriptionLen": 814,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"qa-only": {
|
||||
"skill": "qa-only",
|
||||
"skillMdBytes": 54819,
|
||||
"skillMdLines": 1220,
|
||||
"estTokens": 13705,
|
||||
"tmplBytes": 3851,
|
||||
"descriptionLen": 605,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"retro": {
|
||||
"skill": "retro",
|
||||
"skillMdBytes": 81286,
|
||||
"skillMdLines": 1777,
|
||||
"estTokens": 20322,
|
||||
"tmplBytes": 42427,
|
||||
"descriptionLen": 979,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"review": {
|
||||
"skill": "review",
|
||||
"skillMdBytes": 92443,
|
||||
"skillMdLines": 1789,
|
||||
"estTokens": 23111,
|
||||
"tmplBytes": 14099,
|
||||
"descriptionLen": 512,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"scrape": {
|
||||
"skill": "scrape",
|
||||
"skillMdBytes": 42040,
|
||||
"skillMdLines": 914,
|
||||
"estTokens": 10510,
|
||||
"tmplBytes": 5220,
|
||||
"descriptionLen": 519,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"setup-browser-cookies": {
|
||||
"skill": "setup-browser-cookies",
|
||||
"skillMdBytes": 25886,
|
||||
"skillMdLines": 577,
|
||||
"estTokens": 6472,
|
||||
"tmplBytes": 2724,
|
||||
"descriptionLen": 433,
|
||||
"hasGateEval": false,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"setup-deploy": {
|
||||
"skill": "setup-deploy",
|
||||
"skillMdBytes": 42326,
|
||||
"skillMdLines": 946,
|
||||
"estTokens": 10582,
|
||||
"tmplBytes": 7780,
|
||||
"descriptionLen": 564,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"setup-gbrain": {
|
||||
"skill": "setup-gbrain",
|
||||
"skillMdBytes": 76791,
|
||||
"skillMdLines": 1733,
|
||||
"estTokens": 19198,
|
||||
"tmplBytes": 42245,
|
||||
"descriptionLen": 512,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"ship": {
|
||||
"skill": "ship",
|
||||
"skillMdBytes": 163553,
|
||||
"skillMdLines": 3094,
|
||||
"estTokens": 40888,
|
||||
"tmplBytes": 48869,
|
||||
"descriptionLen": 557,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": true
|
||||
},
|
||||
"skillify": {
|
||||
"skill": "skillify",
|
||||
"skillMdBytes": 51935,
|
||||
"skillMdLines": 1196,
|
||||
"estTokens": 12984,
|
||||
"tmplBytes": 15107,
|
||||
"descriptionLen": 571,
|
||||
"hasGateEval": true,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"sync-gbrain": {
|
||||
"skill": "sync-gbrain",
|
||||
"skillMdBytes": 48555,
|
||||
"skillMdLines": 1057,
|
||||
"estTokens": 12139,
|
||||
"tmplBytes": 13996,
|
||||
"descriptionLen": 510,
|
||||
"hasGateEval": false,
|
||||
"hasPeriodicEval": false
|
||||
},
|
||||
"unfreeze": {
|
||||
"skill": "unfreeze",
|
||||
"skillMdBytes": 1482,
|
||||
"skillMdLines": 46,
|
||||
"estTokens": 371,
|
||||
"tmplBytes": 1386,
|
||||
"descriptionLen": 350,
|
||||
"hasGateEval": false,
|
||||
"hasPeriodicEval": false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,90 @@
|
|||
/**
|
||||
* Unit tests for parity baseline capture.
|
||||
*
|
||||
* Free. Reads the live repo state via captureBaseline() and asserts
|
||||
* shape + invariants, not specific numbers (which drift release-over-release).
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import { captureBaseline, diffBaselines, type ParityBaseline } from './capture-parity-baseline';
|
||||
|
||||
const REPO_ROOT = path.resolve(import.meta.dir, '..', '..');
|
||||
|
||||
describe('capture-parity-baseline', () => {
  // Shape + invariants of a live capture; no specific numbers are pinned.
  test('produces a shaped baseline for the current repo', () => {
    const baseline = captureBaseline({ repoRoot: REPO_ROOT, tag: 'unit-test' });
    expect(baseline.tag).toBe('unit-test');
    expect(baseline.totalSkills).toBeGreaterThan(20);
    expect(baseline.totalCorpusBytes).toBeGreaterThan(100_000);
    expect(baseline.topHeaviest.length).toBeGreaterThan(0);
    expect(baseline.topHeaviest.length).toBeLessThanOrEqual(10);
    expect(baseline.topHeaviest[0]!.skillMdBytes).toBeGreaterThan(0);
    // Top 1 should be ≥ Top 2 (sort invariant)
    if (baseline.topHeaviest.length >= 2) {
      expect(baseline.topHeaviest[0]!.skillMdBytes).toBeGreaterThanOrEqual(
        baseline.topHeaviest[1]!.skillMdBytes,
      );
    }
  });

  test('each skill entry has byte + line + token estimates', () => {
    const baseline = captureBaseline({ repoRoot: REPO_ROOT });
    for (const skill of Object.values(baseline.skills)) {
      expect(skill.skillMdBytes).toBeGreaterThan(0);
      expect(skill.skillMdLines).toBeGreaterThan(0);
      expect(skill.estTokens).toBeGreaterThan(0);
      // ~4 chars/token heuristic, rounded to an integer: the estimate may
      // differ from bytes / 4 by at most 0.5. (toBeCloseTo(x, -2) would
      // accept anything within ±50 tokens, which hides real drift.)
      expect(Math.abs(skill.estTokens - skill.skillMdBytes / 4)).toBeLessThanOrEqual(0.5);
    }
  });

  // Pure-function check on hand-built baselines; does not touch the repo.
  test('diffBaselines returns expected deltas', () => {
    const before: ParityBaseline = {
      tag: 'before',
      capturedAt: '2026-01-01T00:00:00Z',
      capturedFromCommit: 'abc',
      capturedFromBranch: 'main',
      totalSkills: 2,
      totalCorpusBytes: 1000,
      estTotalCatalogTokens: 100,
      topHeaviest: [],
      skills: {
        foo: { skill: 'foo', skillMdBytes: 600, skillMdLines: 10, estTokens: 150, tmplBytes: 300, descriptionLen: 50, hasGateEval: true, hasPeriodicEval: false },
        bar: { skill: 'bar', skillMdBytes: 400, skillMdLines: 8, estTokens: 100, tmplBytes: 200, descriptionLen: 30, hasGateEval: false, hasPeriodicEval: false },
      },
    };
    const after: ParityBaseline = {
      ...before,
      tag: 'after',
      totalCorpusBytes: 700,
      estTotalCatalogTokens: 60,
      skills: {
        foo: { ...before.skills.foo!, skillMdBytes: 400 },
        bar: { ...before.skills.bar!, skillMdBytes: 300 },
      },
    };
    const diff = diffBaselines(before, after);
    expect(diff.totalCorpusDelta).toBe(-300);
    expect(diff.totalCorpusDeltaPct).toBeCloseTo(-30, 1);
    expect(diff.catalogTokensDelta).toBe(-40);
    expect(diff.perSkill.length).toBe(2);
    // Sorted by abs delta descending
    expect(diff.perSkill[0]!.skill).toBe('foo');
    expect(diff.perSkill[0]!.deltaBytes).toBe(-200);
    expect(diff.perSkill[1]!.skill).toBe('bar');
  });

  test('v1.44.1 baseline file exists with expected shape', () => {
    const baselinePath = path.join(REPO_ROOT, 'test', 'fixtures', 'parity-baseline-v1.44.1.json');
    expect(fs.existsSync(baselinePath)).toBe(true);
    const baseline = JSON.parse(fs.readFileSync(baselinePath, 'utf-8')) as ParityBaseline;
    expect(baseline.tag).toBe('v1.44.1');
    expect(baseline.totalSkills).toBeGreaterThan(40);
    // Document the v1.44.1 snapshot as the v1→v2 baseline reference.
    // Compression in v1.45+ should drop totalCorpusBytes; this assertion
    // anchors the "v1 was XX MB" claim in the CHANGELOG to a real file.
    expect(baseline.totalCorpusBytes).toBeGreaterThan(2_000_000);
  });
});
|
||||
|
|
@ -0,0 +1,231 @@
|
|||
/**
|
||||
* Parity baseline capture — cathedral parity-eval suite primitive.
|
||||
*
|
||||
* Snapshots the current state of every top-level SKILL.md: byte count, line
|
||||
* count, estimated token count, frontmatter description length, eval
|
||||
* coverage. The output JSON is the v1.44 baseline that v2 must beat on
|
||||
* compression AND match (or exceed) on parity.
|
||||
*
|
||||
* The numbers quoted in the v2.0.0.0 CHANGELOG numbers table are read
|
||||
* from a baseline JSON captured by this script. Never invent baseline
|
||||
* numbers; ship them only if they came from a real captureBaseline() run.
|
||||
*
|
||||
* Usage:
|
||||
* bun run scripts/capture-baseline.ts # write default path
|
||||
* bun run scripts/capture-baseline.ts --out PATH # write custom path
|
||||
* bun run scripts/capture-baseline.ts --tag v1.44.1 # tag the snapshot
|
||||
*/
|
||||
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import { execSync } from 'child_process';
|
||||
|
||||
/** Size and eval-coverage metrics recorded for one skill directory. */
export interface SkillBaselineEntry {
  /** Name of the directory that contains the SKILL.md. */
  skill: string;
  /** UTF-8 byte length of SKILL.md. */
  skillMdBytes: number;
  /** Number of newline-separated segments in SKILL.md. */
  skillMdLines: number;
  /** Token estimate using the ~4 chars/token heuristic. */
  estTokens: number;
  /** UTF-8 byte length of SKILL.md.tmpl, or null when no template file exists. */
  tmplBytes: number | null;
  /** Bytes in the frontmatter description field. */
  descriptionLen: number;
  /** True when a gate-tier E2E test file was matched to this skill. */
  hasGateEval: boolean;
  /** True when a periodic-tier E2E test file was matched to this skill. */
  hasPeriodicEval: boolean;
}

/** A full snapshot: capture metadata, corpus totals, and per-skill entries. */
export interface ParityBaseline {
  tag: string;
  /** ISO-8601 timestamp of the capture. */
  capturedAt: string;
  capturedFromCommit: string;
  capturedFromBranch: string;
  totalSkills: number;
  totalCorpusBytes: number;
  /** Sum of all description lengths / 4. */
  estTotalCatalogTokens: number;
  /** Heaviest skills, sorted descending by skillMdBytes. */
  topHeaviest: SkillBaselineEntry[];
  /** Every captured skill, keyed by skill name. */
  skills: Record<string, SkillBaselineEntry>;
}

/** Inputs accepted by captureBaseline(). */
export interface CaptureOptions {
  /** Directory whose top-level sub-directories are scanned for SKILL.md. */
  repoRoot: string;
  /** Optional label stored in the snapshot's `tag` field. */
  tag?: string;
}
|
||||
|
||||
/**
 * Extract the frontmatter `description` value from a SKILL.md file.
 *
 * Supports a single-line value (`description: text`) and an indented
 * multi-line value, including YAML block-scalar headers such as `|`, `>`,
 * `|-` and `>-`. Continuation lines are trimmed and joined with `\n`.
 *
 * The description ends at the first non-indented line, i.e. the next
 * top-level frontmatter key. This covers hyphenated keys (`allowed-tools:`)
 * and keys with no inline value (`triggers:` followed by a list), which a
 * `\w+:\s` check misses and which would otherwise be counted as description.
 *
 * NOTE: description lengths captured before this termination rule was fixed
 * may be inflated; re-capture older baselines before comparing
 * description-derived fields.
 *
 * @param content Full text of the SKILL.md file.
 * @returns The description text, or an empty string when the file has no
 *   frontmatter block or no `description` key.
 */
function extractDescription(content: string): string {
  // Drop a leading BOM and normalise CRLF so Windows checkouts parse like LF ones.
  const text = content.replace(/^\uFEFF/, '').replace(/\r\n/g, '\n');
  if (!text.startsWith('---\n')) return '';
  // Search from index 3 so an empty frontmatter block ("---\n---") still closes.
  const fmEnd = text.indexOf('\n---', 3);
  if (fmEnd === -1) return '';
  const frontmatter = text.slice(4, fmEnd);

  const descLines: string[] = [];
  let inDescription = false;
  for (const line of frontmatter.split('\n')) {
    if (!inDescription) {
      // YAML needs whitespace after the colon for an inline value.
      const keyMatch = /^description:(?:\s+(.*))?$/.exec(line);
      if (!keyMatch) continue;
      inDescription = true;
      const inline = (keyMatch[1] ?? '').trim();
      // A block-scalar header (|, >, with optional chomping/indent indicators)
      // is syntax, not description text.
      const isBlockHeader = /^[|>][+\-1-9]{0,2}$/.test(inline);
      if (inline !== '' && !isBlockHeader) descLines.push(inline);
      continue;
    }
    // Continuation lines are indented (or blank); anything at column 0 is the next key.
    if (/^\S/.test(line)) break;
    descLines.push(line.trim());
  }
  return descLines.join('\n').trim();
}
|
||||
|
||||
/**
 * Rough token estimate for a byte count: one token per four bytes,
 * rounded to the nearest integer.
 */
function estimateTokens(bytes: number): number {
  const approxBytesPerToken = 4;
  const estimate = bytes / approxBytesPerToken;
  return Math.round(estimate);
}
|
||||
|
||||
/**
 * List the top-level directories of `repoRoot` that contain a SKILL.md.
 *
 * Dot-directories, `node_modules` and `docs` are never considered.
 *
 * @returns Directory names (not paths), sorted ascending.
 */
function discoverSkillDirs(repoRoot: string): string[] {
  const excluded = new Set(['node_modules', 'docs']);
  return fs
    .readdirSync(repoRoot, { withFileTypes: true })
    .filter(entry => entry.isDirectory() && !entry.name.startsWith('.') && !excluded.has(entry.name))
    .map(entry => entry.name)
    .filter(name => fs.existsSync(path.join(repoRoot, name, 'SKILL.md')))
    .sort();
}
|
||||
|
||||
/**
 * Infer which skills have E2E eval coverage by scanning `test/`.
 *
 * Only files directly inside `<repoRoot>/test` named `skill-e2e-*.test.ts`
 * are read. A skill counts as covered by a file when the file text contains
 * the skill name in slash form (`/name`, not followed by a further word
 * character or hyphen) or as a quoted string.
 *
 * Tier is inferred from the file name: names containing `chain`, `multi`,
 * `idempotency` or `finding-floor` count as periodic, everything else as gate.
 *
 * @param repoRoot Repository root containing the `test` directory.
 * @param skills Skill names to look for.
 * @returns Sets of skill names with gate / periodic coverage. Both are empty
 *   when `test/` does not exist.
 */
function discoverEvalCoverage(repoRoot: string, skills: string[]): {
  gate: Set<string>;
  periodic: Set<string>;
} {
  const gate = new Set<string>();
  const periodic = new Set<string>();
  const testDir = path.join(repoRoot, 'test');
  if (!fs.existsSync(testDir)) return { gate, periodic };

  // Build one matcher per skill up front instead of once per (file, skill) pair.
  const matchers = skills.map(skill => {
    // Escape regex metacharacters so a name like "a.b" is matched literally.
    const name = skill.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    // The (?![\w-]) lookahead stops "/qa" from matching "/qa-only" and
    // "/learn" from matching "/learnings".
    const re = new RegExp(
      `(/${name}(?![\\w-])|['"\`]${name}['"\`]|skill[s]?[/=:]\\s*['"\`]${name}['"\`])`,
    );
    return { skill, re };
  });

  const periodicMarkers = ['chain', 'multi', 'idempotency', 'finding-floor'];
  const testFiles = fs
    .readdirSync(testDir)
    .filter(f => f.startsWith('skill-e2e-') && f.endsWith('.test.ts'));

  for (const file of testFiles) {
    const content = fs.readFileSync(path.join(testDir, file), 'utf-8');
    // Crude tier inference from the file name alone.
    const tier = periodicMarkers.some(marker => file.includes(marker)) ? periodic : gate;
    for (const { skill, re } of matchers) {
      if (re.test(content)) tier.add(skill);
    }
  }
  return { gate, periodic };
}
|
||||
|
||||
/**
 * Read the short HEAD commit hash and current branch name via `git`.
 *
 * Best-effort: each value falls back to `'unknown'` independently when its
 * git command fails (git missing, not a repository, unreadable cwd) or
 * prints nothing. Git's stderr is discarded so a failed lookup does not
 * write "fatal: ..." noise to the caller's terminal.
 *
 * @param repoRoot Directory the git commands are run in.
 */
function getGitInfo(repoRoot: string): { commit: string; branch: string } {
  const revParse = (args: string): string => {
    try {
      const out = execSync(`git rev-parse ${args}`, {
        cwd: repoRoot,
        encoding: 'utf-8',
        // stdin unused, stdout captured, stderr dropped.
        stdio: ['ignore', 'pipe', 'ignore'],
      }).trim();
      return out === '' ? 'unknown' : out;
    } catch {
      return 'unknown';
    }
  };
  return {
    commit: revParse('--short HEAD'),
    branch: revParse('--abbrev-ref HEAD'),
  };
}
|
||||
|
||||
/**
 * Capture a parity baseline for every skill directory under `opts.repoRoot`.
 *
 * For each discovered skill this reads SKILL.md (and SKILL.md.tmpl when
 * present) and records byte, line and estimated-token counts, the
 * frontmatter description length, and inferred eval coverage. Corpus totals,
 * the ten heaviest skills and git metadata are added on top.
 *
 * @param opts Repository root plus an optional tag (stored as 'untagged' when omitted).
 * @returns The assembled snapshot; nothing is written to disk here.
 */
export function captureBaseline(opts: CaptureOptions): ParityBaseline {
  const { repoRoot, tag } = opts;
  const skillDirs = discoverSkillDirs(repoRoot);
  const coverage = discoverEvalCoverage(repoRoot, skillDirs);

  const skills: Record<string, SkillBaselineEntry> = {};
  for (const name of skillDirs) {
    const skillDir = path.join(repoRoot, name);
    const markdown = fs.readFileSync(path.join(skillDir, 'SKILL.md'), 'utf-8');
    const skillMdBytes = Buffer.byteLength(markdown, 'utf-8');
    const descriptionLen = Buffer.byteLength(extractDescription(markdown), 'utf-8');

    // The template is optional; null records its absence.
    const tmplPath = path.join(skillDir, 'SKILL.md.tmpl');
    let tmplBytes: number | null = null;
    if (fs.existsSync(tmplPath)) {
      tmplBytes = Buffer.byteLength(fs.readFileSync(tmplPath, 'utf-8'), 'utf-8');
    }

    skills[name] = {
      skill: name,
      skillMdBytes,
      skillMdLines: markdown.split('\n').length,
      estTokens: estimateTokens(skillMdBytes),
      tmplBytes,
      descriptionLen,
      hasGateEval: coverage.gate.has(name),
      hasPeriodicEval: coverage.periodic.has(name),
    };
  }

  const entries = Object.values(skills);
  const totalCorpusBytes = entries.reduce((sum, e) => sum + e.skillMdBytes, 0);
  const totalDescriptionBytes = entries.reduce((sum, e) => sum + e.descriptionLen, 0);
  // Largest SKILL.md first, capped at ten entries.
  const topHeaviest = [...entries]
    .sort((a, b) => b.skillMdBytes - a.skillMdBytes)
    .slice(0, 10);

  const { commit, branch } = getGitInfo(repoRoot);
  return {
    tag: tag ?? 'untagged',
    capturedAt: new Date().toISOString(),
    capturedFromCommit: commit,
    capturedFromBranch: branch,
    totalSkills: skillDirs.length,
    totalCorpusBytes,
    estTotalCatalogTokens: estimateTokens(totalDescriptionBytes),
    topHeaviest,
    skills,
  };
}
|
||||
|
||||
/** Result of comparing two baselines: corpus-level deltas plus one row per skill. */
export interface BaselineDiff {
  totalCorpusDelta: number;
  totalCorpusDeltaPct: number;
  catalogTokensDelta: number;
  catalogTokensDeltaPct: number;
  perSkill: Array<{
    skill: string;
    beforeBytes: number;
    afterBytes: number;
    deltaBytes: number;
    deltaPct: number;
  }>;
}

/**
 * Diff two baselines; useful for v2 vs v1.44 deltas.
 *
 * Deltas are `after - before`. Percentages are relative to the `before`
 * value and reported as 0 when that value is 0. A skill present in only one
 * baseline is treated as 0 bytes on the other side. `perSkill` is sorted by
 * absolute byte delta, largest first.
 */
export function diffBaselines(before: ParityBaseline, after: ParityBaseline): BaselineDiff {
  // Percentage change against `base`; a zero base yields 0 rather than Infinity/NaN.
  const pctOf = (delta: number, base: number): number => (base ? (delta / base) * 100 : 0);

  const totalCorpusDelta = after.totalCorpusBytes - before.totalCorpusBytes;
  const catalogTokensDelta = after.estTotalCatalogTokens - before.estTotalCatalogTokens;

  const skillNames = new Set([...Object.keys(before.skills), ...Object.keys(after.skills)]);
  const perSkill: BaselineDiff['perSkill'] = Array.from(skillNames, skill => {
    const beforeBytes = before.skills[skill]?.skillMdBytes ?? 0;
    const afterBytes = after.skills[skill]?.skillMdBytes ?? 0;
    const deltaBytes = afterBytes - beforeBytes;
    return {
      skill,
      beforeBytes,
      afterBytes,
      deltaBytes,
      deltaPct: pctOf(deltaBytes, beforeBytes),
    };
  });
  perSkill.sort((x, y) => Math.abs(y.deltaBytes) - Math.abs(x.deltaBytes));

  return {
    totalCorpusDelta,
    totalCorpusDeltaPct: pctOf(totalCorpusDelta, before.totalCorpusBytes),
    catalogTokensDelta,
    catalogTokensDeltaPct: pctOf(catalogTokensDelta, before.estTotalCatalogTokens),
    perSkill,
  };
}
|
||||
Loading…
Reference in New Issue