mirror of https://github.com/garrytan/gstack.git
232 lines
8.1 KiB
TypeScript
232 lines
8.1 KiB
TypeScript
/**
|
|
* Parity baseline capture — cathedral parity-eval suite primitive.
|
|
*
|
|
* Snapshots the current state of every top-level SKILL.md: byte count, line
|
|
* count, estimated token count, frontmatter description length, eval
|
|
* coverage. The output JSON is the v1.44 baseline that v2 must beat on
|
|
* compression AND match (or exceed) on parity.
|
|
*
|
|
* The numbers quoted in the v2.0.0.0 CHANGELOG numbers table are read
|
|
* from a baseline JSON captured by this script. Never invent baseline
|
|
* numbers; ship them only if they came from a real captureBaseline() run.
|
|
*
|
|
* Usage:
|
|
* bun run scripts/capture-baseline.ts # write default path
|
|
* bun run scripts/capture-baseline.ts --out PATH # write custom path
|
|
* bun run scripts/capture-baseline.ts --tag v1.44.1 # tag the snapshot
|
|
*/
|
|
|
|
import * as fs from 'fs';
|
|
import * as path from 'path';
|
|
import { execSync } from 'child_process';
|
|
|
|
export interface SkillBaselineEntry {
|
|
skill: string;
|
|
skillMdBytes: number;
|
|
skillMdLines: number;
|
|
estTokens: number; // ~4 chars/token heuristic
|
|
tmplBytes: number | null; // null when no .tmpl exists (vendored or non-Claude)
|
|
descriptionLen: number; // bytes in frontmatter description field
|
|
hasGateEval: boolean;
|
|
hasPeriodicEval: boolean;
|
|
}
|
|
|
|
export interface ParityBaseline {
|
|
tag: string;
|
|
capturedAt: string;
|
|
capturedFromCommit: string;
|
|
capturedFromBranch: string;
|
|
totalSkills: number;
|
|
totalCorpusBytes: number;
|
|
estTotalCatalogTokens: number; // sum of all description lengths / 4
|
|
topHeaviest: SkillBaselineEntry[]; // sorted desc by skillMdBytes
|
|
skills: Record<string, SkillBaselineEntry>;
|
|
}
|
|
|
|
export interface CaptureOptions {
|
|
repoRoot: string;
|
|
tag?: string;
|
|
}
|
|
|
|
/** Extract the frontmatter description from a SKILL.md file. Empty string if none. */
|
|
function extractDescription(content: string): string {
|
|
if (!content.startsWith('---\n')) return '';
|
|
const fmEnd = content.indexOf('\n---', 4);
|
|
if (fmEnd === -1) return '';
|
|
const frontmatter = content.slice(4, fmEnd);
|
|
const lines = frontmatter.split('\n');
|
|
let inDescription = false;
|
|
const descLines: string[] = [];
|
|
for (const line of lines) {
|
|
if (line.match(/^description:\s*\|?\s*$/)) {
|
|
inDescription = true;
|
|
continue;
|
|
}
|
|
if (line.match(/^description:\s+/)) {
|
|
descLines.push(line.replace(/^description:\s+/, ''));
|
|
inDescription = true;
|
|
continue;
|
|
}
|
|
if (inDescription) {
|
|
if (line.match(/^\w+:\s/)) break;
|
|
descLines.push(line.trim());
|
|
}
|
|
}
|
|
return descLines.join('\n').trim();
|
|
}
|
|
|
|
/** Estimate token count via 4 chars/token. Crude but matches existing budget-regression usage. */
|
|
function estimateTokens(bytes: number): number {
|
|
return Math.round(bytes / 4);
|
|
}
|
|
|
|
/** Find which top-level directories contain a SKILL.md (skills we capture). */
|
|
function discoverSkillDirs(repoRoot: string): string[] {
|
|
const entries = fs.readdirSync(repoRoot, { withFileTypes: true });
|
|
const dirs: string[] = [];
|
|
for (const e of entries) {
|
|
if (!e.isDirectory()) continue;
|
|
if (e.name.startsWith('.')) continue;
|
|
if (e.name === 'node_modules' || e.name === 'docs') continue;
|
|
const skillMd = path.join(repoRoot, e.name, 'SKILL.md');
|
|
if (fs.existsSync(skillMd)) dirs.push(e.name);
|
|
}
|
|
return dirs.sort();
|
|
}
|
|
|
|
/** Check whether a skill has E2E gate / periodic eval coverage by scanning test/. */
|
|
function discoverEvalCoverage(repoRoot: string, skills: string[]): {
|
|
gate: Set<string>;
|
|
periodic: Set<string>;
|
|
} {
|
|
const gate = new Set<string>();
|
|
const periodic = new Set<string>();
|
|
const testDir = path.join(repoRoot, 'test');
|
|
if (!fs.existsSync(testDir)) return { gate, periodic };
|
|
const testFiles = fs.readdirSync(testDir).filter(f => f.startsWith('skill-e2e-') && f.endsWith('.test.ts'));
|
|
// Try to map each test file to a skill by reading its contents for skill names.
|
|
for (const file of testFiles) {
|
|
const content = fs.readFileSync(path.join(testDir, file), 'utf-8');
|
|
for (const skill of skills) {
|
|
// Match the skill name as a word boundary, also try /skill-name slash form.
|
|
const re = new RegExp(`(/${skill}|['"\`]${skill}['"\`]|skill[s]?[/=:]\\s*['"\`]${skill}['"\`])`);
|
|
if (re.test(content)) {
|
|
// Crude tier inference: if file name contains "regression" / known-periodic markers, classify periodic.
|
|
if (file.includes('chain') || file.includes('multi') || file.includes('idempotency') || file.includes('finding-floor')) {
|
|
periodic.add(skill);
|
|
} else {
|
|
gate.add(skill);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return { gate, periodic };
|
|
}
|
|
|
|
function getGitInfo(repoRoot: string): { commit: string; branch: string } {
|
|
try {
|
|
const commit = execSync('git rev-parse --short HEAD', { cwd: repoRoot, encoding: 'utf-8' }).trim();
|
|
const branch = execSync('git rev-parse --abbrev-ref HEAD', { cwd: repoRoot, encoding: 'utf-8' }).trim();
|
|
return { commit, branch };
|
|
} catch {
|
|
return { commit: 'unknown', branch: 'unknown' };
|
|
}
|
|
}
|
|
|
|
export function captureBaseline(opts: CaptureOptions): ParityBaseline {
|
|
const { repoRoot, tag } = opts;
|
|
const skillDirs = discoverSkillDirs(repoRoot);
|
|
const evalCoverage = discoverEvalCoverage(repoRoot, skillDirs);
|
|
const skills: Record<string, SkillBaselineEntry> = {};
|
|
let totalCorpusBytes = 0;
|
|
let totalDescriptionBytes = 0;
|
|
for (const dir of skillDirs) {
|
|
const skillMdPath = path.join(repoRoot, dir, 'SKILL.md');
|
|
const tmplPath = path.join(repoRoot, dir, 'SKILL.md.tmpl');
|
|
const content = fs.readFileSync(skillMdPath, 'utf-8');
|
|
const bytes = Buffer.byteLength(content, 'utf-8');
|
|
const lines = content.split('\n').length;
|
|
const description = extractDescription(content);
|
|
const descriptionLen = Buffer.byteLength(description, 'utf-8');
|
|
const tmplBytes = fs.existsSync(tmplPath)
|
|
? Buffer.byteLength(fs.readFileSync(tmplPath, 'utf-8'), 'utf-8')
|
|
: null;
|
|
const entry: SkillBaselineEntry = {
|
|
skill: dir,
|
|
skillMdBytes: bytes,
|
|
skillMdLines: lines,
|
|
estTokens: estimateTokens(bytes),
|
|
tmplBytes,
|
|
descriptionLen,
|
|
hasGateEval: evalCoverage.gate.has(dir),
|
|
hasPeriodicEval: evalCoverage.periodic.has(dir),
|
|
};
|
|
skills[dir] = entry;
|
|
totalCorpusBytes += bytes;
|
|
totalDescriptionBytes += descriptionLen;
|
|
}
|
|
const topHeaviest = Object.values(skills)
|
|
.slice()
|
|
.sort((a, b) => b.skillMdBytes - a.skillMdBytes)
|
|
.slice(0, 10);
|
|
const git = getGitInfo(repoRoot);
|
|
return {
|
|
tag: tag ?? 'untagged',
|
|
capturedAt: new Date().toISOString(),
|
|
capturedFromCommit: git.commit,
|
|
capturedFromBranch: git.branch,
|
|
totalSkills: skillDirs.length,
|
|
totalCorpusBytes,
|
|
estTotalCatalogTokens: estimateTokens(totalDescriptionBytes),
|
|
topHeaviest,
|
|
skills,
|
|
};
|
|
}
|
|
|
|
/** Diff two baselines; useful for v2 vs v1.44 deltas. */
|
|
export interface BaselineDiff {
|
|
totalCorpusDelta: number;
|
|
totalCorpusDeltaPct: number;
|
|
catalogTokensDelta: number;
|
|
catalogTokensDeltaPct: number;
|
|
perSkill: Array<{
|
|
skill: string;
|
|
beforeBytes: number;
|
|
afterBytes: number;
|
|
deltaBytes: number;
|
|
deltaPct: number;
|
|
}>;
|
|
}
|
|
|
|
export function diffBaselines(before: ParityBaseline, after: ParityBaseline): BaselineDiff {
|
|
const totalCorpusDelta = after.totalCorpusBytes - before.totalCorpusBytes;
|
|
const totalCorpusDeltaPct = before.totalCorpusBytes
|
|
? (totalCorpusDelta / before.totalCorpusBytes) * 100
|
|
: 0;
|
|
const catalogTokensDelta = after.estTotalCatalogTokens - before.estTotalCatalogTokens;
|
|
const catalogTokensDeltaPct = before.estTotalCatalogTokens
|
|
? (catalogTokensDelta / before.estTotalCatalogTokens) * 100
|
|
: 0;
|
|
const perSkill: BaselineDiff['perSkill'] = [];
|
|
const allSkills = new Set([...Object.keys(before.skills), ...Object.keys(after.skills)]);
|
|
for (const skill of allSkills) {
|
|
const b = before.skills[skill]?.skillMdBytes ?? 0;
|
|
const a = after.skills[skill]?.skillMdBytes ?? 0;
|
|
perSkill.push({
|
|
skill,
|
|
beforeBytes: b,
|
|
afterBytes: a,
|
|
deltaBytes: a - b,
|
|
deltaPct: b ? ((a - b) / b) * 100 : 0,
|
|
});
|
|
}
|
|
perSkill.sort((x, y) => Math.abs(y.deltaBytes) - Math.abs(x.deltaBytes));
|
|
return {
|
|
totalCorpusDelta,
|
|
totalCorpusDeltaPct,
|
|
catalogTokensDelta,
|
|
catalogTokensDeltaPct,
|
|
perSkill,
|
|
};
|
|
}
|