mirror of https://github.com/garrytan/gstack.git
test(parity): T0b — cathedral parity-suite harness + invariant registry
Adds the harness that the v2_PLAN.md cathedral parity-eval suite is built
on. Compares CURRENT SKILL.md output to v1.44.1 baseline along three axes:
- STRUCTURE: frontmatter shape (catalog trim landed, "## When to invoke" present)
- CONTENT: must-preserve phrases per skill family (cso: OWASP/STRIDE;
  plan-ceo: SCOPE EXPANSION/HOLD SCOPE/REDUCTION; ship:
  VERSION/CHANGELOG/PR; etc.)
- SIZE: per-skill byte budget (maxSizeRatio + minBytes guards)
PARITY_INVARIANTS registry pins 10 load-bearing skills (cso, ship, plan-*-
review, review, qa, investigate, office-hours, autoplan). Each entry
declares what must NOT regress; future compression that strips these
phrases or shrinks a skill past its minBytes cliff fails CI.
Periodic-tier LLM-judge parity (paid, ~$0.20/skill) lands in v2.0.0.0
sections/ phase. Same registry, same harness, judge added on top.
Test plan:
- bun test test/parity-suite.test.ts: 10/10 invariants pass vs v1.44.1
- Per-skill failures get actionable per-line breakdown so a reviewer can
see which phrase / heading / size limit went sideways
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
6d48d23ba7
commit
ebebc95a34
|
|
@ -0,0 +1,230 @@
|
|||
/**
|
||||
* Cathedral parity-eval harness (v1.45.0.0 T0b).
|
||||
*
|
||||
* Compares CURRENT SKILL.md output to a v1.44.1 golden baseline along three
|
||||
* axes: STRUCTURE (frontmatter shape), CONTENT (must-preserve phrases per
|
||||
* skill family), and SIZE (per-skill byte budget). The fourth axis —
|
||||
* BEHAVIORAL parity via LLM-as-judge — runs on top of this harness in the
|
||||
* periodic-tier eval suite (paid, ~$0.20 per skill judge call).
|
||||
*
|
||||
* The structural + content checks ship in v1.45.0.0 as the foundation; the
|
||||
* LLM-judge layer lands in v2.0.0.0 alongside the sections/ pattern. Both
|
||||
* use this module's APIs.
|
||||
*
|
||||
* Why a separate harness from skill-size-budget.test.ts: that one enforces
|
||||
* size discipline only. This module supports content invariants per skill
|
||||
* family (e.g., cso must preserve OWASP/STRIDE; plan-ceo must preserve
|
||||
* mode-selection phrasing) so future compression can't silently strip
|
||||
* load-bearing prose even when size stays within ratio.
|
||||
*/
|
||||
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import type { ParityBaseline, SkillBaselineEntry } from './capture-parity-baseline';
|
||||
import { captureBaseline } from './capture-parity-baseline';
|
||||
|
||||
/**
 * Declares what must not regress for one skill's generated SKILL.md.
 * Every rule except `skill` is optional; an omitted rule is simply not checked.
 */
export interface ParityInvariant {
  /** Skill name. Used as the key into the baseline and as the skill's directory under the repo root. */
  skill: string;

  /** Phrases that MUST appear in the generated SKILL.md (case-insensitive substring). */
  mustContain?: string[];

  /** Markdown H2 headings that MUST appear. */
  mustHaveHeadings?: string[];

  /** Upper bound on current-bytes / baseline-bytes. 1.0 = no growth allowed. */
  maxSizeRatio?: number;

  /** Lower bound on the current byte size (catches over-stripping cliffs). */
  minBytes?: number;
}
|
||||
|
||||
/** Outcome of checking a single skill against its invariant. */
export interface ParityCheckResult {
  /** Name of the skill that was checked. */
  skill: string;

  /** True only when `failures` is empty. */
  passed: boolean;

  /** One human-readable line per violated rule. */
  failures: string[];
}
|
||||
|
||||
/**
 * Checks one skill against a single invariant and collects every violation.
 *
 * SIZE rules compare the byte counts carried by `current` and `baseline`.
 * CONTENT rules re-read `<repoRoot>/<skill>/SKILL.md` from disk so they see
 * the live file rather than captured metadata.
 *
 * @param invariant - Rules to enforce for this skill.
 * @param current - Current capture for the skill; only `skillMdBytes` is read.
 * @param baseline - Baseline capture, or `undefined` when the baseline has no
 *   entry for the skill (the ratio rule is then skipped).
 * @param repoRoot - Directory that contains the per-skill folders.
 * @returns A result whose `failures` lists each violated rule; `passed` is
 *   true only when that list is empty. Read errors are reported as failures,
 *   never thrown.
 */
export function checkSkillParity(
  invariant: ParityInvariant,
  current: SkillBaselineEntry,
  baseline: SkillBaselineEntry | undefined,
  repoRoot: string,
): ParityCheckResult {
  const failures: string[] = [];

  // SIZE checks
  if (invariant.maxSizeRatio !== undefined && baseline) {
    const ratio = current.skillMdBytes / baseline.skillMdBytes;
    if (ratio > invariant.maxSizeRatio) {
      failures.push(`size ratio ${ratio.toFixed(3)} > maxSizeRatio ${invariant.maxSizeRatio}`);
    }
  }
  if (invariant.minBytes !== undefined && current.skillMdBytes < invariant.minBytes) {
    failures.push(`size ${current.skillMdBytes} < minBytes ${invariant.minBytes}`);
  }

  // CONTENT checks (read live file for fresh content)
  const phrases = invariant.mustContain ?? [];
  const headings = invariant.mustHaveHeadings ?? [];
  if (phrases.length > 0 || headings.length > 0) {
    const skillMdPath = path.join(repoRoot, invariant.skill, 'SKILL.md');
    let content: string | null = null;
    try {
      content = fs.readFileSync(skillMdPath, 'utf-8');
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      failures.push(`cannot read ${skillMdPath}: ${reason}`);
    }

    // Compare against null, not truthiness: an empty file is readable content
    // that must FAIL the phrase/heading rules, not skip them.
    if (content !== null) {
      const lower = content.toLowerCase();
      for (const phrase of phrases) {
        if (!lower.includes(phrase.toLowerCase())) {
          failures.push(`missing required phrase: "${phrase}"`);
        }
      }

      // A heading must open a line (ignoring leading whitespace). A plain
      // substring test would accept "### Preamble" or an inline mention of
      // "## Preamble" as if it were the H2 itself. Prefix matching still
      // allows suffixed headings such as "## Preamble (run first)".
      const lines = content.split(/\r?\n/);
      for (const heading of headings) {
        if (!lines.some(line => line.trimStart().startsWith(heading))) {
          failures.push(`missing required heading: "${heading}"`);
        }
      }
    }
  }

  return {
    skill: invariant.skill,
    passed: failures.length === 0,
    failures,
  };
}
|
||||
|
||||
/** Aggregate outcome of running a set of invariants against a baseline. */
export interface ParityReport {
  /** Tag recorded in the baseline the run was compared against. */
  baselineTag: string;

  /** `capturedAt` value of the capture taken for the current state. */
  currentCapturedAt: string;

  /** Number of entries in `details` (one per invariant). */
  totalChecks: number;

  /** Count of entries in `details` that passed. */
  passed: number;

  /** Count of entries in `details` that did not pass. */
  failed: number;

  /** Per-skill results, in the order the invariants were supplied. */
  details: ParityCheckResult[];
}
|
||||
|
||||
/**
 * Captures the current state of the repo and checks every invariant against
 * it, producing one result per invariant in input order.
 *
 * @param opts.repoRoot - Repository root handed to `captureBaseline` and
 *   `checkSkillParity`.
 * @param opts.baseline - Previously captured baseline to compare against.
 * @param opts.invariants - Rules to evaluate.
 * @returns Report with pass/fail counts and per-skill details. A skill that is
 *   absent from the current capture is reported as a failure; its rules are
 *   not evaluated.
 */
export function runParityChecks(opts: {
  repoRoot: string;
  baseline: ParityBaseline;
  invariants: ParityInvariant[];
}): ParityReport {
  const { repoRoot, baseline, invariants } = opts;
  const current = captureBaseline({ repoRoot });

  const details = invariants.map((invariant): ParityCheckResult => {
    const baselineEntry = baseline.skills[invariant.skill];
    const currentEntry = current.skills[invariant.skill];

    if (!currentEntry) {
      // Only claim "present in baseline" when that is actually true; a skill
      // missing from both sides usually means a stale or mistyped registry entry.
      const reason = baselineEntry
        ? `skill removed: ${invariant.skill} present in baseline but not current state`
        : `skill missing: ${invariant.skill} not found in current state or in baseline`;
      return { skill: invariant.skill, passed: false, failures: [reason] };
    }

    return checkSkillParity(invariant, currentEntry, baselineEntry, repoRoot);
  });

  const passed = details.filter(d => d.passed).length;
  return {
    baselineTag: baseline.tag,
    currentCapturedAt: current.capturedAt,
    totalChecks: details.length,
    passed,
    failed: details.length - passed,
    details,
  };
}
|
||||
|
||||
/**
 * Builds one registry entry. Every pinned skill shares the same required
 * headings and growth ceiling; only the name, size floor and phrases vary.
 * A fresh headings array is created per entry so entries never share state.
 */
function pinned(entry: { skill: string; minBytes: number; mustContain: string[] }): ParityInvariant {
  return {
    skill: entry.skill,
    mustContain: entry.mustContain,
    mustHaveHeadings: ['## Preamble', '## When to invoke'],
    maxSizeRatio: 1.05,
    minBytes: entry.minBytes,
  };
}

/**
 * Standard invariant registry — the v1.45.0.0 set.
 *
 * Each entry pins what must not break in a skill family. Extend as future
 * skills land. Phase B (v2.0.0.0) adds LLM-judge invariants on top of these.
 *
 * NOTE(review): phrases are matched as case-insensitive substrings, so short
 * entries such as 'PR' or 'eng' are also satisfied by words like "preamble"
 * or "engineer" — consider more specific phrases if these are meant to guard
 * real content.
 */
export const PARITY_INVARIANTS: ParityInvariant[] = [
  pinned({
    skill: 'cso',
    minBytes: 30_000,
    mustContain: ['OWASP', 'STRIDE', 'daily', 'comprehensive', 'verif'],
  }),
  pinned({
    skill: 'ship',
    minBytes: 80_000,
    mustContain: ['VERSION', 'CHANGELOG', 'review', 'merge', 'PR'],
  }),
  pinned({
    skill: 'plan-ceo-review',
    minBytes: 80_000,
    mustContain: ['SCOPE EXPANSION', 'SELECTIVE EXPANSION', 'HOLD SCOPE', 'SCOPE REDUCTION'],
  }),
  pinned({
    skill: 'plan-eng-review',
    minBytes: 70_000,
    mustContain: ['Architecture', 'Code Quality', 'Test', 'Performance'],
  }),
  pinned({
    skill: 'plan-design-review',
    minBytes: 70_000,
    mustContain: ['design', 'visual'],
  }),
  pinned({
    skill: 'review',
    minBytes: 70_000,
    mustContain: ['confidence', 'P1', 'P2'],
  }),
  pinned({
    skill: 'qa',
    minBytes: 50_000,
    mustContain: ['bug', 'browse', 'fix'],
  }),
  pinned({
    skill: 'investigate',
    minBytes: 30_000,
    mustContain: ['root cause', 'hypothes'],
  }),
  pinned({
    skill: 'office-hours',
    minBytes: 70_000,
    mustContain: ['design doc', 'problem statement'],
  }),
  pinned({
    skill: 'autoplan',
    minBytes: 70_000,
    mustContain: ['ceo', 'eng', 'design'],
  }),
];
|
||||
|
|
@ -0,0 +1,49 @@
|
|||
/**
|
||||
* Cathedral parity suite — gate-tier (free, structural + content checks).
|
||||
*
|
||||
* Runs every PARITY_INVARIANTS check against the current SKILL.md output
|
||||
* vs the v1.44.1 baseline. Failures get an actionable, per-skill report
|
||||
* showing missing phrases, missing headings, and size ratios.
|
||||
*
|
||||
* Periodic-tier LLM-judge parity (paid) lands in Phase B (v2.0.0.0)
|
||||
* alongside the sections/ extraction. Plumbing is in parity-harness.ts.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import { runParityChecks, PARITY_INVARIANTS } from './helpers/parity-harness';
|
||||
import type { ParityBaseline } from './helpers/capture-parity-baseline';
|
||||
|
||||
const REPO_ROOT = path.resolve(import.meta.dir, '..');

/** Tag of the golden baseline; the single source for the fixture name and suite title. */
const BASELINE_TAG = 'v1.44.1';
const BASELINE_PATH = path.join(
  REPO_ROOT,
  'test',
  'fixtures',
  `parity-baseline-${BASELINE_TAG}.json`,
);

describe(`parity suite vs ${BASELINE_TAG} baseline (gate, free)`, () => {
  test('baseline exists', () => {
    expect(fs.existsSync(BASELINE_PATH)).toBe(true);
  });

  test('all PARITY_INVARIANTS pass', () => {
    const baseline: ParityBaseline = JSON.parse(fs.readFileSync(BASELINE_PATH, 'utf-8'));
    const report = runParityChecks({
      repoRoot: REPO_ROOT,
      baseline,
      invariants: PARITY_INVARIANTS,
    });

    // eslint-disable-next-line no-console
    console.log(
      `[parity] ${report.passed}/${report.totalChecks} skills passed parity vs ${baseline.tag}`,
    );

    if (report.failed === 0) return;

    // One indented section per failing skill, one bullet per violated rule.
    const failureMessages = report.details
      .filter(d => !d.passed)
      .map(d => `  ${d.skill}:\n    - ${d.failures.join('\n    - ')}`)
      .join('\n');

    // Report the tag of the baseline that was actually loaded, matching the
    // log line above, rather than a hard-coded version string.
    throw new Error(
      `${report.failed} skill(s) failed parity checks vs ${baseline.tag}:\n${failureMessages}`,
    );
  });
});
|
||||
Loading…
Reference in New Issue