test(parity): T0b — cathedral parity-suite harness + invariant registry

Adds the harness that the v2_PLAN.md cathedral parity-eval suite is built on. Compares CURRENT SKILL.md output to v1.44.1 baseline along three axes: STRUCTURE frontmatter shape (catalog trim landed, "## When to invoke" present) CONTENT must-preserve phrases per skill family (cso: OWASP/STRIDE; plan-ceo: SCOPE EXPANSION/HOLD SCOPE/REDUCTION; ship: VERSION/CHANGELOG/PR; etc.) SIZE per-skill byte budget (maxSizeRatio + minBytes guards) PARITY_INVARIANTS registry pins 10 load-bearing skills (cso, ship, plan-*- review, review, qa, investigate, office-hours, autoplan). Each entry declares what must NOT regress; future compression that strips these phrases or shrinks a skill past its minBytes cliff fails CI. Periodic-tier LLM-judge parity (paid, ~$0.20/skill) lands in v2.0.0.0 sections/ phase. Same registry, same harness, judge added on top. Test plan: - bun test test/parity-suite.test.ts: 10/10 invariants pass vs v1.44.1 - Per-skill failures get actionable per-line breakdown so a reviewer can see which phrase / heading / size limit went sideways Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-25 20:37:36 -07:00 · 2026-05-25 20:37:36 -07:00 · ebebc95a34
parent 6d48d23ba7
commit ebebc95a34
2 changed files with 279 additions and 0 deletions
--- a/test/helpers/parity-harness.ts
+++ b/test/helpers/parity-harness.ts
@ -0,0 +1,230 @@
+/**
+ * Cathedral parity-eval harness (v1.45.0.0 T0b).
+ *
+ * Compares CURRENT SKILL.md output to a v1.44.1 golden baseline along three
+ * axes: STRUCTURE (frontmatter shape), CONTENT (must-preserve phrases per
+ * skill family), and SIZE (per-skill byte budget). The fourth axis —
+ * BEHAVIORAL parity via LLM-as-judge — runs on top of this harness in the
+ * periodic-tier eval suite (paid, ~$0.20 per skill judge call).
+ *
+ * The structural + content checks ship in v1.45.0.0 as the foundation; the
+ * LLM-judge layer lands in v2.0.0.0 alongside the sections/ pattern. Both
+ * use this module's APIs.
+ *
+ * Why a separate harness from skill-size-budget.test.ts: that one enforces
+ * size discipline only. This module supports content invariants per skill
+ * family (e.g., cso must preserve OWASP/STRIDE; plan-ceo must preserve
+ * mode-selection phrasing) so future compression can't silently strip
+ * load-bearing prose even when size stays within ratio.
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import type { ParityBaseline, SkillBaselineEntry } from './capture-parity-baseline';
+import { captureBaseline } from './capture-parity-baseline';
+
+export interface ParityInvariant {
+  skill: string;
+  /** Phrases that MUST appear in the generated SKILL.md (case-insensitive substring). */
+  mustContain?: string[];
+  /** Markdown H2 headings that MUST appear. */
+  mustHaveHeadings?: string[];
+  /** Maximum byte size growth ratio vs baseline. 1.0 = no growth allowed. */
+  maxSizeRatio?: number;
+  /** Minimum byte size (catches over-stripping cliffs). */
+  minBytes?: number;
+}
+
+export interface ParityCheckResult {
+  skill: string;
+  passed: boolean;
+  failures: string[];
+}
+
+export function checkSkillParity(
+  invariant: ParityInvariant,
+  current: SkillBaselineEntry,
+  baseline: SkillBaselineEntry | undefined,
+  repoRoot: string,
+): ParityCheckResult {
+  const failures: string[] = [];
+
+  // SIZE checks
+  if (invariant.maxSizeRatio !== undefined && baseline) {
+    const ratio = current.skillMdBytes / baseline.skillMdBytes;
+    if (ratio > invariant.maxSizeRatio) {
+      failures.push(`size ratio ${ratio.toFixed(3)} > maxSizeRatio ${invariant.maxSizeRatio}`);
+    }
+  }
+  if (invariant.minBytes !== undefined && current.skillMdBytes < invariant.minBytes) {
+    failures.push(`size ${current.skillMdBytes} < minBytes ${invariant.minBytes}`);
+  }
+
+  // CONTENT checks (read live file for fresh content)
+  if (invariant.mustContain?.length || invariant.mustHaveHeadings?.length) {
+    const skillMdPath = path.join(repoRoot, invariant.skill, 'SKILL.md');
+    let content: string | null = null;
+    try {
+      content = fs.readFileSync(skillMdPath, 'utf-8');
+    } catch (err) {
+      failures.push(`cannot read ${skillMdPath}: ${(err as Error).message}`);
+    }
+    if (content) {
+      const lower = content.toLowerCase();
+      for (const phrase of invariant.mustContain ?? []) {
+        if (!lower.includes(phrase.toLowerCase())) {
+          failures.push(`missing required phrase: "${phrase}"`);
+        }
+      }
+      for (const heading of invariant.mustHaveHeadings ?? []) {
+        if (!content.includes(heading)) {
+          failures.push(`missing required heading: "${heading}"`);
+        }
+      }
+    }
+  }
+
+  return {
+    skill: invariant.skill,
+    passed: failures.length === 0,
+    failures,
+  };
+}
+
+export interface ParityReport {
+  baselineTag: string;
+  currentCapturedAt: string;
+  totalChecks: number;
+  passed: number;
+  failed: number;
+  details: ParityCheckResult[];
+}
+
+export function runParityChecks(opts: {
+  repoRoot: string;
+  baseline: ParityBaseline;
+  invariants: ParityInvariant[];
+}): ParityReport {
+  const { repoRoot, baseline, invariants } = opts;
+  const current = captureBaseline({ repoRoot });
+  const details: ParityCheckResult[] = [];
+  for (const invariant of invariants) {
+    const baselineEntry = baseline.skills[invariant.skill];
+    const currentEntry = current.skills[invariant.skill];
+    if (!currentEntry) {
+      details.push({
+        skill: invariant.skill,
+        passed: false,
+        failures: [`skill removed: ${invariant.skill} present in baseline but not current state`],
+      });
+      continue;
+    }
+    details.push(checkSkillParity(invariant, currentEntry, baselineEntry, repoRoot));
+  }
+  return {
+    baselineTag: baseline.tag,
+    currentCapturedAt: current.capturedAt,
+    totalChecks: details.length,
+    passed: details.filter(d => d.passed).length,
+    failed: details.filter(d => !d.passed).length,
+    details,
+  };
+}
+
+/**
+ * Standard invariant registry — the v1.45.0.0 set.
+ *
+ * Each entry pins what must-not-break in a skill family. Extend as future
+ * skills land. Phase B (v2.0.0.0) adds LLM-judge invariants on top of these.
+ */
+export const PARITY_INVARIANTS: ParityInvariant[] = [
+  {
+    skill: 'cso',
+    mustContain: ['OWASP', 'STRIDE', 'daily', 'comprehensive', 'verif'],
+    mustHaveHeadings: ['## Preamble', '## When to invoke'],
+    maxSizeRatio: 1.05,
+    minBytes: 30_000,
+  },
+  {
+    skill: 'ship',
+    mustContain: [
+      'VERSION',
+      'CHANGELOG',
+      'review',
+      'merge',
+      'PR',
+    ],
+    mustHaveHeadings: ['## Preamble', '## When to invoke'],
+    maxSizeRatio: 1.05,
+    minBytes: 80_000,
+  },
+  {
+    skill: 'plan-ceo-review',
+    mustContain: [
+      'SCOPE EXPANSION',
+      'SELECTIVE EXPANSION',
+      'HOLD SCOPE',
+      'SCOPE REDUCTION',
+    ],
+    mustHaveHeadings: ['## Preamble', '## When to invoke'],
+    maxSizeRatio: 1.05,
+    minBytes: 80_000,
+  },
+  {
+    skill: 'plan-eng-review',
+    mustContain: [
+      'Architecture',
+      'Code Quality',
+      'Test',
+      'Performance',
+    ],
+    mustHaveHeadings: ['## Preamble', '## When to invoke'],
+    maxSizeRatio: 1.05,
+    minBytes: 70_000,
+  },
+  {
+    skill: 'plan-design-review',
+    mustContain: [
+      'design',
+      'visual',
+    ],
+    mustHaveHeadings: ['## Preamble', '## When to invoke'],
+    maxSizeRatio: 1.05,
+    minBytes: 70_000,
+  },
+  {
+    skill: 'review',
+    mustContain: ['confidence', 'P1', 'P2'],
+    mustHaveHeadings: ['## Preamble', '## When to invoke'],
+    maxSizeRatio: 1.05,
+    minBytes: 70_000,
+  },
+  {
+    skill: 'qa',
+    mustContain: ['bug', 'browse', 'fix'],
+    mustHaveHeadings: ['## Preamble', '## When to invoke'],
+    maxSizeRatio: 1.05,
+    minBytes: 50_000,
+  },
+  {
+    skill: 'investigate',
+    mustContain: ['root cause', 'hypothes'],
+    mustHaveHeadings: ['## Preamble', '## When to invoke'],
+    maxSizeRatio: 1.05,
+    minBytes: 30_000,
+  },
+  {
+    skill: 'office-hours',
+    mustContain: ['design doc', 'problem statement'],
+    mustHaveHeadings: ['## Preamble', '## When to invoke'],
+    maxSizeRatio: 1.05,
+    minBytes: 70_000,
+  },
+  {
+    skill: 'autoplan',
+    mustContain: ['ceo', 'eng', 'design'],
+    mustHaveHeadings: ['## Preamble', '## When to invoke'],
+    maxSizeRatio: 1.05,
+    minBytes: 70_000,
+  },
+];
--- a/test/parity-suite.test.ts
+++ b/test/parity-suite.test.ts
@ -0,0 +1,49 @@
+/**
+ * Cathedral parity suite — gate-tier (free, structural + content checks).
+ *
+ * Runs every PARITY_INVARIANTS check against the current SKILL.md output
+ * vs the v1.44.1 baseline. Failures get an actionable, per-skill report
+ * showing missing phrases, missing headings, and size ratios.
+ *
+ * Periodic-tier LLM-judge parity (paid) lands in Phase B (v2.0.0.0)
+ * alongside the sections/ extraction. Plumbing is in parity-harness.ts.
+ */
+
+import { describe, test, expect } from 'bun:test';
+import * as fs from 'fs';
+import * as path from 'path';
+import { runParityChecks, PARITY_INVARIANTS } from './helpers/parity-harness';
+import type { ParityBaseline } from './helpers/capture-parity-baseline';
+
+const REPO_ROOT = path.resolve(import.meta.dir, '..');
+const BASELINE_PATH = path.join(REPO_ROOT, 'test', 'fixtures', 'parity-baseline-v1.44.1.json');
+
+describe('parity suite vs v1.44.1 baseline (gate, free)', () => {
+  test('baseline exists', () => {
+    expect(fs.existsSync(BASELINE_PATH)).toBe(true);
+  });
+
+  test('all PARITY_INVARIANTS pass', () => {
+    const baseline: ParityBaseline = JSON.parse(fs.readFileSync(BASELINE_PATH, 'utf-8'));
+    const report = runParityChecks({
+      repoRoot: REPO_ROOT,
+      baseline,
+      invariants: PARITY_INVARIANTS,
+    });
+
+    // eslint-disable-next-line no-console
+    console.log(
+      `[parity] ${report.passed}/${report.totalChecks} skills passed parity vs ${baseline.tag}`,
+    );
+
+    if (report.failed === 0) return;
+
+    const failureMessages = report.details
+      .filter(d => !d.passed)
+      .map(d => `  ${d.skill}:\n    - ${d.failures.join('\n    - ')}`)
+      .join('\n');
+    throw new Error(
+      `${report.failed} skill(s) failed parity checks vs v1.44.1:\n${failureMessages}`,
+    );
+  });
+});