/**
 * AUQ no-degradation A/B: verbose (full-token) vs carved (slimmed) — periodic,
 * paid, SDK capture.
 *
 * The keystone empirical proof behind the token-reduction work: carving
 * /plan-ceo-review into an 80KB skeleton + on-demand sections did NOT degrade the
 * AskUserQuestion it shows the user. Layer 0 (auq-format-always-loaded.test.ts)
 * proves the format SPEC is present in both skeletons deterministically; this
 * proves the model still GENERATES an equal-quality question with the smaller
 * context.
 *
 * Method — identical prompt, two SKILL.md versions, compare:
 *   - CARVED  : this branch's plan-ceo-review/SKILL.md (80KB skeleton) + sections.
 *   - VERBOSE : the pre-carve monolith (137KB) read from git (ab66193e^).
 * Both are driven to Step 0F mode selection via the SDK $OUT_FILE capture path
 * (clean text, no TTY mangling). We score the 7 decision-brief format elements
 * and grade recommendation substance, then assert the carved version is NOT
 * WORSE than verbose. Relative parity is the bar (absolute compliance is the
 * format-compliance gate test's job).
 *
 * Expectation: carved >= verbose. At the mode-selection AUQ the carved skeleton
 * carries the same {{PREAMBLE}} format spec + Step 0 prose as verbose, with
 * strictly less unrelated review-section text in context.
 */
import { describe, test } from 'bun:test';
import * as fs from 'node:fs';
import {
  setupPlanCeoDir,
  captureModeSelectionAuq,
  scoreAuqFormat,
  carvedSkill,
  verboseSkill,
} from './helpers/auq-sdk-capture';
import { judgeRecommendation } from './helpers/llm-judge';

// Only run when evals are enabled AND the periodic tier is selected.
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic';
const describeE2E = shouldRun ? describe : describe.skip;
const runId = `auq-ab-${process.env.EVALS_RUN_ID ?? 'local'}`;

/** Carved may score this many judge points below verbose before it counts as a regression. */
const SUBSTANCE_TOLERANCE = 1;

/** Result of grading one side of the A/B. */
type GradedSide = Awaited<ReturnType<typeof grade>>;

/**
 * Captures the mode-selection AUQ for the skill installed in `dir` and grades it.
 *
 * @param label Side label used in the capture test name and in log lines.
 * @param dir   Plan directory passed to the capture helper as `planDir`.
 * @returns The captured text, its format score, the judge's substance score, and
 *          `judged` — true only when the judge actually returned a score.
 *          When `judged` is false, `substance` is a placeholder 0, not a grade.
 */
async function grade(label: string, dir: string) {
  const text = await captureModeSelectionAuq({ planDir: dir, testName: `auq-ab-${label}`, runId });
  const fmt = scoreAuqFormat(text);

  let substance = 0;
  let present = false;
  let judged = false;
  if (text.trim()) {
    try {
      const r = await judgeRecommendation(text);
      substance = r.reason_substance;
      present = r.present;
      judged = true;
    } catch (err: unknown) {
      // Best-effort: a judge outage must not crash the run, but it must be
      // visible and must not masquerade as a real score of 0.
      const reason = err instanceof Error ? err.message : String(err);
      // eslint-disable-next-line no-console
      console.warn(`[AUQ-AB ${label}] judge unavailable, substance not graded: ${reason}`);
    }
  }

  // eslint-disable-next-line no-console
  console.log(
    `[AUQ-AB ${label}] captured=${text.length}B format=${fmt.present}/${fmt.total} ` +
      `missing=[${fmt.missing.join(',')}] recPresent=${present} substance=${substance} ` +
      `judged=${judged}`,
  );
  return { text, fmt, substance, judged };
}

/**
 * Sets up both plan directories, grades CARVED then VERBOSE, and always removes
 * whatever temp directories were created — including when setup of the second
 * directory throws.
 */
async function gradeBothVariants(): Promise<{ carved: GradedSide; verbose: GradedSide }> {
  const tmpDirs: string[] = [];
  try {
    const carvedSource = carvedSkill();
    const carvedDir = setupPlanCeoDir({
      skillMd: carvedSource.skillMd,
      sectionsFrom: carvedSource.sectionsFrom,
      tmpPrefix: 'auq-ab-carved-',
    });
    tmpDirs.push(carvedDir);

    const verboseDir = setupPlanCeoDir({
      skillMd: verboseSkill(),
      tmpPrefix: 'auq-ab-verbose-',
    });
    tmpDirs.push(verboseDir);

    const carved = await grade('CARVED', carvedDir);
    const verbose = await grade('VERBOSE', verboseDir);
    return { carved, verbose };
  } finally {
    for (const dir of tmpDirs) {
      fs.rmSync(dir, { recursive: true, force: true });
    }
  }
}

/** One summary line for a graded side; substance shows as n/a when the judge did not run. */
function summarizeSide(tag: string, side: GradedSide): string {
  const substance = side.judged ? String(side.substance) : 'n/a (judge unavailable)';
  return `${tag}: format ${side.fmt.present}/${side.fmt.total}, substance ${substance}`;
}

describeE2E('AUQ no-degradation: verbose vs carved (periodic)', () => {
  test(
    'carved plan-ceo-review AUQ is not worse than verbose on the same prompt',
    async () => {
      const { carved: c, verbose: v } = await gradeBothVariants();

      const summary = [summarizeSide('CARVED ', c), summarizeSide('VERBOSE', v)].join('\n');

      // Both must have actually produced a question, else the comparison is
      // vacuous — fail loud with the captures.
      if (!c.text.trim() || !v.text.trim()) {
        throw new Error(
          `A/B inconclusive — a side produced no AUQ capture:\n${summary}\n` +
            `--- carved ---\n${c.text.slice(0, 2000)}\n--- verbose ---\n${v.text.slice(0, 2000)}`,
        );
      }

      const formatRegressed = c.fmt.present < v.fmt.present;

      // Substance is only comparable when the judge scored BOTH sides; otherwise
      // a placeholder 0 would produce a false regression or a vacuous pass.
      const substanceComparable = c.judged && v.judged;
      if (!substanceComparable) {
        // eslint-disable-next-line no-console
        console.warn(
          '[AUQ-AB] substance comparison skipped — judge did not score both sides; ' +
            'only format parity was checked.',
        );
      }
      const substanceRegressed =
        substanceComparable && c.substance < v.substance - SUBSTANCE_TOLERANCE;

      if (formatRegressed || substanceRegressed) {
        // Elements carved is missing that verbose still had — the actual drop.
        const dropped = c.fmt.missing.filter((el) => !v.fmt.missing.includes(el));
        throw new Error(
          `AUQ DEGRADATION carving plan-ceo-review:\n${summary}` +
            (formatRegressed ? `\n -> carved dropped: [${dropped.join(',')}]` : '') +
            (substanceRegressed ? `\n -> carved substance regressed >1 pt` : '') +
            `\n--- carved AUQ ---\n${c.text}\n--- verbose AUQ ---\n${v.text}`,
        );
      }

      // eslint-disable-next-line no-console
      console.log('[AUQ-AB] NO DEGRADATION:\n' + summary);
    },
    600_000,
  );
});