gstack/test/skill-e2e-auq-verbose-vs-ca...

115 lines
4.5 KiB
TypeScript

/**
* AUQ no-degradation A/B: verbose (full-token) vs carved (slimmed) — periodic,
* paid, SDK capture.
*
* The keystone empirical proof behind the token-reduction work: carving
* /plan-ceo-review into an 80KB skeleton + on-demand sections did NOT degrade the
* AskUserQuestion it shows the user. Layer 0 (auq-format-always-loaded.test.ts)
* proves the format SPEC is present in both skeletons deterministically; this
* proves the model still GENERATES an equal-quality question with the smaller
* context.
*
* Method — identical prompt, two SKILL.md versions, compare:
* - CARVED : this branch's plan-ceo-review/SKILL.md (80KB skeleton) + sections.
* - VERBOSE : the pre-carve monolith (137KB) read from git (ab66193e^).
* Both are driven to Step 0F mode selection via the SDK $OUT_FILE capture path
* (clean text, no TTY mangling). We score the 7 decision-brief format elements
* and grade recommendation substance, then assert the carved version is NOT
* WORSE than verbose. Relative parity is the bar (absolute compliance is the
* format-compliance gate test's job).
*
* Expectation: carved >= verbose. At the mode-selection AUQ the carved skeleton
* carries the same {{PREAMBLE}} format spec + Step 0 prose as verbose, with
* strictly less unrelated review-section text in context.
*/
import { describe, test } from 'bun:test';
import * as fs from 'node:fs';
import {
setupPlanCeoDir,
captureModeSelectionAuq,
scoreAuqFormat,
carvedSkill,
verboseSkill,
} from './helpers/auq-sdk-capture';
import { judgeRecommendation } from './helpers/llm-judge';
// Opt-in gate: the suite is registered for real only when EVALS is set (non-empty)
// and EVALS_TIER is exactly 'periodic'; otherwise it is registered via describe.skip.
const shouldRun = process.env.EVALS_TIER === 'periodic' && Boolean(process.env.EVALS);
const describeE2E = shouldRun ? describe : describe.skip;
// Run identifier handed to the capture helper; falls back to 'local' when
// EVALS_RUN_ID is unset.
const runId = 'auq-ab-' + (process.env.EVALS_RUN_ID ?? 'local');
/**
 * Captures the mode-selection AUQ for one skill variant and scores it.
 *
 * The captured text is scored with `scoreAuqFormat`; when the capture is
 * non-blank it is additionally passed to `judgeRecommendation`. The judge is
 * best-effort: if it throws, the failure is logged (not rethrown) and
 * `substance` stays at 0.
 *
 * @param label Variant tag used in the capture's test name and in log lines.
 * @param dir Plan directory handed to the capture helper as `planDir`.
 * @returns The captured text, its format score, and the judge's substance
 *   score (0 when the capture was blank or the judge call failed).
 */
async function grade(label: string, dir: string) {
  const text = await captureModeSelectionAuq({ planDir: dir, testName: `auq-ab-${label}`, runId });
  const fmt = scoreAuqFormat(text);
  let substance = 0;
  let present = false;
  // A blank capture has nothing to judge; leave the defaults in place.
  if (text.trim()) {
    try {
      const r = await judgeRecommendation(text);
      substance = r.reason_substance;
      present = r.present;
    } catch (err) {
      // Keep the run alive, but make the failure visible: a silent 0 here is
      // indistinguishable from a genuinely judged 0 in the A/B comparison.
      const reason = err instanceof Error ? err.message : String(err);
      // eslint-disable-next-line no-console
      console.warn(`[AUQ-AB ${label}] judge unavailable, substance defaults to 0: ${reason}`);
    }
  }
  // eslint-disable-next-line no-console
  console.log(
    `[AUQ-AB ${label}] captured=${text.length}B format=${fmt.present}/${fmt.total} ` +
      `missing=[${fmt.missing.join(',')}] recPresent=${present} substance=${substance}`,
  );
  return { text, fmt, substance };
}
/**
 * A/B suite: grades the carved and the verbose skill on the same capture path
 * and fails when the carved side scores fewer format elements, or more than
 * one substance point lower, than the verbose side.
 */
describeE2E('AUQ no-degradation: verbose vs carved (periodic)', () => {
  test(
    'carved plan-ceo-review AUQ is not worse than verbose on the same prompt',
    async () => {
      // Each temp dir is registered as soon as it exists, so a failure while
      // building the second fixture (e.g. verboseSkill() throwing) still
      // cleans up the first one.
      const tmpDirs: string[] = [];
      try {
        const carved = carvedSkill();
        const carvedDir = setupPlanCeoDir({
          skillMd: carved.skillMd,
          sectionsFrom: carved.sectionsFrom,
          tmpPrefix: 'auq-ab-carved-',
        });
        tmpDirs.push(carvedDir);

        const verboseDir = setupPlanCeoDir({
          skillMd: verboseSkill(),
          tmpPrefix: 'auq-ab-verbose-',
        });
        tmpDirs.push(verboseDir);

        const c = await grade('CARVED', carvedDir);
        const v = await grade('VERBOSE', verboseDir);

        const summary = [
          `CARVED : format ${c.fmt.present}/${c.fmt.total}, substance ${c.substance}`,
          `VERBOSE: format ${v.fmt.present}/${v.fmt.total}, substance ${v.substance}`,
        ].join('\n');

        // Both must have actually produced a question, else the comparison is
        // vacuous — fail loud with the captures.
        if (!c.text.trim() || !v.text.trim()) {
          throw new Error(
            `A/B inconclusive — a side produced no AUQ capture:\n${summary}\n` +
              `--- carved ---\n${c.text.slice(0, 2000)}\n--- verbose ---\n${v.text.slice(0, 2000)}`,
          );
        }

        // Relative parity: carved may not score fewer format elements than
        // verbose, nor fall more than one point behind on judged substance.
        const formatRegressed = c.fmt.present < v.fmt.present;
        const substanceRegressed = c.substance < v.substance - 1; // 1-pt judge tolerance
        if (formatRegressed || substanceRegressed) {
          throw new Error(
            `AUQ DEGRADATION carving plan-ceo-review:\n${summary}` +
              (formatRegressed ? `\n -> carved dropped: [${c.fmt.missing.join(',')}]` : '') +
              (substanceRegressed ? `\n -> carved substance regressed >1 pt` : '') +
              `\n--- carved AUQ ---\n${c.text}\n--- verbose AUQ ---\n${v.text}`,
          );
        }
        // eslint-disable-next-line no-console
        console.log('[AUQ-AB] NO DEGRADATION:\n' + summary);
      } finally {
        // Clean each dir independently: an exception escaping `finally` would
        // replace the test's own error and skip the remaining directories.
        for (const dir of tmpDirs) {
          try {
            fs.rmSync(dir, { recursive: true, force: true });
          } catch (err) {
            const reason = err instanceof Error ? err.message : String(err);
            // eslint-disable-next-line no-console
            console.warn(`[AUQ-AB] failed to remove temp dir ${dir}: ${reason}`);
          }
        }
      }
    },
    600_000,
  );
});