From 28199374d17798c612fd37f3d7ca621fb9b38dc0 Mon Sep 17 00:00:00 2001
From: Garry Tan
Date: Mon, 1 Jun 2026 22:17:37 -0700
Subject: [PATCH] =?UTF-8?q?test(auq):=20consistency=20=E2=80=94=20same=20t?=
 =?UTF-8?q?rigger=20N=20runs,=20stable=20format=20+=20substance?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drives the carved /plan-ceo-review AUQ N=3 times and fails if any format
element appears in one run but not another, or substance craters. Targets
the "fine one run, broken the next" failure class a single snapshot can't
see.

Result: 3/3 stable, 7/7 + substance 5 every run. Periodic tier.

Co-Authored-By: Claude Opus 4.8 (1M context)
---
 test/skill-e2e-auq-consistency.test.ts | 174 +++++++++++++++++++++++++
 1 file changed, 174 insertions(+)
 create mode 100644 test/skill-e2e-auq-consistency.test.ts

diff --git a/test/skill-e2e-auq-consistency.test.ts b/test/skill-e2e-auq-consistency.test.ts
new file mode 100644
--- /dev/null
+++ b/test/skill-e2e-auq-consistency.test.ts
@@ -0,0 +1,174 @@
+/**
+ * AUQ consistency — same prompt, N runs, stable format + substance (periodic).
+ *
+ * The user's core anxiety: AUQ is fine one run and broken the next — sometimes
+ * no ELI10, sometimes no recommendation, sometimes minimal context. A single
+ * snapshot can't see drift. This drives the carved /plan-ceo-review mode-selection
+ * AUQ N times via the SDK capture path (clean text, no TTY mangling) and asserts
+ * the decision-brief format holds EVERY time and substance never craters.
+ *
+ * Pass bar:
+ *   - Format: no element present in one run may be missing in another (that IS
+ *     the inconsistency the user feels).
+ *   - Substance: every run >= 3, spread (max-min) <= 2.
+ *   - Scoring: every non-empty run must get a numeric judge score; a judge
+ *     failure is reported as such instead of posing as a substance of 0.
+ *
+ * Reports per-run scores so drift is visible even on a pass. Periodic tier
+ * (N SDK runs, ~$0.50-1 each).
+ */
+import { describe, test } from 'bun:test';
+import * as fs from 'node:fs';
+import {
+  setupPlanCeoDir,
+  captureModeSelectionAuq,
+  AUQ_FORMAT_ELEMENTS,
+  carvedSkill,
+} from './helpers/auq-sdk-capture';
+import { judgeRecommendation } from './helpers/llm-judge';
+
+const DEFAULT_RUNS = 3;
+const MIN_SUBSTANCE = 3;
+const MAX_SUBSTANCE_SPREAD = 2;
+
+/** Outcome of one capture + judge pass. */
+interface RunResult {
+  /** Zero-based run index. */
+  i: number;
+  /** Format element fields whose regex matched the captured text. */
+  present: Set<string>;
+  /** Judge's reason_substance score, or null when the run was not scored. */
+  substance: number | null;
+  /** True when the capture was blank after trimming. */
+  empty: boolean;
+  /** Why scoring failed; undefined when the run was scored or was empty. */
+  judgeError: string | undefined;
+}
+
+/**
+ * Reads the run count from AUQ_CONSISTENCY_RUNS.
+ *
+ * Unset or blank means DEFAULT_RUNS. Anything that is not a positive integer
+ * (e.g. "abc", "0", "2.5") also falls back to DEFAULT_RUNS with a warning: zero
+ * or NaN iterations would otherwise skip every check and pass vacuously.
+ */
+function parseRunCount(raw: string | undefined): number {
+  if (raw === undefined || raw.trim() === '') return DEFAULT_RUNS;
+  const parsed = Number(raw);
+  if (Number.isInteger(parsed) && parsed >= 1) return parsed;
+  // eslint-disable-next-line no-console
+  console.warn(
+    `[AUQ-consistency] invalid AUQ_CONSISTENCY_RUNS=${JSON.stringify(raw)}; using ${DEFAULT_RUNS}`,
+  );
+  return DEFAULT_RUNS;
+}
+
+const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic';
+const describeE2E = shouldRun ? describe : describe.skip;
+const N_RUNS = parseRunCount(process.env.AUQ_CONSISTENCY_RUNS);
+const runId = `auq-consistency-${process.env.EVALS_RUN_ID ?? 'local'}`;
+
+describeE2E('AUQ consistency across runs (periodic)', () => {
+  test(
+    `carved /plan-ceo-review AUQ format + substance stable across ${N_RUNS} runs`,
+    async () => {
+      const total = AUQ_FORMAT_ELEMENTS.length;
+      const runs: RunResult[] = [];
+
+      for (let i = 0; i < N_RUNS; i++) {
+        const carved = carvedSkill();
+        const dir = setupPlanCeoDir({
+          skillMd: carved.skillMd,
+          sectionsFrom: carved.sectionsFrom,
+          tmpPrefix: `auq-consistency-${i}-`,
+        });
+        let text = '';
+        try {
+          text = await captureModeSelectionAuq({ planDir: dir, testName: `auq-consistency-${i}`, runId });
+        } finally {
+          // Always remove the temp plan dir, even when the capture throws.
+          fs.rmSync(dir, { recursive: true, force: true });
+        }
+        const empty = !text.trim();
+
+        const present = new Set<string>();
+        for (const e of AUQ_FORMAT_ELEMENTS) {
+          // A /g or /y regex carries lastIndex from one test() call to the next;
+          // reset it so a hit in one run cannot become a false miss in a later one.
+          e.re.lastIndex = 0;
+          if (e.re.test(text)) present.add(e.field);
+        }
+
+        let substance: number | null = null;
+        let judgeError: string | undefined;
+        if (!empty) {
+          try {
+            const score: unknown = (await judgeRecommendation(text)).reason_substance;
+            if (typeof score === 'number' && Number.isFinite(score)) substance = score;
+            else judgeError = `non-numeric reason_substance (${String(score)})`;
+          } catch (err: unknown) {
+            judgeError = err instanceof Error ? err.message : String(err);
+          }
+        }
+        runs.push({ i, present, substance, empty, judgeError });
+
+        const missing = AUQ_FORMAT_ELEMENTS.filter(e => !present.has(e.field)).map(e => e.field);
+        let note = '';
+        if (empty) note = ' (EMPTY CAPTURE)';
+        else if (judgeError !== undefined) note = ` (JUDGE UNAVAILABLE: ${judgeError})`;
+        // eslint-disable-next-line no-console
+        console.log(
+          `[AUQ-consistency run ${i + 1}/${N_RUNS}] present=${present.size}/${total} ` +
+            `missing=[${missing.join(',')}] substance=${substance ?? 'n/a'}${note}`,
+        );
+      }
+
+      const problems: string[] = [];
+
+      const emptyRuns = runs.filter(r => r.empty).map(r => r.i + 1);
+      if (emptyRuns.length > 0) problems.push(`run(s) produced no AUQ at all: ${emptyRuns.join(',')}`);
+
+      for (const r of runs) {
+        if (r.judgeError !== undefined) problems.push(`run ${r.i + 1} could not be scored: ${r.judgeError}`);
+      }
+
+      // Inconsistency = an element present in SOME run but missing in another.
+      const everPresent = new Set<string>();
+      for (const r of runs) for (const f of r.present) everPresent.add(f);
+      for (const f of everPresent) {
+        const runsMissing = runs.filter(r => !r.present.has(f)).map(r => r.i + 1);
+        if (runsMissing.length > 0) problems.push(`format element "${f}" missing in run(s) ${runsMissing.join(',')}`);
+      }
+
+      // Only scored runs feed the substance stats; unscored runs already failed above.
+      const subs = runs.flatMap(r => (r.substance === null ? [] : [r.substance]));
+      const minSub = subs.length > 0 ? Math.min(...subs) : NaN;
+      const maxSub = subs.length > 0 ? Math.max(...subs) : NaN;
+      if (minSub < MIN_SUBSTANCE) problems.push(`a run cratered: min substance ${minSub} < ${MIN_SUBSTANCE}`);
+      if (maxSub - minSub > MAX_SUBSTANCE_SPREAD) {
+        problems.push(`substance unstable: spread ${maxSub - minSub} > ${MAX_SUBSTANCE_SPREAD} (${subs.join(',')})`);
+      }
+
+      if (problems.length > 0) {
+        throw new Error(
+          `AUQ inconsistency across ${N_RUNS} runs:\n` +
+            problems.map(p => `  - ${p}`).join('\n') +
+            `\nper-run: ` +
+            runs.map(r => `[${r.i + 1}] fmt=${r.present.size}/${total} sub=${r.substance ?? 'n/a'}`).join(' '),
+        );
+      }
+
+      // Report what was actually observed: elements absent from every run do not
+      // trip the consistency bar, so the count here can be lower than the total.
+      const neverPresent = AUQ_FORMAT_ELEMENTS.filter(e => !everPresent.has(e.field)).map(e => e.field);
+      // eslint-disable-next-line no-console
+      console.log(
+        `[AUQ-consistency] STABLE across ${N_RUNS} runs: ${everPresent.size}/${total} ` +
+          `format elements every run` +
+          (neverPresent.length > 0 ? ` (never present: ${neverPresent.join(',')})` : '') +
+          `; substance ${minSub}-${maxSub}`,
+      );
+    },
+    N_RUNS * 300_000 + 60_000,
+  );
+});