// gstack/test/codex-e2e-recommendation-su...
//
// 104 lines
// 4.1 KiB
// TypeScript

/**
* /codex recommendation substance — LIVE grade (periodic, paid, Codex CLI).
*
* The gap this closes: skill-cross-model-recommendation-emit.test.ts only checks
* the /codex TEMPLATE contains the "Recommendation: <action> because <reason>"
* instruction (static grep). llm-judge-recommendation.test.ts grades the rubric
* against FIXTURES. Nothing runs /codex live and grades the recommendation it
* actually emits. The user reports codex recommendations were the least
* consistent surface on main — so this is the one that needs live coverage.
*
* Method: drive the real /codex skill via codex exec (isolated temp HOME) over a
* small, deliberately-flawed fixture diff. Capture codex's output, extract its
* synthesis "Recommendation: ... because ..." line, and grade it with the same
* judgeRecommendation() rubric used everywhere else:
* - present : a Recommendation line exists
* - commits : names exactly one action (no hedging)
* - has_because : a because-clause follows
* - substance>=4: the reason is option-specific / names a concrete tradeoff,
* not boilerplate ("because it's better")
*
* Periodic tier (Codex non-determinism, ~$2-3/run).
*/
import { describe, test, expect } from 'bun:test';
import * as path from 'node:path';
import { runCodexSkill } from './helpers/codex-session-runner';
import { judgeRecommendation } from './helpers/llm-judge';
// Parent of this test file's directory (import.meta.dir is Bun's directory path
// for the current module). Used below to locate the `codex` skill directory.
const ROOT = path.resolve(import.meta.dir, '..');
/**
 * True when a `codex` executable can be resolved on PATH.
 *
 * Uses Bun's built-in `Bun.which` lookup instead of spawning the external
 * `which` binary: `which` is not present on every platform (e.g. Windows), and
 * a subprocess at module-load time is unnecessary for a PATH lookup.
 *
 * Any failure of the lookup itself is treated as "not available", so the suite
 * is skipped rather than crashing while the file is being loaded.
 */
const CODEX_AVAILABLE = (() => {
  try {
    // Bun.which returns the resolved path, or null when nothing matches.
    return Bun.which('codex') !== null;
  } catch {
    return false;
  }
})();
// Opt-in gate for this paid, live suite. All three conditions must hold:
//   1. the codex CLI was found,
//   2. EVALS is set to a non-empty value,
//   3. EVALS_TIER is exactly 'periodic'.
const shouldRun = [
  CODEX_AVAILABLE,
  Boolean(process.env.EVALS),
  process.env.EVALS_TIER === 'periodic',
].every(Boolean);

// Registers the suite normally when the gate is open, as skipped otherwise.
const describeCodex = shouldRun ? describe : describe.skip;
// Prompt passed to the /codex skill: a short instruction followed by a small
// diff fixture. The fixture carries two real, comparable problems — the login
// query is rebuilt by concatenating req.body.name into the SQL string, and the
// session cookie is set with no HttpOnly, Secure, or expiry — so a good
// recommendation must CHOOSE (and justify the choice against the alternative) —
// the exact shape judgeRecommendation scores >= 4.
//
// NOTE: the template literal is used verbatim as the `prompt` of the live run;
// editing any character inside it changes what the test exercises.
const FIXTURE_DIFF = `
Review this change. It has more than one issue; finish with a single synthesis
recommendation line in your skill's required format: "Recommendation: <action>
because <one-line reason that names the most important finding and why it beats
the alternative>."
--- a/server/auth.ts
+++ b/server/auth.ts
@@
export function login(req, res) {
- const user = db.query("SELECT * FROM users WHERE name = ?", [req.body.name]);
+ const user = db.query("SELECT * FROM users WHERE name = '" + req.body.name + "'");
if (user && user.password === req.body.password) {
res.cookie('session', user.id); // no HttpOnly, no Secure, no expiry
return res.json({ ok: true });
}
return res.status(401).json({ ok: false });
}
`;
describeCodex('/codex recommendation substance (live, periodic)', () => {
  // Timeout for the whole test case, in milliseconds. It is larger than the
  // 300s budget given to the Codex run so the judge call has room to finish.
  const TEST_TIMEOUT_MS = 360_000;

  /**
   * Runs the /codex skill over FIXTURE_DIFF, grades the captured output with
   * judgeRecommendation(), logs the verdict, and fails unless the
   * recommendation is present, has a because-clause, commits to an action,
   * and reaches a substance score of at least 4.
   */
  const gradeLiveRecommendation = async () => {
    const run = await runCodexSkill({
      skillDir: path.join(ROOT, 'codex'),
      skillName: 'codex',
      prompt: FIXTURE_DIFF,
      timeoutMs: 300_000,
    });
    const codexOutput = run.output;

    // A "SKIP:" prefix means the codex binary was missing. The describe-level
    // gate normally rules that out; this is a second safety net.
    if (codexOutput.startsWith('SKIP:')) {
      return;
    }

    const verdict = await judgeRecommendation(codexOutput);

    // Always print the verdict so periodic runs leave a trace even when green.
    const summary = [
      `[codex-rec] present=${verdict.present} commits=${verdict.commits} ` +
        `has_because=${verdict.has_because} substance=${verdict.reason_substance}`,
      ` reason: ${verdict.reason_text}`,
    ].join('\n');
    // eslint-disable-next-line no-console
    console.log(summary);

    // Structural checks, in the same order as before: the first failing
    // expectation is the one reported.
    expect(verdict.present).toBe(true);
    expect(verdict.has_because).toBe(true);
    expect(verdict.commits).toBe(true);

    // The named weak spot: substance must clear the boilerplate bar. The
    // failure message includes the judge's reasoning and the tail of the
    // Codex output for diagnosis.
    const belowBar = verdict.reason_substance < 4;
    if (belowBar) {
      const failure = [
        `codex recommendation substance ${verdict.reason_substance} < 4 (boilerplate/weak):`,
        ` reason: ${verdict.reason_text}`,
        ` judge: ${verdict.reasoning}`,
        '--- codex output (last 2KB) ---',
        `${codexOutput.slice(-2000)}`,
      ].join('\n');
      throw new Error(failure);
    }
  };

  test(
    'codex emits a committed, substance>=4 synthesis recommendation',
    gradeLiveRecommendation,
    TEST_TIMEOUT_MS,
  );
});