mirror of https://github.com/garrytan/gstack.git
test: codex skill validation (12 stub tests) + E2E eval test
Stub tests (free tier): verify template content — three modes, gate verdict, session continuity, cost tracking, cross-model comparison, binary discovery, error handling, mktemp usage, and integrations into /review, /ship, /plan-eng-review. E2E test (paid tier): runs /codex review on vulnerable fixture repo via session-runner, verifies output contains findings and GATE verdict.
This commit is contained in:
parent
d5e6dd3abd
commit
c9cead34e2
|
|
@ -2841,6 +2841,76 @@ Output the diagram directly.`,
|
||||||
}, 180_000);
|
}, 180_000);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// --- Codex skill E2E ---
|
||||||
|
|
||||||
|
/**
 * End-to-end check of the /codex skill in review mode.
 *
 * Builds a throwaway git repo whose `main` branch holds a clean base commit and
 * whose `feature/add-vuln` branch adds the vulnerable review fixture, copies the
 * codex skill file next to it, then asks the agent to run `/codex review` and
 * write the result to `codex-output.md` inside that repo.
 */
describeIfSelected('Codex skill E2E', ['codex-review'], () => {
  let codexDir: string;

  beforeAll(() => {
    codexDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-codex-'));

    // Run a setup command inside the fixture repo and fail fast when it does not
    // succeed; otherwise a broken fixture would only show up later as a
    // confusing agent-side failure.
    const run = (cmd: string, args: string[]) => {
      const res = spawnSync(cmd, args, { cwd: codexDir, stdio: 'pipe', timeout: 5000 });
      if (res.error || res.status !== 0) {
        const detail = res.error?.message ?? res.stderr?.toString().trim() ?? '';
        throw new Error(
          `Codex E2E fixture setup failed: ${cmd} ${args.join(' ')} (status ${res.status}) ${detail}`,
        );
      }
      return res;
    };

    run('git', ['init']);
    // Pin the initial branch name: the prompt below tells the agent to diff
    // against `main`, but a plain `git init` uses whatever init.defaultBranch
    // resolves to on the host (historically `master`).
    run('git', ['symbolic-ref', 'HEAD', 'refs/heads/main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);

    // Commit a clean base on main
    fs.writeFileSync(path.join(codexDir, 'app.rb'), '# clean base\nclass App\nend\n');
    run('git', ['add', 'app.rb']);
    run('git', ['commit', '-m', 'initial commit']);

    // Create feature branch with vulnerable code (reuse review fixture)
    run('git', ['checkout', '-b', 'feature/add-vuln']);
    const vulnContent = fs.readFileSync(path.join(ROOT, 'test', 'fixtures', 'review-eval-vuln.rb'), 'utf-8');
    fs.writeFileSync(path.join(codexDir, 'user_controller.rb'), vulnContent);
    run('git', ['add', 'user_controller.rb']);
    run('git', ['commit', '-m', 'add vulnerable controller']);

    // Copy the codex skill file
    fs.copyFileSync(path.join(ROOT, 'codex', 'SKILL.md'), path.join(codexDir, 'codex-SKILL.md'));
  });

  afterAll(() => {
    // Best-effort cleanup: a leftover temp dir must not fail the suite.
    try { fs.rmSync(codexDir, { recursive: true, force: true }); } catch {}
  });

  test('/codex review produces findings and GATE verdict', async () => {
    // Check codex is available — skip if not installed.
    // The early return makes the test pass without running the agent.
    const codexCheck = spawnSync('which', ['codex'], { stdio: 'pipe', timeout: 3000 });
    if (codexCheck.status !== 0) {
      console.warn('codex CLI not installed — skipping E2E test');
      return;
    }

    const result = await runSkillTest({
      prompt: `You are in a git repo on branch feature/add-vuln with changes against main.
Read codex-SKILL.md for the /codex skill instructions.
Run /codex review to review the current diff against main.
Write the full output (including the GATE verdict) to ${codexDir}/codex-output.md`,
      workingDirectory: codexDir,
      maxTurns: 10,
      timeout: 300_000,
      testName: 'codex-review',
      runId,
    });

    logCost('/codex review', result);
    recordE2E('/codex review', 'Codex skill E2E', result);
    expect(result.exitReason).toBe('success');

    // The prompt requires the agent to write this file, so its absence is a
    // failure rather than a reason to skip the content assertions.
    const outputPath = path.join(codexDir, 'codex-output.md');
    expect(fs.existsSync(outputPath)).toBe(true);

    const output = fs.readFileSync(outputPath, 'utf-8');
    // Should contain the CODEX SAYS header or GATE verdict
    const hasCodexOutput = output.includes('CODEX') || output.includes('GATE') || output.includes('codex');
    expect(hasCodexOutput).toBe(true);
  }, 360_000);
});
|
||||||
|
|
||||||
// Module-level afterAll — finalize eval collector after all tests complete
|
// Module-level afterAll — finalize eval collector after all tests complete
|
||||||
afterAll(async () => {
|
afterAll(async () => {
|
||||||
if (evalCollector) {
|
if (evalCollector) {
|
||||||
|
|
|
||||||
|
|
@ -1121,3 +1121,95 @@ describe('QA report template', () => {
|
||||||
expect(content).toContain('**Precondition:**');
|
expect(content).toContain('**Precondition:**');
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// --- Codex skill validation ---
|
||||||
|
|
||||||
|
/**
 * Static content checks for the Codex skill template and for the places where
 * other skills (/review, /ship, /plan-eng-review) reference it. Each test reads
 * a SKILL.md from disk and asserts that specific marker strings are present
 * (or, for the hardcoded binary path, absent).
 */
describe('Codex skill', () => {
  // Read `<ROOT>/<skillDir>/SKILL.md` as UTF-8 text.
  const readSkill = (skillDir: string): string =>
    fs.readFileSync(path.join(ROOT, skillDir, 'SKILL.md'), 'utf-8');

  // Assert that the skill file contains every marker, in the order given.
  const expectMarkers = (skillDir: string, markers: readonly string[]): void => {
    const body = readSkill(skillDir);
    for (const marker of markers) {
      expect(body).toContain(marker);
    }
  };

  test('codex/SKILL.md exists and has correct frontmatter', () => {
    expectMarkers('codex', ['name: codex', 'version: 1.0.0', 'allowed-tools:']);
  });

  test('codex/SKILL.md contains all three modes', () => {
    expectMarkers('codex', ['Step 2A: Review Mode', 'Step 2B: Challenge', 'Step 2C: Consult Mode']);
  });

  test('codex/SKILL.md contains gate verdict logic', () => {
    expectMarkers('codex', ['[P1]', 'GATE: PASS', 'GATE: FAIL']);
  });

  test('codex/SKILL.md contains session continuity', () => {
    expectMarkers('codex', ['codex-session-id', 'codex exec resume']);
  });

  test('codex/SKILL.md contains cost tracking', () => {
    expectMarkers('codex', ['tokens used', 'Est. cost']);
  });

  test('codex/SKILL.md contains cross-model comparison', () => {
    expectMarkers('codex', ['CROSS-MODEL ANALYSIS', 'Agreement rate']);
  });

  test('codex/SKILL.md contains review log persistence', () => {
    expectMarkers('codex', ['codex-review', 'reviews.jsonl']);
  });

  test('codex/SKILL.md uses which for binary discovery, not hardcoded path', () => {
    const body = readSkill('codex');
    expect(body).toContain('which codex');
    // A machine-specific install path must not be baked into the template.
    expect(body).not.toContain('/opt/homebrew/bin/codex');
  });

  test('codex/SKILL.md contains error handling for missing binary and API key', () => {
    expectMarkers('codex', ['NOT_FOUND', 'OPENAI_API_KEY']);
  });

  test('codex/SKILL.md uses mktemp for temp files', () => {
    expectMarkers('codex', ['mktemp']);
  });

  test('codex integration in /review offers second opinion', () => {
    expectMarkers('review', ['Codex second opinion', 'codex review', 'adversarial']);
  });

  test('codex integration in /ship offers review gate', () => {
    expectMarkers('ship', ['Codex', 'codex review', 'codex-review']);
  });

  test('codex integration in /plan-eng-review offers plan critique', () => {
    expectMarkers('plan-eng-review', ['Codex', 'codex exec']);
  });

  test('Review Readiness Dashboard includes Codex Review row', () => {
    expectMarkers('ship', ['Codex Review', 'codex-review']);
  });
});
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue