mirror of https://github.com/garrytan/gstack.git
test: Review Army — 14 diff-scope tests + 7 E2E tests
- test/diff-scope.test.ts: 14 tests for all 9 scope signals
- test/skill-e2e-review-army.test.ts: 7 E2E tests
Gate: migration safety, N+1 detection, delivery audit,
quality score, JSON findings
Periodic: red team, consensus
- Updated gen-skill-docs tests for new review structure
- Added touchfile entries and tier classifications
This commit is contained in:
parent
d205b8220c
commit
aca352072c
|
|
@ -0,0 +1,165 @@
|
||||||
|
/**
|
||||||
|
* Tests for bin/gstack-diff-scope — verifies scope signal detection.
|
||||||
|
*
|
||||||
|
* Creates temp git repos with specific file patterns and verifies
|
||||||
|
* the correct SCOPE_* variables are output.
|
||||||
|
*/
|
||||||
|
import { describe, test, expect, afterAll } from 'bun:test';
|
||||||
|
import { mkdtempSync, writeFileSync, mkdirSync, rmSync } from 'fs';
|
||||||
|
import { join } from 'path';
|
||||||
|
import { tmpdir } from 'os';
|
||||||
|
import { spawnSync } from 'child_process';
|
||||||
|
|
||||||
|
const SCRIPT = join(import.meta.dir, '..', 'bin', 'gstack-diff-scope');
|
||||||
|
|
||||||
|
const dirs: string[] = [];
|
||||||
|
|
||||||
|
function createRepo(files: string[]): string {
|
||||||
|
const dir = mkdtempSync(join(tmpdir(), 'diff-scope-test-'));
|
||||||
|
dirs.push(dir);
|
||||||
|
|
||||||
|
const run = (cmd: string, args: string[]) =>
|
||||||
|
spawnSync(cmd, args, { cwd: dir, stdio: 'pipe', timeout: 5000 });
|
||||||
|
|
||||||
|
run('git', ['init', '-b', 'main']);
|
||||||
|
run('git', ['config', 'user.email', 'test@test.com']);
|
||||||
|
run('git', ['config', 'user.name', 'Test']);
|
||||||
|
|
||||||
|
// Base commit
|
||||||
|
writeFileSync(join(dir, 'README.md'), '# test\n');
|
||||||
|
run('git', ['add', '.']);
|
||||||
|
run('git', ['commit', '-m', 'initial']);
|
||||||
|
|
||||||
|
// Feature branch with specified files
|
||||||
|
run('git', ['checkout', '-b', 'feature/test']);
|
||||||
|
for (const f of files) {
|
||||||
|
const fullPath = join(dir, f);
|
||||||
|
const dirPath = fullPath.substring(0, fullPath.lastIndexOf('/'));
|
||||||
|
if (dirPath !== dir) mkdirSync(dirPath, { recursive: true });
|
||||||
|
writeFileSync(fullPath, '# test content\n');
|
||||||
|
}
|
||||||
|
run('git', ['add', '.']);
|
||||||
|
run('git', ['commit', '-m', 'add files']);
|
||||||
|
|
||||||
|
return dir;
|
||||||
|
}
|
||||||
|
|
||||||
|
function runScope(dir: string): Record<string, string> {
|
||||||
|
const result = spawnSync('bash', [SCRIPT, 'main'], {
|
||||||
|
cwd: dir, stdio: 'pipe', timeout: 5000,
|
||||||
|
});
|
||||||
|
const output = result.stdout.toString().trim();
|
||||||
|
const vars: Record<string, string> = {};
|
||||||
|
for (const line of output.split('\n')) {
|
||||||
|
const [key, val] = line.split('=');
|
||||||
|
if (key && val) vars[key] = val;
|
||||||
|
}
|
||||||
|
return vars;
|
||||||
|
}
|
||||||
|
|
||||||
|
afterAll(() => {
|
||||||
|
for (const d of dirs) {
|
||||||
|
try { rmSync(d, { recursive: true, force: true }); } catch {}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('gstack-diff-scope', () => {
|
||||||
|
// --- Existing scope signals ---
|
||||||
|
|
||||||
|
test('detects frontend files', () => {
|
||||||
|
const dir = createRepo(['styles.css', 'component.tsx']);
|
||||||
|
const scope = runScope(dir);
|
||||||
|
expect(scope.SCOPE_FRONTEND).toBe('true');
|
||||||
|
});
|
||||||
|
|
||||||
|
test('detects backend files', () => {
|
||||||
|
const dir = createRepo(['app.rb', 'service.py']);
|
||||||
|
const scope = runScope(dir);
|
||||||
|
expect(scope.SCOPE_BACKEND).toBe('true');
|
||||||
|
});
|
||||||
|
|
||||||
|
test('detects test files', () => {
|
||||||
|
const dir = createRepo(['test/app.test.ts']);
|
||||||
|
const scope = runScope(dir);
|
||||||
|
expect(scope.SCOPE_TESTS).toBe('true');
|
||||||
|
});
|
||||||
|
|
||||||
|
// --- New scope signals (Review Army) ---
|
||||||
|
|
||||||
|
test('detects migrations via db/migrate/', () => {
|
||||||
|
const dir = createRepo(['db/migrate/20260330_create_users.rb']);
|
||||||
|
const scope = runScope(dir);
|
||||||
|
expect(scope.SCOPE_MIGRATIONS).toBe('true');
|
||||||
|
});
|
||||||
|
|
||||||
|
test('detects migrations via generic migrations/', () => {
|
||||||
|
const dir = createRepo(['app/migrations/0001_initial.py']);
|
||||||
|
const scope = runScope(dir);
|
||||||
|
expect(scope.SCOPE_MIGRATIONS).toBe('true');
|
||||||
|
});
|
||||||
|
|
||||||
|
test('detects migrations via prisma', () => {
|
||||||
|
const dir = createRepo(['prisma/migrations/20260330/migration.sql']);
|
||||||
|
const scope = runScope(dir);
|
||||||
|
expect(scope.SCOPE_MIGRATIONS).toBe('true');
|
||||||
|
});
|
||||||
|
|
||||||
|
test('detects API via controller files', () => {
|
||||||
|
const dir = createRepo(['app/controllers/users_controller.rb']);
|
||||||
|
const scope = runScope(dir);
|
||||||
|
expect(scope.SCOPE_API).toBe('true');
|
||||||
|
});
|
||||||
|
|
||||||
|
test('detects API via route files', () => {
|
||||||
|
const dir = createRepo(['src/routes/api.ts']);
|
||||||
|
const scope = runScope(dir);
|
||||||
|
expect(scope.SCOPE_API).toBe('true');
|
||||||
|
});
|
||||||
|
|
||||||
|
test('detects API via GraphQL schemas', () => {
|
||||||
|
const dir = createRepo(['schema.graphql']);
|
||||||
|
const scope = runScope(dir);
|
||||||
|
expect(scope.SCOPE_API).toBe('true');
|
||||||
|
});
|
||||||
|
|
||||||
|
test('detects auth files', () => {
|
||||||
|
const dir = createRepo(['app/services/auth_service.rb']);
|
||||||
|
const scope = runScope(dir);
|
||||||
|
expect(scope.SCOPE_AUTH).toBe('true');
|
||||||
|
});
|
||||||
|
|
||||||
|
test('detects session files', () => {
|
||||||
|
const dir = createRepo(['lib/session_manager.ts']);
|
||||||
|
const scope = runScope(dir);
|
||||||
|
expect(scope.SCOPE_AUTH).toBe('true');
|
||||||
|
});
|
||||||
|
|
||||||
|
test('detects JWT files', () => {
|
||||||
|
const dir = createRepo(['utils/jwt_helper.py']);
|
||||||
|
const scope = runScope(dir);
|
||||||
|
expect(scope.SCOPE_AUTH).toBe('true');
|
||||||
|
});
|
||||||
|
|
||||||
|
test('returns false for all new signals when no matching files', () => {
|
||||||
|
const dir = createRepo(['docs/readme.md', 'config.yml']);
|
||||||
|
const scope = runScope(dir);
|
||||||
|
expect(scope.SCOPE_MIGRATIONS).toBe('false');
|
||||||
|
expect(scope.SCOPE_API).toBe('false');
|
||||||
|
expect(scope.SCOPE_AUTH).toBe('false');
|
||||||
|
});
|
||||||
|
|
||||||
|
test('outputs all 9 scope variables', () => {
|
||||||
|
const dir = createRepo(['app.ts']);
|
||||||
|
const scope = runScope(dir);
|
||||||
|
expect(Object.keys(scope)).toHaveLength(9);
|
||||||
|
expect(scope).toHaveProperty('SCOPE_FRONTEND');
|
||||||
|
expect(scope).toHaveProperty('SCOPE_BACKEND');
|
||||||
|
expect(scope).toHaveProperty('SCOPE_PROMPTS');
|
||||||
|
expect(scope).toHaveProperty('SCOPE_TESTS');
|
||||||
|
expect(scope).toHaveProperty('SCOPE_DOCS');
|
||||||
|
expect(scope).toHaveProperty('SCOPE_CONFIG');
|
||||||
|
expect(scope).toHaveProperty('SCOPE_MIGRATIONS');
|
||||||
|
expect(scope).toHaveProperty('SCOPE_API');
|
||||||
|
expect(scope).toHaveProperty('SCOPE_AUTH');
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
@ -0,0 +1,5 @@
|
||||||
|
-- Migration: Drop user email column
|
||||||
|
-- WARNING: This migration is intentionally unsafe for testing
|
||||||
|
ALTER TABLE users DROP COLUMN email;
|
||||||
|
ALTER TABLE users DROP COLUMN phone_number;
|
||||||
|
-- No backfill, no reversibility check, no data preservation
|
||||||
|
|
@ -0,0 +1,12 @@
|
||||||
|
# N+1 query example — intentionally bad for testing
|
||||||
|
class PostsController
|
||||||
|
def index
|
||||||
|
@posts = Post.all
|
||||||
|
@posts.each do |post|
|
||||||
|
# N+1: queries Author table for every post
|
||||||
|
puts post.author.name
|
||||||
|
# N+1: queries Comments table for every post
|
||||||
|
puts post.comments.count
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
@ -607,7 +607,8 @@ describe('TEST_COVERAGE_AUDIT placeholders', () => {
|
||||||
const shipSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
|
const shipSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
|
||||||
const reviewSkill = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
|
const reviewSkill = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
|
||||||
|
|
||||||
test('all three modes share codepath tracing methodology', () => {
|
test('plan and ship modes share codepath tracing methodology', () => {
|
||||||
|
// Review mode delegates test coverage to the Testing specialist subagent (Review Army)
|
||||||
const sharedPhrases = [
|
const sharedPhrases = [
|
||||||
'Trace data flow',
|
'Trace data flow',
|
||||||
'Diagram the execution',
|
'Diagram the execution',
|
||||||
|
|
@ -619,33 +620,40 @@ describe('TEST_COVERAGE_AUDIT placeholders', () => {
|
||||||
for (const phrase of sharedPhrases) {
|
for (const phrase of sharedPhrases) {
|
||||||
expect(planSkill).toContain(phrase);
|
expect(planSkill).toContain(phrase);
|
||||||
expect(shipSkill).toContain(phrase);
|
expect(shipSkill).toContain(phrase);
|
||||||
expect(reviewSkill).toContain(phrase);
|
|
||||||
}
|
}
|
||||||
// Plan mode traces the plan, not a git diff
|
// Plan mode traces the plan, not a git diff
|
||||||
expect(planSkill).toContain('Trace every codepath in the plan');
|
expect(planSkill).toContain('Trace every codepath in the plan');
|
||||||
expect(planSkill).not.toContain('git diff origin');
|
expect(planSkill).not.toContain('git diff origin');
|
||||||
// Ship and review modes trace the diff
|
// Ship mode traces the diff
|
||||||
expect(shipSkill).toContain('Trace every codepath changed');
|
expect(shipSkill).toContain('Trace every codepath changed');
|
||||||
expect(reviewSkill).toContain('Trace every codepath changed');
|
|
||||||
});
|
});
|
||||||
|
|
||||||
test('all three modes include E2E decision matrix', () => {
|
test('review mode uses Review Army for specialist dispatch', () => {
|
||||||
for (const skill of [planSkill, shipSkill, reviewSkill]) {
|
expect(reviewSkill).toContain('Review Army');
|
||||||
|
expect(reviewSkill).toContain('Specialist Dispatch');
|
||||||
|
expect(reviewSkill).toContain('testing.md');
|
||||||
|
});
|
||||||
|
|
||||||
|
test('plan and ship modes include E2E decision matrix', () => {
|
||||||
|
// Review mode delegates to Testing specialist
|
||||||
|
for (const skill of [planSkill, shipSkill]) {
|
||||||
expect(skill).toContain('E2E Test Decision Matrix');
|
expect(skill).toContain('E2E Test Decision Matrix');
|
||||||
expect(skill).toContain('→E2E');
|
expect(skill).toContain('→E2E');
|
||||||
expect(skill).toContain('→EVAL');
|
expect(skill).toContain('→EVAL');
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
test('all three modes include regression rule', () => {
|
test('plan and ship modes include regression rule', () => {
|
||||||
for (const skill of [planSkill, shipSkill, reviewSkill]) {
|
// Review mode delegates to Testing specialist
|
||||||
|
for (const skill of [planSkill, shipSkill]) {
|
||||||
expect(skill).toContain('REGRESSION RULE');
|
expect(skill).toContain('REGRESSION RULE');
|
||||||
expect(skill).toContain('IRON RULE');
|
expect(skill).toContain('IRON RULE');
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
test('all three modes include test framework detection', () => {
|
test('plan and ship modes include test framework detection', () => {
|
||||||
for (const skill of [planSkill, shipSkill, reviewSkill]) {
|
// Review mode delegates to Testing specialist
|
||||||
|
for (const skill of [planSkill, shipSkill]) {
|
||||||
expect(skill).toContain('Test Framework Detection');
|
expect(skill).toContain('Test Framework Detection');
|
||||||
expect(skill).toContain('CLAUDE.md');
|
expect(skill).toContain('CLAUDE.md');
|
||||||
}
|
}
|
||||||
|
|
@ -664,11 +672,12 @@ describe('TEST_COVERAGE_AUDIT placeholders', () => {
|
||||||
expect(shipSkill).toContain('ship-test-plan');
|
expect(shipSkill).toContain('ship-test-plan');
|
||||||
});
|
});
|
||||||
|
|
||||||
test('review mode generates via Fix-First + gaps are INFORMATIONAL', () => {
|
test('review mode uses Fix-First + Review Army for specialist coverage', () => {
|
||||||
expect(reviewSkill).toContain('Fix-First');
|
expect(reviewSkill).toContain('Fix-First');
|
||||||
expect(reviewSkill).toContain('INFORMATIONAL');
|
expect(reviewSkill).toContain('INFORMATIONAL');
|
||||||
expect(reviewSkill).toContain('Step 4.75');
|
// Review Army handles test coverage via Testing specialist subagent
|
||||||
expect(reviewSkill).toContain('subsumes the "Test Gaps" category');
|
expect(reviewSkill).toContain('Review Army');
|
||||||
|
expect(reviewSkill).toContain('Testing');
|
||||||
});
|
});
|
||||||
|
|
||||||
test('plan mode does NOT include ship-specific content', () => {
|
test('plan mode does NOT include ship-specific content', () => {
|
||||||
|
|
@ -683,6 +692,35 @@ describe('TEST_COVERAGE_AUDIT placeholders', () => {
|
||||||
expect(reviewSkill).not.toContain('ship-test-plan');
|
expect(reviewSkill).not.toContain('ship-test-plan');
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test('review/specialists/ directory has all expected checklist files', () => {
|
||||||
|
const specDir = path.join(ROOT, 'review', 'specialists');
|
||||||
|
const expected = [
|
||||||
|
'testing.md',
|
||||||
|
'maintainability.md',
|
||||||
|
'security.md',
|
||||||
|
'performance.md',
|
||||||
|
'data-migration.md',
|
||||||
|
'api-contract.md',
|
||||||
|
'red-team.md',
|
||||||
|
];
|
||||||
|
for (const f of expected) {
|
||||||
|
expect(fs.existsSync(path.join(specDir, f))).toBe(true);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
test('each specialist file has standard header with scope and output format', () => {
|
||||||
|
const specDir = path.join(ROOT, 'review', 'specialists');
|
||||||
|
const files = fs.readdirSync(specDir).filter(f => f.endsWith('.md'));
|
||||||
|
for (const f of files) {
|
||||||
|
const content = fs.readFileSync(path.join(specDir, f), 'utf-8');
|
||||||
|
// All specialist files must have Scope and Output/JSON in header
|
||||||
|
expect(content).toContain('Scope:');
|
||||||
|
expect(content.toLowerCase()).toMatch(/output|json/);
|
||||||
|
// Must define NO FINDINGS behavior
|
||||||
|
expect(content).toContain('NO FINDINGS');
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
// Regression guard: ship output contains key phrases from before the refactor
|
// Regression guard: ship output contains key phrases from before the refactor
|
||||||
test('ship SKILL.md regression guard — key phrases preserved', () => {
|
test('ship SKILL.md regression guard — key phrases preserved', () => {
|
||||||
const regressionPhrases = [
|
const regressionPhrases = [
|
||||||
|
|
@ -870,12 +908,9 @@ describe('Coverage gate in ship', () => {
|
||||||
expect(shipSkill).toContain('could not determine percentage — skipping');
|
expect(shipSkill).toContain('could not determine percentage — skipping');
|
||||||
});
|
});
|
||||||
|
|
||||||
test('review SKILL.md contains coverage WARNING', () => {
|
test('review SKILL.md delegates coverage to Testing specialist', () => {
|
||||||
expect(reviewSkill).toContain('COVERAGE WARNING');
|
// Coverage audit moved to Testing specialist subagent in Review Army
|
||||||
expect(reviewSkill).toContain('Consider writing tests before running /ship');
|
expect(reviewSkill).toContain('testing.md');
|
||||||
});
|
|
||||||
|
|
||||||
test('review coverage warning is INFORMATIONAL', () => {
|
|
||||||
expect(reviewSkill).toContain('INFORMATIONAL');
|
expect(reviewSkill).toContain('INFORMATIONAL');
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
@ -1604,10 +1639,9 @@ describe('Codex generation (--host codex)', () => {
|
||||||
const content = fs.readFileSync(path.join(AGENTS_DIR, 'gstack-review', 'SKILL.md'), 'utf-8');
|
const content = fs.readFileSync(path.join(AGENTS_DIR, 'gstack-review', 'SKILL.md'), 'utf-8');
|
||||||
// Correct: references to sidecar files use gstack/review/ path
|
// Correct: references to sidecar files use gstack/review/ path
|
||||||
expect(content).toContain('.agents/skills/gstack/review/checklist.md');
|
expect(content).toContain('.agents/skills/gstack/review/checklist.md');
|
||||||
expect(content).toContain('.agents/skills/gstack/review/design-checklist.md');
|
// design-checklist.md is now referenced via Review Army specialist (Claude only, stripped for Codex)
|
||||||
// Wrong: must NOT reference gstack-review/checklist.md (file doesn't exist there)
|
// Wrong: must NOT reference gstack-review/checklist.md (file doesn't exist there)
|
||||||
expect(content).not.toContain('.agents/skills/gstack-review/checklist.md');
|
expect(content).not.toContain('.agents/skills/gstack-review/checklist.md');
|
||||||
expect(content).not.toContain('.agents/skills/gstack-review/design-checklist.md');
|
|
||||||
});
|
});
|
||||||
|
|
||||||
test('sidecar paths in ship skill point to gstack/review/ for pre-landing review', () => {
|
test('sidecar paths in ship skill point to gstack/review/ for pre-landing review', () => {
|
||||||
|
|
|
||||||
|
|
@ -59,6 +59,15 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||||
'review-base-branch': ['review/**'],
|
'review-base-branch': ['review/**'],
|
||||||
'review-design-lite': ['review/**', 'test/fixtures/review-eval-design-slop.*'],
|
'review-design-lite': ['review/**', 'test/fixtures/review-eval-design-slop.*'],
|
||||||
|
|
||||||
|
// Review Army (specialist dispatch)
|
||||||
|
'review-army-migration-safety': ['review/**', 'scripts/resolvers/review-army.ts', 'bin/gstack-diff-scope'],
|
||||||
|
'review-army-perf-n-plus-one': ['review/**', 'scripts/resolvers/review-army.ts', 'bin/gstack-diff-scope'],
|
||||||
|
'review-army-delivery-audit': ['review/**', 'scripts/resolvers/review.ts', 'scripts/resolvers/review-army.ts'],
|
||||||
|
'review-army-quality-score': ['review/**', 'scripts/resolvers/review-army.ts'],
|
||||||
|
'review-army-json-findings': ['review/**', 'scripts/resolvers/review-army.ts'],
|
||||||
|
'review-army-red-team': ['review/**', 'scripts/resolvers/review-army.ts'],
|
||||||
|
'review-army-consensus': ['review/**', 'scripts/resolvers/review-army.ts'],
|
||||||
|
|
||||||
// Office Hours
|
// Office Hours
|
||||||
'office-hours-spec-review': ['office-hours/**', 'scripts/gen-skill-docs.ts'],
|
'office-hours-spec-review': ['office-hours/**', 'scripts/gen-skill-docs.ts'],
|
||||||
|
|
||||||
|
|
@ -204,6 +213,15 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
|
||||||
'review-plan-completion': 'gate',
|
'review-plan-completion': 'gate',
|
||||||
'review-dashboard-via': 'gate',
|
'review-dashboard-via': 'gate',
|
||||||
|
|
||||||
|
// Review Army — gate for core functionality, periodic for multi-specialist
|
||||||
|
'review-army-migration-safety': 'gate', // Specialist activation guardrail
|
||||||
|
'review-army-perf-n-plus-one': 'gate', // Specialist activation guardrail
|
||||||
|
'review-army-delivery-audit': 'gate', // Delivery integrity guardrail
|
||||||
|
'review-army-quality-score': 'gate', // Score computation
|
||||||
|
'review-army-json-findings': 'gate', // JSON schema compliance
|
||||||
|
'review-army-red-team': 'periodic', // Multi-agent coordination
|
||||||
|
'review-army-consensus': 'periodic', // Multi-specialist agreement
|
||||||
|
|
||||||
// Office Hours
|
// Office Hours
|
||||||
'office-hours-spec-review': 'gate',
|
'office-hours-spec-review': 'gate',
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,562 @@
|
||||||
|
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
|
||||||
|
import { runSkillTest } from './helpers/session-runner';
|
||||||
|
import {
|
||||||
|
ROOT, runId, describeIfSelected, testConcurrentIfSelected,
|
||||||
|
logCost, recordE2E, createEvalCollector, finalizeEvalCollector,
|
||||||
|
} from './helpers/e2e-helpers';
|
||||||
|
import { spawnSync } from 'child_process';
|
||||||
|
import * as fs from 'fs';
|
||||||
|
import * as path from 'path';
|
||||||
|
import * as os from 'os';
|
||||||
|
|
||||||
|
const evalCollector = createEvalCollector('e2e-review-army');
|
||||||
|
|
||||||
|
// Helper: create a git repo with a feature branch
|
||||||
|
function setupRepo(prefix: string): { dir: string; run: (cmd: string, args: string[]) => void } {
|
||||||
|
const dir = fs.mkdtempSync(path.join(os.tmpdir(), `skill-e2e-${prefix}-`));
|
||||||
|
const run = (cmd: string, args: string[]) =>
|
||||||
|
spawnSync(cmd, args, { cwd: dir, stdio: 'pipe', timeout: 5000 });
|
||||||
|
run('git', ['init', '-b', 'main']);
|
||||||
|
run('git', ['config', 'user.email', 'test@test.com']);
|
||||||
|
run('git', ['config', 'user.name', 'Test']);
|
||||||
|
return { dir, run };
|
||||||
|
}
|
||||||
|
|
||||||
|
// Helper: copy review skill files to test dir
|
||||||
|
function copyReviewFiles(dir: string) {
|
||||||
|
fs.copyFileSync(path.join(ROOT, 'review', 'SKILL.md'), path.join(dir, 'review-SKILL.md'));
|
||||||
|
fs.copyFileSync(path.join(ROOT, 'review', 'checklist.md'), path.join(dir, 'review-checklist.md'));
|
||||||
|
fs.copyFileSync(path.join(ROOT, 'review', 'greptile-triage.md'), path.join(dir, 'review-greptile-triage.md'));
|
||||||
|
// Copy specialist checklists
|
||||||
|
const specDir = path.join(dir, 'review-specialists');
|
||||||
|
fs.mkdirSync(specDir, { recursive: true });
|
||||||
|
const specialistsRoot = path.join(ROOT, 'review', 'specialists');
|
||||||
|
for (const f of fs.readdirSync(specialistsRoot)) {
|
||||||
|
fs.copyFileSync(path.join(specialistsRoot, f), path.join(specDir, f));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Review Army: Migration Safety ---
|
||||||
|
|
||||||
|
describeIfSelected('Review Army: Migration Safety', ['review-army-migration-safety'], () => {
|
||||||
|
let dir: string;
|
||||||
|
|
||||||
|
beforeAll(() => {
|
||||||
|
const repo = setupRepo('army-migration');
|
||||||
|
dir = repo.dir;
|
||||||
|
|
||||||
|
// Base commit
|
||||||
|
fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
|
||||||
|
repo.run('git', ['add', '.']);
|
||||||
|
repo.run('git', ['commit', '-m', 'initial']);
|
||||||
|
|
||||||
|
// Feature branch with unsafe migration
|
||||||
|
repo.run('git', ['checkout', '-b', 'feature/drop-columns']);
|
||||||
|
fs.mkdirSync(path.join(dir, 'db', 'migrate'), { recursive: true });
|
||||||
|
const migrationContent = fs.readFileSync(
|
||||||
|
path.join(ROOT, 'test', 'fixtures', 'review-army-migration.sql'), 'utf-8'
|
||||||
|
);
|
||||||
|
fs.writeFileSync(path.join(dir, 'db', 'migrate', '20260330_drop_columns.sql'), migrationContent);
|
||||||
|
repo.run('git', ['add', '.']);
|
||||||
|
repo.run('git', ['commit', '-m', 'drop email and phone columns']);
|
||||||
|
|
||||||
|
copyReviewFiles(dir);
|
||||||
|
});
|
||||||
|
|
||||||
|
afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
|
||||||
|
|
||||||
|
testConcurrentIfSelected('review-army-migration-safety', async () => {
|
||||||
|
const result = await runSkillTest({
|
||||||
|
prompt: `You are in a git repo on a feature branch with a database migration that drops columns.
|
||||||
|
Read review-SKILL.md for instructions. Also read review-checklist.md.
|
||||||
|
The specialist checklists are in review-specialists/ (testing.md, security.md, performance.md, data-migration.md, etc.).
|
||||||
|
|
||||||
|
Skip the preamble, lake intro, telemetry sections.
|
||||||
|
Run Step 4 (Critical pass) then Step 4.5 (Review Army — Specialist Dispatch).
|
||||||
|
The base branch is main. Run gstack-diff-scope style analysis on the changed files.
|
||||||
|
Since db/migrate/ files changed, the Data Migration specialist should activate.
|
||||||
|
|
||||||
|
For the specialist dispatch, instead of launching subagents, just read review-specialists/data-migration.md
|
||||||
|
and apply it yourself against the diff (git diff main...HEAD).
|
||||||
|
|
||||||
|
Write your findings to ${dir}/review-output.md`,
|
||||||
|
workingDirectory: dir,
|
||||||
|
maxTurns: 20,
|
||||||
|
timeout: 180_000,
|
||||||
|
testName: 'review-army-migration-safety',
|
||||||
|
runId,
|
||||||
|
});
|
||||||
|
|
||||||
|
logCost('/review army migration', result);
|
||||||
|
recordE2E(evalCollector, '/review army migration safety', 'Review Army', result);
|
||||||
|
expect(result.exitReason).toBe('success');
|
||||||
|
|
||||||
|
// Verify migration issues were caught
|
||||||
|
const outputPath = path.join(dir, 'review-output.md');
|
||||||
|
if (fs.existsSync(outputPath)) {
|
||||||
|
const content = fs.readFileSync(outputPath, 'utf-8').toLowerCase();
|
||||||
|
const hasMigrationFinding =
|
||||||
|
content.includes('drop') ||
|
||||||
|
content.includes('data loss') ||
|
||||||
|
content.includes('reversib') ||
|
||||||
|
content.includes('migration') ||
|
||||||
|
content.includes('column');
|
||||||
|
expect(hasMigrationFinding).toBe(true);
|
||||||
|
}
|
||||||
|
}, 210_000);
|
||||||
|
});
|
||||||
|
|
||||||
|
// --- Review Army: N+1 Performance ---
|
||||||
|
|
||||||
|
describeIfSelected('Review Army: N+1 Performance', ['review-army-perf-n-plus-one'], () => {
|
||||||
|
let dir: string;
|
||||||
|
|
||||||
|
beforeAll(() => {
|
||||||
|
const repo = setupRepo('army-n-plus-one');
|
||||||
|
dir = repo.dir;
|
||||||
|
|
||||||
|
fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
|
||||||
|
repo.run('git', ['add', '.']);
|
||||||
|
repo.run('git', ['commit', '-m', 'initial']);
|
||||||
|
|
||||||
|
repo.run('git', ['checkout', '-b', 'feature/add-posts-index']);
|
||||||
|
const n1Content = fs.readFileSync(
|
||||||
|
path.join(ROOT, 'test', 'fixtures', 'review-army-n-plus-one.rb'), 'utf-8'
|
||||||
|
);
|
||||||
|
fs.writeFileSync(path.join(dir, 'posts_controller.rb'), n1Content);
|
||||||
|
repo.run('git', ['add', '.']);
|
||||||
|
repo.run('git', ['commit', '-m', 'add posts controller']);
|
||||||
|
|
||||||
|
copyReviewFiles(dir);
|
||||||
|
});
|
||||||
|
|
||||||
|
afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
|
||||||
|
|
||||||
|
testConcurrentIfSelected('review-army-perf-n-plus-one', async () => {
|
||||||
|
const result = await runSkillTest({
|
||||||
|
prompt: `You are in a git repo on a feature branch with a Ruby controller that has N+1 queries.
|
||||||
|
Read review-SKILL.md for instructions. Also read review-checklist.md.
|
||||||
|
The specialist checklists are in review-specialists/ (testing.md, performance.md, etc.).
|
||||||
|
|
||||||
|
Skip the preamble, lake intro, telemetry sections.
|
||||||
|
Run Step 4 (Critical pass) then Step 4.5 (Review Army).
|
||||||
|
The base branch is main. This is a Ruby backend file, so Performance specialist should activate.
|
||||||
|
|
||||||
|
For the specialist dispatch, read review-specialists/performance.md and apply it against the diff.
|
||||||
|
|
||||||
|
Write your findings to ${dir}/review-output.md`,
|
||||||
|
workingDirectory: dir,
|
||||||
|
maxTurns: 20,
|
||||||
|
timeout: 180_000,
|
||||||
|
testName: 'review-army-perf-n-plus-one',
|
||||||
|
runId,
|
||||||
|
});
|
||||||
|
|
||||||
|
logCost('/review army n+1', result);
|
||||||
|
recordE2E(evalCollector, '/review army N+1 detection', 'Review Army', result);
|
||||||
|
expect(result.exitReason).toBe('success');
|
||||||
|
|
||||||
|
const outputPath = path.join(dir, 'review-output.md');
|
||||||
|
if (fs.existsSync(outputPath)) {
|
||||||
|
const content = fs.readFileSync(outputPath, 'utf-8').toLowerCase();
|
||||||
|
const hasN1Finding =
|
||||||
|
content.includes('n+1') ||
|
||||||
|
content.includes('n + 1') ||
|
||||||
|
content.includes('eager') ||
|
||||||
|
content.includes('includes') ||
|
||||||
|
content.includes('preload') ||
|
||||||
|
content.includes('query') ||
|
||||||
|
content.includes('loop');
|
||||||
|
expect(hasN1Finding).toBe(true);
|
||||||
|
}
|
||||||
|
}, 210_000);
|
||||||
|
});
|
||||||
|
|
||||||
|
// --- Review Army: Delivery Audit ---
|
||||||
|
|
||||||
|
describeIfSelected('Review Army: Delivery Audit', ['review-army-delivery-audit'], () => {
|
||||||
|
let dir: string;
|
||||||
|
|
||||||
|
beforeAll(() => {
|
||||||
|
const repo = setupRepo('army-delivery');
|
||||||
|
dir = repo.dir;
|
||||||
|
|
||||||
|
fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
|
||||||
|
repo.run('git', ['add', '.']);
|
||||||
|
repo.run('git', ['commit', '-m', 'initial']);
|
||||||
|
|
||||||
|
repo.run('git', ['checkout', '-b', 'feature/three-features']);
|
||||||
|
|
||||||
|
// Write a plan file promising 3 features
|
||||||
|
fs.writeFileSync(path.join(dir, 'PLAN.md'), `# Feature Plan
|
||||||
|
|
||||||
|
## Implementation Items
|
||||||
|
1. Add user authentication with login/logout
|
||||||
|
2. Add user profile page with avatar upload
|
||||||
|
3. Add email notification system for new signups
|
||||||
|
|
||||||
|
## Test Items
|
||||||
|
- Test login flow
|
||||||
|
- Test profile page rendering
|
||||||
|
- Test email sending
|
||||||
|
`);
|
||||||
|
repo.run('git', ['add', 'PLAN.md']);
|
||||||
|
repo.run('git', ['commit', '-m', 'add plan']);
|
||||||
|
|
||||||
|
// Implement only 2 of 3 features
|
||||||
|
fs.writeFileSync(path.join(dir, 'auth.rb'), `class AuthController
|
||||||
|
def login
|
||||||
|
# authenticate user
|
||||||
|
session[:user_id] = user.id
|
||||||
|
end
|
||||||
|
|
||||||
|
def logout
|
||||||
|
session.delete(:user_id)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
`);
|
||||||
|
fs.writeFileSync(path.join(dir, 'profile.rb'), `class ProfileController
|
||||||
|
def show
|
||||||
|
@user = User.find(params[:id])
|
||||||
|
end
|
||||||
|
|
||||||
|
def update_avatar
|
||||||
|
@user.avatar.attach(params[:avatar])
|
||||||
|
end
|
||||||
|
end
|
||||||
|
`);
|
||||||
|
// NOTE: email notification system is NOT implemented (intentionally missing)
|
||||||
|
repo.run('git', ['add', '.']);
|
||||||
|
repo.run('git', ['commit', '-m', 'implement auth and profile features']);
|
||||||
|
|
||||||
|
copyReviewFiles(dir);
|
||||||
|
});
|
||||||
|
|
||||||
|
afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
|
||||||
|
|
||||||
|
testConcurrentIfSelected('review-army-delivery-audit', async () => {
|
||||||
|
const result = await runSkillTest({
|
||||||
|
prompt: `You are in a git repo on branch feature/three-features.
|
||||||
|
There is a PLAN.md file that promises 3 features: auth, profile, and email notifications.
|
||||||
|
The diff (git diff main...HEAD) only implements 2 of them (auth and profile).
|
||||||
|
|
||||||
|
Read review-SKILL.md for the review workflow. Focus on the Plan Completion Audit section.
|
||||||
|
The plan file is at ./PLAN.md. Cross-reference it against the diff.
|
||||||
|
|
||||||
|
For each plan item, classify as DONE, PARTIAL, NOT DONE, or CHANGED.
|
||||||
|
The email notification system should be classified as NOT DONE.
|
||||||
|
|
||||||
|
Write your completion audit to ${dir}/review-output.md`,
|
||||||
|
workingDirectory: dir,
|
||||||
|
maxTurns: 15,
|
||||||
|
timeout: 120_000,
|
||||||
|
testName: 'review-army-delivery-audit',
|
||||||
|
runId,
|
||||||
|
});
|
||||||
|
|
||||||
|
logCost('/review army delivery', result);
|
||||||
|
recordE2E(evalCollector, '/review army delivery audit', 'Review Army', result);
|
||||||
|
expect(result.exitReason).toBe('success');
|
||||||
|
|
||||||
|
const outputPath = path.join(dir, 'review-output.md');
|
||||||
|
if (fs.existsSync(outputPath)) {
|
||||||
|
const content = fs.readFileSync(outputPath, 'utf-8').toLowerCase();
|
||||||
|
// Should identify email notifications as NOT DONE
|
||||||
|
const hasNotDone =
|
||||||
|
content.includes('not done') ||
|
||||||
|
content.includes('not_done') ||
|
||||||
|
content.includes('missing') ||
|
||||||
|
content.includes('not implemented');
|
||||||
|
const mentionsEmail =
|
||||||
|
content.includes('email') ||
|
||||||
|
content.includes('notification');
|
||||||
|
expect(hasNotDone).toBe(true);
|
||||||
|
expect(mentionsEmail).toBe(true);
|
||||||
|
}
|
||||||
|
}, 150_000);
|
||||||
|
});
|
||||||
|
|
||||||
|
// --- Review Army: Quality Score ---
|
||||||
|
|
||||||
|
describeIfSelected('Review Army: Quality Score', ['review-army-quality-score'], () => {
|
||||||
|
let dir: string;
|
||||||
|
|
||||||
|
beforeAll(() => {
|
||||||
|
const repo = setupRepo('army-quality');
|
||||||
|
dir = repo.dir;
|
||||||
|
|
||||||
|
fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
|
||||||
|
repo.run('git', ['add', '.']);
|
||||||
|
repo.run('git', ['commit', '-m', 'initial']);
|
||||||
|
|
||||||
|
repo.run('git', ['checkout', '-b', 'feature/add-controller']);
|
||||||
|
// Code with obvious issues for quality score computation
|
||||||
|
fs.writeFileSync(path.join(dir, 'user_controller.rb'), `class UserController
|
||||||
|
def create
|
||||||
|
# SQL injection
|
||||||
|
User.where("name = '#{params[:name]}'")
|
||||||
|
# Magic number
|
||||||
|
if users.count > 42
|
||||||
|
raise "too many"
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
`);
|
||||||
|
repo.run('git', ['add', '.']);
|
||||||
|
repo.run('git', ['commit', '-m', 'add user controller']);
|
||||||
|
|
||||||
|
copyReviewFiles(dir);
|
||||||
|
});
|
||||||
|
|
||||||
|
afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
|
||||||
|
|
||||||
|
testConcurrentIfSelected('review-army-quality-score', async () => {
|
||||||
|
const result = await runSkillTest({
|
||||||
|
prompt: `You are in a git repo with a vulnerable user controller.
|
||||||
|
Read review-SKILL.md and review-checklist.md.
|
||||||
|
Skip preamble, lake intro, telemetry.
|
||||||
|
|
||||||
|
Run the Critical pass (Step 4) against the diff (git diff main...HEAD).
|
||||||
|
Then compute the PR Quality Score as described in the Review Army merge step:
|
||||||
|
quality_score = max(0, 10 - (critical_count * 2 + informational_count * 0.5))
|
||||||
|
|
||||||
|
Write your findings AND the computed quality score to ${dir}/review-output.md
|
||||||
|
Include the line: "PR Quality Score: X/10" where X is the computed score.`,
|
||||||
|
workingDirectory: dir,
|
||||||
|
maxTurns: 15,
|
||||||
|
timeout: 120_000,
|
||||||
|
testName: 'review-army-quality-score',
|
||||||
|
runId,
|
||||||
|
});
|
||||||
|
|
||||||
|
logCost('/review army quality', result);
|
||||||
|
recordE2E(evalCollector, '/review army quality score', 'Review Army', result);
|
||||||
|
expect(result.exitReason).toBe('success');
|
||||||
|
|
||||||
|
const outputPath = path.join(dir, 'review-output.md');
|
||||||
|
if (fs.existsSync(outputPath)) {
|
||||||
|
const content = fs.readFileSync(outputPath, 'utf-8');
|
||||||
|
// Should contain a quality score
|
||||||
|
const hasScore =
|
||||||
|
content.toLowerCase().includes('quality score') ||
|
||||||
|
content.match(/\d+\/10/);
|
||||||
|
expect(hasScore).toBeTruthy();
|
||||||
|
}
|
||||||
|
}, 150_000);
|
||||||
|
});
|
||||||
|
|
||||||
|
// --- Review Army: JSON Findings ---
|
||||||
|
|
||||||
|
describeIfSelected('Review Army: JSON Findings', ['review-army-json-findings'], () => {
|
||||||
|
let dir: string;
|
||||||
|
|
||||||
|
beforeAll(() => {
|
||||||
|
const repo = setupRepo('army-json');
|
||||||
|
dir = repo.dir;
|
||||||
|
|
||||||
|
fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
|
||||||
|
repo.run('git', ['add', '.']);
|
||||||
|
repo.run('git', ['commit', '-m', 'initial']);
|
||||||
|
|
||||||
|
repo.run('git', ['checkout', '-b', 'feature/vuln']);
|
||||||
|
fs.writeFileSync(path.join(dir, 'search.rb'), `class SearchController
|
||||||
|
def index
|
||||||
|
# SQL injection via string interpolation
|
||||||
|
results = ActiveRecord::Base.connection.execute(
|
||||||
|
"SELECT * FROM products WHERE name LIKE '%#{params[:q]}%'"
|
||||||
|
)
|
||||||
|
render json: results
|
||||||
|
end
|
||||||
|
end
|
||||||
|
`);
|
||||||
|
repo.run('git', ['add', '.']);
|
||||||
|
repo.run('git', ['commit', '-m', 'add search']);
|
||||||
|
|
||||||
|
copyReviewFiles(dir);
|
||||||
|
});
|
||||||
|
|
||||||
|
afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
|
||||||
|
|
||||||
|
testConcurrentIfSelected('review-army-json-findings', async () => {
|
||||||
|
const result = await runSkillTest({
|
||||||
|
prompt: `You are reviewing a git diff with a SQL injection vulnerability.
|
||||||
|
Read review-specialists/security.md for the security checklist.
|
||||||
|
|
||||||
|
Apply the checklist against this diff (git diff main...HEAD).
|
||||||
|
Output your findings as JSON objects, one per line, following the schema:
|
||||||
|
{"severity":"CRITICAL","confidence":9,"path":"search.rb","line":4,"category":"injection","summary":"SQL injection via string interpolation","fix":"Use parameterized query","fingerprint":"search.rb:4:injection","specialist":"security"}
|
||||||
|
|
||||||
|
Write ONLY JSON findings (no preamble) to ${dir}/findings.json`,
|
||||||
|
workingDirectory: dir,
|
||||||
|
maxTurns: 12,
|
||||||
|
timeout: 90_000,
|
||||||
|
testName: 'review-army-json-findings',
|
||||||
|
runId,
|
||||||
|
});
|
||||||
|
|
||||||
|
logCost('/review army json', result);
|
||||||
|
recordE2E(evalCollector, '/review army JSON findings', 'Review Army', result);
|
||||||
|
expect(result.exitReason).toBe('success');
|
||||||
|
|
||||||
|
const findingsPath = path.join(dir, 'findings.json');
|
||||||
|
if (fs.existsSync(findingsPath)) {
|
||||||
|
const content = fs.readFileSync(findingsPath, 'utf-8').trim();
|
||||||
|
const lines = content.split('\n').filter(l => l.trim());
|
||||||
|
// At least one finding
|
||||||
|
expect(lines.length).toBeGreaterThanOrEqual(1);
|
||||||
|
// Each line should be valid JSON with required fields
|
||||||
|
for (const line of lines) {
|
||||||
|
let parsed: any;
|
||||||
|
try { parsed = JSON.parse(line); } catch { continue; }
|
||||||
|
// Required fields per schema
|
||||||
|
expect(parsed).toHaveProperty('severity');
|
||||||
|
expect(parsed).toHaveProperty('confidence');
|
||||||
|
expect(parsed).toHaveProperty('path');
|
||||||
|
expect(parsed).toHaveProperty('category');
|
||||||
|
expect(parsed).toHaveProperty('summary');
|
||||||
|
expect(parsed).toHaveProperty('specialist');
|
||||||
|
break; // One valid line is enough for the gate test
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}, 120_000);
|
||||||
|
});
|
||||||
|
|
||||||
|
// --- Review Army: Red Team (periodic) ---
|
||||||
|
|
||||||
|
describeIfSelected('Review Army: Red Team', ['review-army-red-team'], () => {
|
||||||
|
let dir: string;
|
||||||
|
|
||||||
|
beforeAll(() => {
|
||||||
|
const repo = setupRepo('army-redteam');
|
||||||
|
dir = repo.dir;
|
||||||
|
|
||||||
|
fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
|
||||||
|
repo.run('git', ['add', '.']);
|
||||||
|
repo.run('git', ['commit', '-m', 'initial']);
|
||||||
|
|
||||||
|
repo.run('git', ['checkout', '-b', 'feature/large-change']);
|
||||||
|
// Create a large diff (300+ lines)
|
||||||
|
const lines: string[] = ['class LargeController'];
|
||||||
|
for (let i = 0; i < 100; i++) {
|
||||||
|
lines.push(` def method_${i}`);
|
||||||
|
lines.push(` data = params[:input_${i}]`);
|
||||||
|
lines.push(` process(data)`);
|
||||||
|
lines.push(' end');
|
||||||
|
lines.push('');
|
||||||
|
}
|
||||||
|
lines.push('end');
|
||||||
|
fs.writeFileSync(path.join(dir, 'large_controller.rb'), lines.join('\n'));
|
||||||
|
repo.run('git', ['add', '.']);
|
||||||
|
repo.run('git', ['commit', '-m', 'add large controller']);
|
||||||
|
|
||||||
|
copyReviewFiles(dir);
|
||||||
|
});
|
||||||
|
|
||||||
|
afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
|
||||||
|
|
||||||
|
testConcurrentIfSelected('review-army-red-team', async () => {
|
||||||
|
const result = await runSkillTest({
|
||||||
|
prompt: `You are reviewing a large diff (300+ lines). Read review-SKILL.md.
|
||||||
|
Skip preamble, lake intro, telemetry.
|
||||||
|
|
||||||
|
The diff is large enough to activate the Red Team specialist.
|
||||||
|
Read review-specialists/red-team.md and apply it against the diff (git diff main...HEAD).
|
||||||
|
Focus on finding issues that other specialists might miss.
|
||||||
|
|
||||||
|
Write your red team findings to ${dir}/review-output.md
|
||||||
|
Start the file with "RED TEAM REVIEW" on the first line.`,
|
||||||
|
workingDirectory: dir,
|
||||||
|
maxTurns: 20,
|
||||||
|
timeout: 180_000,
|
||||||
|
testName: 'review-army-red-team',
|
||||||
|
runId,
|
||||||
|
});
|
||||||
|
|
||||||
|
logCost('/review army red-team', result);
|
||||||
|
recordE2E(evalCollector, '/review army red team', 'Review Army', result);
|
||||||
|
expect(result.exitReason).toBe('success');
|
||||||
|
|
||||||
|
const outputPath = path.join(dir, 'review-output.md');
|
||||||
|
if (fs.existsSync(outputPath)) {
|
||||||
|
const content = fs.readFileSync(outputPath, 'utf-8');
|
||||||
|
expect(content.toLowerCase()).toMatch(/red team|adversarial/);
|
||||||
|
}
|
||||||
|
}, 210_000);
|
||||||
|
});
|
||||||
|
|
||||||
|
// --- Review Army: Consensus (periodic) ---
|
||||||
|
|
||||||
|
describeIfSelected('Review Army: Consensus', ['review-army-consensus'], () => {
|
||||||
|
let dir: string;
|
||||||
|
|
||||||
|
beforeAll(() => {
|
||||||
|
const repo = setupRepo('army-consensus');
|
||||||
|
dir = repo.dir;
|
||||||
|
|
||||||
|
fs.writeFileSync(path.join(dir, 'app.rb'), '# base\n');
|
||||||
|
repo.run('git', ['add', '.']);
|
||||||
|
repo.run('git', ['commit', '-m', 'initial']);
|
||||||
|
|
||||||
|
repo.run('git', ['checkout', '-b', 'feature/vuln-auth']);
|
||||||
|
// SQL injection that both security AND testing specialists should flag
|
||||||
|
fs.writeFileSync(path.join(dir, 'auth_controller.rb'), `class AuthController
|
||||||
|
def login
|
||||||
|
user = User.find_by("email = '#{params[:email]}' AND password = '#{params[:password]}'")
|
||||||
|
if user
|
||||||
|
session[:user_id] = user.id
|
||||||
|
redirect_to root_path
|
||||||
|
else
|
||||||
|
flash[:error] = "Invalid credentials"
|
||||||
|
render :login
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
`);
|
||||||
|
repo.run('git', ['add', '.']);
|
||||||
|
repo.run('git', ['commit', '-m', 'add auth controller']);
|
||||||
|
|
||||||
|
copyReviewFiles(dir);
|
||||||
|
});
|
||||||
|
|
||||||
|
afterAll(() => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} });
|
||||||
|
|
||||||
|
testConcurrentIfSelected('review-army-consensus', async () => {
|
||||||
|
const result = await runSkillTest({
|
||||||
|
prompt: `You are reviewing a git diff with a SQL injection in an auth controller.
|
||||||
|
Read review-SKILL.md, review-checklist.md, and the specialist checklists in review-specialists/.
|
||||||
|
|
||||||
|
This vulnerability should be caught by BOTH the security specialist (injection vector)
|
||||||
|
AND the testing specialist (no test for auth bypass).
|
||||||
|
|
||||||
|
Run the review. In your output, if a finding is flagged by multiple perspectives,
|
||||||
|
mark it as "MULTI-SPECIALIST CONFIRMED" with the confirming categories.
|
||||||
|
|
||||||
|
Write findings to ${dir}/review-output.md`,
|
||||||
|
workingDirectory: dir,
|
||||||
|
maxTurns: 20,
|
||||||
|
timeout: 180_000,
|
||||||
|
testName: 'review-army-consensus',
|
||||||
|
runId,
|
||||||
|
});
|
||||||
|
|
||||||
|
logCost('/review army consensus', result);
|
||||||
|
recordE2E(evalCollector, '/review army consensus', 'Review Army', result);
|
||||||
|
expect(result.exitReason).toBe('success');
|
||||||
|
|
||||||
|
const outputPath = path.join(dir, 'review-output.md');
|
||||||
|
if (fs.existsSync(outputPath)) {
|
||||||
|
const content = fs.readFileSync(outputPath, 'utf-8').toLowerCase();
|
||||||
|
// Should catch the SQL injection
|
||||||
|
const hasSqlFinding =
|
||||||
|
content.includes('sql') ||
|
||||||
|
content.includes('injection') ||
|
||||||
|
content.includes('interpolat');
|
||||||
|
expect(hasSqlFinding).toBe(true);
|
||||||
|
}
|
||||||
|
}, 210_000);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Finalize eval collector
|
||||||
|
afterAll(async () => {
|
||||||
|
await finalizeEvalCollector(evalCollector);
|
||||||
|
});
|
||||||
Loading…
Reference in New Issue