mirror of https://github.com/garrytan/gstack.git
perf: add model pinning infrastructure + rate-limit telemetry to E2E runner
Default E2E model changed from Opus to Sonnet (5x faster, 5x cheaper). Session runner now accepts `model` option with EVALS_MODEL env var override. Added timing telemetry (first_response_ms, max_inter_turn_ms) and wall_clock_ms to eval-store for diagnosing rate-limit impact. Added EVALS_FAST test filtering. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
d68a70d351
commit
d442aadf4a
|
|
@ -30,6 +30,13 @@ export const evalsEnabled = !!process.env.EVALS;
|
||||||
// Set EVALS_ALL=1 to force all tests. Set EVALS_BASE to override base branch.
|
// Set EVALS_ALL=1 to force all tests. Set EVALS_BASE to override base branch.
|
||||||
export let selectedTests: string[] | null = null; // null = run all
|
export let selectedTests: string[] | null = null; // null = run all
|
||||||
|
|
||||||
|
/**
 * Test names skipped when EVALS_FAST is set: the 8 slowest tests (all Opus
 * quality tests), dropped for quick feedback.
 *
 * Typed as a read-only array so this shared module-level list cannot be
 * mutated by accident; `includes` and `length` remain available to readers.
 */
const FAST_EXCLUDED_TESTS: readonly string[] = [
  'plan-ceo-review-selective',
  'plan-ceo-review',
  'retro',
  'retro-base-branch',
  'design-consultation-core',
  'design-consultation-existing',
  'qa-fix-loop',
  'design-review-fix',
];
|
||||||
|
|
||||||
if (evalsEnabled && !process.env.EVALS_ALL) {
|
if (evalsEnabled && !process.env.EVALS_ALL) {
|
||||||
const baseBranch = process.env.EVALS_BASE
|
const baseBranch = process.env.EVALS_BASE
|
||||||
|| detectBaseBranch(ROOT)
|
|| detectBaseBranch(ROOT)
|
||||||
|
|
@ -48,6 +55,17 @@ if (evalsEnabled && !process.env.EVALS_ALL) {
|
||||||
// If changedFiles is empty (e.g., on main branch), selectedTests stays null → run all
|
// If changedFiles is empty (e.g., on main branch), selectedTests stays null → run all
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Apply EVALS_FAST filter after diff-based selection.
// When EVALS_FAST is set, every test named in FAST_EXCLUDED_TESTS is removed
// from the current selection and a one-line summary is written to stderr.
if (evalsEnabled && process.env.EVALS_FAST) {
  // selectedTests === null means "run all", so in that case start from every
  // test name registered in E2E_TOUCHFILES.
  const candidates: string[] = selectedTests ?? Object.keys(E2E_TOUCHFILES);
  const kept = candidates.filter(t => !FAST_EXCLUDED_TESTS.includes(t));
  // Report how many tests were actually removed. The diff-based selection may
  // already have left out some (or all) of the slow tests, so the size of the
  // exclusion list would overstate what this filter did.
  const excludedCount = candidates.length - kept.length;
  selectedTests = kept;
  process.stderr.write(`EVALS_FAST: excluded ${excludedCount} slow tests, running ${kept.length}\n\n`);
}
|
||||||
|
|
||||||
export const describeE2E = evalsEnabled ? describe : describe.skip;
|
export const describeE2E = evalsEnabled ? describe : describe.skip;
|
||||||
|
|
||||||
/** Wrap a describe block to skip entirely if none of its tests are selected. */
|
/** Wrap a describe block to skip entirely if none of its tests are selected. */
|
||||||
|
|
@ -164,6 +182,9 @@ export function recordE2E(
|
||||||
exit_reason: result.exitReason,
|
exit_reason: result.exitReason,
|
||||||
timeout_at_turn: result.exitReason === 'timeout' ? result.costEstimate.turnsUsed : undefined,
|
timeout_at_turn: result.exitReason === 'timeout' ? result.costEstimate.turnsUsed : undefined,
|
||||||
last_tool_call: lastTool,
|
last_tool_call: lastTool,
|
||||||
|
model: result.model,
|
||||||
|
first_response_ms: result.firstResponseMs,
|
||||||
|
max_inter_turn_ms: result.maxInterTurnMs,
|
||||||
...extra,
|
...extra,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -42,6 +42,11 @@ export interface EvalTestEntry {
|
||||||
timeout_at_turn?: number; // which turn was active when timeout hit
|
timeout_at_turn?: number; // which turn was active when timeout hit
|
||||||
last_tool_call?: string; // e.g. "Write(review-output.md)"
|
last_tool_call?: string; // e.g. "Write(review-output.md)"
|
||||||
|
|
||||||
|
// Model + timing diagnostics (added for Sonnet/Opus split)
|
||||||
|
model?: string; // e.g. 'claude-sonnet-4-6' or 'claude-opus-4-6'
|
||||||
|
first_response_ms?: number; // time from spawn to first NDJSON line
|
||||||
|
max_inter_turn_ms?: number; // peak latency between consecutive tool calls
|
||||||
|
|
||||||
// Outcome eval
|
// Outcome eval
|
||||||
detection_rate?: number;
|
detection_rate?: number;
|
||||||
false_positives?: number;
|
false_positives?: number;
|
||||||
|
|
@ -65,6 +70,7 @@ export interface EvalResult {
|
||||||
failed: number;
|
failed: number;
|
||||||
total_cost_usd: number;
|
total_cost_usd: number;
|
||||||
total_duration_ms: number;
|
total_duration_ms: number;
|
||||||
|
wall_clock_ms?: number; // wall-clock from collector creation to finalization (shows parallelism)
|
||||||
tests: EvalTestEntry[];
|
tests: EvalTestEntry[];
|
||||||
_partial?: boolean; // true for incremental saves, absent in final
|
_partial?: boolean; // true for incremental saves, absent in final
|
||||||
}
|
}
|
||||||
|
|
@ -546,6 +552,7 @@ export class EvalCollector {
|
||||||
private tests: EvalTestEntry[] = [];
|
private tests: EvalTestEntry[] = [];
|
||||||
private finalized = false;
|
private finalized = false;
|
||||||
private evalDir: string;
|
private evalDir: string;
|
||||||
|
private createdAt = Date.now();
|
||||||
|
|
||||||
constructor(tier: 'e2e' | 'llm-judge', evalDir?: string) {
|
constructor(tier: 'e2e' | 'llm-judge', evalDir?: string) {
|
||||||
this.tier = tier;
|
this.tier = tier;
|
||||||
|
|
@ -615,6 +622,7 @@ export class EvalCollector {
|
||||||
failed: this.tests.length - passed,
|
failed: this.tests.length - passed,
|
||||||
total_cost_usd: Math.round(totalCost * 100) / 100,
|
total_cost_usd: Math.round(totalCost * 100) / 100,
|
||||||
total_duration_ms: totalDuration,
|
total_duration_ms: totalDuration,
|
||||||
|
wall_clock_ms: Date.now() - this.createdAt,
|
||||||
tests: this.tests,
|
tests: this.tests,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -41,6 +41,12 @@ export interface SkillTestResult {
|
||||||
output: string;
|
output: string;
|
||||||
costEstimate: CostEstimate;
|
costEstimate: CostEstimate;
|
||||||
transcript: any[];
|
transcript: any[];
|
||||||
|
/** Which model was used for this test (added for Sonnet/Opus split diagnostics) */
|
||||||
|
model: string;
|
||||||
|
/** Time from test start to the first observed tool_use item, in ms; stays 0 if no tool call is seen (added for rate-limit diagnostics) */
|
||||||
|
firstResponseMs: number;
|
||||||
|
/** Peak latency between consecutive tool calls, in ms */
|
||||||
|
maxInterTurnMs: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
const BROWSE_ERROR_PATTERNS = [
|
const BROWSE_ERROR_PATTERNS = [
|
||||||
|
|
@ -116,6 +122,8 @@ export async function runSkillTest(options: {
|
||||||
timeout?: number;
|
timeout?: number;
|
||||||
testName?: string;
|
testName?: string;
|
||||||
runId?: string;
|
runId?: string;
|
||||||
|
/** Model to use. Defaults to claude-sonnet-4-6 (overridable via EVALS_MODEL env). */
|
||||||
|
model?: string;
|
||||||
}): Promise<SkillTestResult> {
|
}): Promise<SkillTestResult> {
|
||||||
const {
|
const {
|
||||||
prompt,
|
prompt,
|
||||||
|
|
@ -126,6 +134,7 @@ export async function runSkillTest(options: {
|
||||||
testName,
|
testName,
|
||||||
runId,
|
runId,
|
||||||
} = options;
|
} = options;
|
||||||
|
const model = options.model ?? process.env.EVALS_MODEL ?? 'claude-sonnet-4-6';
|
||||||
|
|
||||||
const startTime = Date.now();
|
const startTime = Date.now();
|
||||||
const startedAt = new Date().toISOString();
|
const startedAt = new Date().toISOString();
|
||||||
|
|
@ -144,6 +153,7 @@ export async function runSkillTest(options: {
|
||||||
// avoid shell escaping issues. --verbose is required for stream-json mode.
|
// avoid shell escaping issues. --verbose is required for stream-json mode.
|
||||||
const args = [
|
const args = [
|
||||||
'-p',
|
'-p',
|
||||||
|
'--model', model,
|
||||||
'--output-format', 'stream-json',
|
'--output-format', 'stream-json',
|
||||||
'--verbose',
|
'--verbose',
|
||||||
'--dangerously-skip-permissions',
|
'--dangerously-skip-permissions',
|
||||||
|
|
@ -175,6 +185,9 @@ export async function runSkillTest(options: {
|
||||||
const collectedLines: string[] = [];
|
const collectedLines: string[] = [];
|
||||||
let liveTurnCount = 0;
|
let liveTurnCount = 0;
|
||||||
let liveToolCount = 0;
|
let liveToolCount = 0;
|
||||||
|
let firstResponseMs = 0;
|
||||||
|
let lastToolTime = 0;
|
||||||
|
let maxInterTurnMs = 0;
|
||||||
const stderrPromise = new Response(proc.stderr).text();
|
const stderrPromise = new Response(proc.stderr).text();
|
||||||
|
|
||||||
const reader = proc.stdout.getReader();
|
const reader = proc.stdout.getReader();
|
||||||
|
|
@ -201,7 +214,15 @@ export async function runSkillTest(options: {
|
||||||
for (const item of content) {
|
for (const item of content) {
|
||||||
if (item.type === 'tool_use') {
|
if (item.type === 'tool_use') {
|
||||||
liveToolCount++;
|
liveToolCount++;
|
||||||
const elapsed = Math.round((Date.now() - startTime) / 1000);
|
const now = Date.now();
|
||||||
|
const elapsed = Math.round((now - startTime) / 1000);
|
||||||
|
// Track timing telemetry
|
||||||
|
if (firstResponseMs === 0) firstResponseMs = now - startTime;
|
||||||
|
if (lastToolTime > 0) {
|
||||||
|
const interTurn = now - lastToolTime;
|
||||||
|
if (interTurn > maxInterTurnMs) maxInterTurnMs = interTurn;
|
||||||
|
}
|
||||||
|
lastToolTime = now;
|
||||||
const progressLine = ` [${elapsed}s] turn ${liveTurnCount} tool #${liveToolCount}: ${item.name}(${truncate(JSON.stringify(item.input || {}), 80)})\n`;
|
const progressLine = ` [${elapsed}s] turn ${liveTurnCount} tool #${liveToolCount}: ${item.name}(${truncate(JSON.stringify(item.input || {}), 80)})\n`;
|
||||||
process.stderr.write(progressLine);
|
process.stderr.write(progressLine);
|
||||||
|
|
||||||
|
|
@ -330,5 +351,5 @@ export async function runSkillTest(options: {
|
||||||
turnsUsed,
|
turnsUsed,
|
||||||
};
|
};
|
||||||
|
|
||||||
return { toolCalls, browseErrors, exitReason, duration, output: resultLine?.result || '', costEstimate, transcript };
|
return { toolCalls, browseErrors, exitReason, duration, output: resultLine?.result || '', costEstimate, transcript, model, firstResponseMs, maxInterTurnMs };
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue