mirror of https://github.com/garrytan/gstack.git
feat: wire costs[] from modelUsage into eval results
Extract per-model token usage from resultLine.modelUsage (including cache tokens and the exact API cost), pass the resulting CostEntry[] through EvalCollector, and aggregate it per model in finalize(). Extend CostEntry with cache_read_input_tokens, cache_creation_input_tokens, and cost_usd. computeCosts() now prefers the exact cost_usd over the MODEL_PRICING estimate when it is available (~4x more accurate with prompt caching). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
4ad73f7362
commit
02925cfc7a
|
|
@ -55,6 +55,9 @@ function getPricing(model: string): { input: number; output: number } {
|
||||||
export function computeCosts(costs: CostEntry[]): CostDashboard {
|
export function computeCosts(costs: CostEntry[]): CostDashboard {
|
||||||
const byModel = new Map<string, CostSummary>();
|
const byModel = new Map<string, CostSummary>();
|
||||||
|
|
||||||
|
// Track exact cost_usd sums per model (from API-provided costs)
|
||||||
|
const exactCosts = new Map<string, number>();
|
||||||
|
|
||||||
for (const entry of costs) {
|
for (const entry of costs) {
|
||||||
const existing = byModel.get(entry.model);
|
const existing = byModel.get(entry.model);
|
||||||
if (existing) {
|
if (existing) {
|
||||||
|
|
@ -70,9 +73,12 @@ export function computeCosts(costs: CostEntry[]): CostDashboard {
|
||||||
estimated_cost_usd: 0,
|
estimated_cost_usd: 0,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
if (entry.cost_usd !== undefined) {
|
||||||
|
exactCosts.set(entry.model, (exactCosts.get(entry.model) || 0) + entry.cost_usd);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Calculate costs
|
// Calculate costs — prefer exact cost_usd (accounts for cache discounts)
|
||||||
let total = 0;
|
let total = 0;
|
||||||
let atFast = 0;
|
let atFast = 0;
|
||||||
let atFull = 0;
|
let atFull = 0;
|
||||||
|
|
@ -80,13 +86,18 @@ export function computeCosts(costs: CostEntry[]): CostDashboard {
|
||||||
const fullPricing = MODEL_PRICING['claude-opus-4-6'] || FALLBACK_PRICING;
|
const fullPricing = MODEL_PRICING['claude-opus-4-6'] || FALLBACK_PRICING;
|
||||||
|
|
||||||
for (const summary of byModel.values()) {
|
for (const summary of byModel.values()) {
|
||||||
|
const exact = exactCosts.get(summary.model);
|
||||||
|
if (exact !== undefined) {
|
||||||
|
summary.estimated_cost_usd = exact;
|
||||||
|
} else {
|
||||||
const pricing = getPricing(summary.model);
|
const pricing = getPricing(summary.model);
|
||||||
summary.estimated_cost_usd =
|
summary.estimated_cost_usd =
|
||||||
(summary.input_tokens / 1_000_000) * pricing.input +
|
(summary.input_tokens / 1_000_000) * pricing.input +
|
||||||
(summary.output_tokens / 1_000_000) * pricing.output;
|
(summary.output_tokens / 1_000_000) * pricing.output;
|
||||||
|
}
|
||||||
total += summary.estimated_cost_usd;
|
total += summary.estimated_cost_usd;
|
||||||
|
|
||||||
// What-if at fast/full tiers
|
// What-if at fast/full tiers (always from token counts)
|
||||||
atFast +=
|
atFast +=
|
||||||
(summary.input_tokens / 1_000_000) * fastPricing.input +
|
(summary.input_tokens / 1_000_000) * fastPricing.input +
|
||||||
(summary.output_tokens / 1_000_000) * fastPricing.output;
|
(summary.output_tokens / 1_000_000) * fastPricing.output;
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,10 @@ export interface CostEntry {
|
||||||
calls: number;
|
calls: number;
|
||||||
input_tokens: number;
|
input_tokens: number;
|
||||||
output_tokens: number;
|
output_tokens: number;
|
||||||
|
cache_read_input_tokens?: number;
|
||||||
|
cache_creation_input_tokens?: number;
|
||||||
|
/** Exact cost from API when available (accounts for cache discounts). */
|
||||||
|
cost_usd?: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface FailureEntry {
|
export interface FailureEntry {
|
||||||
|
|
|
||||||
|
|
@ -128,6 +128,74 @@ describe('EvalCollector', () => {
|
||||||
expect(data.tests).toHaveLength(0);
|
expect(data.tests).toHaveLength(0);
|
||||||
expect(data.tier).toBe('llm-judge');
|
expect(data.tier).toBe('llm-judge');
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test('finalize aggregates per-test costs into result-level costs[]', async () => {
|
||||||
|
const collector = new EvalCollector('e2e', tmpDir);
|
||||||
|
collector.addTest(makeEntry({
|
||||||
|
name: 'test-a',
|
||||||
|
costs: [{ model: 'claude-sonnet-4-6', calls: 1, input_tokens: 100, output_tokens: 50, cost_usd: 0.01 }],
|
||||||
|
}));
|
||||||
|
collector.addTest(makeEntry({
|
||||||
|
name: 'test-b',
|
||||||
|
costs: [{ model: 'claude-sonnet-4-6', calls: 1, input_tokens: 200, output_tokens: 100, cost_usd: 0.02 }],
|
||||||
|
}));
|
||||||
|
collector.addTest(makeEntry({
|
||||||
|
name: 'test-c',
|
||||||
|
costs: [{ model: 'claude-haiku-4-5', calls: 1, input_tokens: 50, output_tokens: 25, cost_usd: 0.005 }],
|
||||||
|
}));
|
||||||
|
|
||||||
|
const filepath = await collector.finalize();
|
||||||
|
const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8'));
|
||||||
|
|
||||||
|
expect(data.costs).toBeDefined();
|
||||||
|
expect(data.costs).toHaveLength(2); // two models
|
||||||
|
const sonnet = data.costs!.find(c => c.model === 'claude-sonnet-4-6');
|
||||||
|
const haiku = data.costs!.find(c => c.model === 'claude-haiku-4-5');
|
||||||
|
expect(sonnet).toBeDefined();
|
||||||
|
expect(sonnet!.calls).toBe(2);
|
||||||
|
expect(sonnet!.input_tokens).toBe(300);
|
||||||
|
expect(sonnet!.output_tokens).toBe(150);
|
||||||
|
expect(sonnet!.cost_usd).toBeCloseTo(0.03);
|
||||||
|
expect(haiku).toBeDefined();
|
||||||
|
expect(haiku!.calls).toBe(1);
|
||||||
|
expect(haiku!.cost_usd).toBeCloseTo(0.005);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('finalize omits costs when no tests have cost data', async () => {
|
||||||
|
const collector = new EvalCollector('e2e', tmpDir);
|
||||||
|
collector.addTest(makeEntry({ name: 'no-costs' }));
|
||||||
|
const filepath = await collector.finalize();
|
||||||
|
const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8'));
|
||||||
|
expect(data.costs).toBeUndefined();
|
||||||
|
});
|
||||||
|
|
||||||
|
test('finalize aggregates cache token fields', async () => {
|
||||||
|
const collector = new EvalCollector('e2e', tmpDir);
|
||||||
|
collector.addTest(makeEntry({
|
||||||
|
name: 'test-a',
|
||||||
|
costs: [{
|
||||||
|
model: 'claude-sonnet-4-6', calls: 1,
|
||||||
|
input_tokens: 10, output_tokens: 50,
|
||||||
|
cache_read_input_tokens: 5000, cache_creation_input_tokens: 1000,
|
||||||
|
cost_usd: 0.01,
|
||||||
|
}],
|
||||||
|
}));
|
||||||
|
collector.addTest(makeEntry({
|
||||||
|
name: 'test-b',
|
||||||
|
costs: [{
|
||||||
|
model: 'claude-sonnet-4-6', calls: 1,
|
||||||
|
input_tokens: 20, output_tokens: 100,
|
||||||
|
cache_read_input_tokens: 8000, cache_creation_input_tokens: 500,
|
||||||
|
cost_usd: 0.02,
|
||||||
|
}],
|
||||||
|
}));
|
||||||
|
|
||||||
|
const filepath = await collector.finalize();
|
||||||
|
const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8'));
|
||||||
|
const sonnet = data.costs!.find(c => c.model === 'claude-sonnet-4-6')!;
|
||||||
|
expect(sonnet.cache_read_input_tokens).toBe(13000);
|
||||||
|
expect(sonnet.cache_creation_input_tokens).toBe(1500);
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
// --- extractToolSummary tests ---
|
// --- extractToolSummary tests ---
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,7 @@ import * as path from 'path';
|
||||||
import * as os from 'os';
|
import * as os from 'os';
|
||||||
import { spawnSync } from 'child_process';
|
import { spawnSync } from 'child_process';
|
||||||
import { getGitInfo as getGitInfoShared, getVersion as getVersionShared } from '../../lib/util';
|
import { getGitInfo as getGitInfoShared, getVersion as getVersionShared } from '../../lib/util';
|
||||||
|
import type { CostEntry } from '../../lib/eval-format';
|
||||||
|
|
||||||
const SCHEMA_VERSION = 1;
|
const SCHEMA_VERSION = 1;
|
||||||
const DEFAULT_EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');
|
const DEFAULT_EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');
|
||||||
|
|
@ -50,6 +51,9 @@ export interface EvalTestEntry {
|
||||||
detected_bugs?: string[];
|
detected_bugs?: string[];
|
||||||
missed_bugs?: string[];
|
missed_bugs?: string[];
|
||||||
|
|
||||||
|
// Per-model cost breakdown
|
||||||
|
costs?: CostEntry[];
|
||||||
|
|
||||||
error?: string;
|
error?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -67,6 +71,7 @@ export interface EvalResult {
|
||||||
total_cost_usd: number;
|
total_cost_usd: number;
|
||||||
total_duration_ms: number;
|
total_duration_ms: number;
|
||||||
tests: EvalTestEntry[];
|
tests: EvalTestEntry[];
|
||||||
|
costs?: CostEntry[]; // aggregate per-model cost breakdown
|
||||||
_partial?: boolean; // true for incremental saves, absent in final
|
_partial?: boolean; // true for incremental saves, absent in final
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -414,6 +419,25 @@ export class EvalCollector {
|
||||||
const totalDuration = this.tests.reduce((s, t) => s + t.duration_ms, 0);
|
const totalDuration = this.tests.reduce((s, t) => s + t.duration_ms, 0);
|
||||||
const passed = this.tests.filter(t => t.passed).length;
|
const passed = this.tests.filter(t => t.passed).length;
|
||||||
|
|
||||||
|
// Aggregate per-model costs across all tests
|
||||||
|
const costMap = new Map<string, CostEntry>();
|
||||||
|
for (const t of this.tests) {
|
||||||
|
for (const c of t.costs || []) {
|
||||||
|
const existing = costMap.get(c.model);
|
||||||
|
if (existing) {
|
||||||
|
existing.calls += c.calls;
|
||||||
|
existing.input_tokens += c.input_tokens;
|
||||||
|
existing.output_tokens += c.output_tokens;
|
||||||
|
existing.cache_read_input_tokens = (existing.cache_read_input_tokens || 0) + (c.cache_read_input_tokens || 0);
|
||||||
|
existing.cache_creation_input_tokens = (existing.cache_creation_input_tokens || 0) + (c.cache_creation_input_tokens || 0);
|
||||||
|
if (c.cost_usd !== undefined) existing.cost_usd = (existing.cost_usd || 0) + c.cost_usd;
|
||||||
|
} else {
|
||||||
|
costMap.set(c.model, { ...c });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const costs = costMap.size > 0 ? [...costMap.values()] : undefined;
|
||||||
|
|
||||||
const result: EvalResult = {
|
const result: EvalResult = {
|
||||||
schema_version: SCHEMA_VERSION,
|
schema_version: SCHEMA_VERSION,
|
||||||
version,
|
version,
|
||||||
|
|
@ -428,6 +452,7 @@ export class EvalCollector {
|
||||||
total_cost_usd: Math.round(totalCost * 100) / 100,
|
total_cost_usd: Math.round(totalCost * 100) / 100,
|
||||||
total_duration_ms: totalDuration,
|
total_duration_ms: totalDuration,
|
||||||
tests: this.tests,
|
tests: this.tests,
|
||||||
|
costs,
|
||||||
};
|
};
|
||||||
|
|
||||||
// Write eval file
|
// Write eval file
|
||||||
|
|
|
||||||
|
|
@ -93,4 +93,36 @@ describe('parseNDJSON', () => {
|
||||||
expect(parsed.turnCount).toBe(2);
|
expect(parsed.turnCount).toBe(2);
|
||||||
expect(parsed.toolCalls).toHaveLength(0);
|
expect(parsed.toolCalls).toHaveLength(0);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test('resultLine preserves modelUsage for cost extraction', () => {
|
||||||
|
const lines = [
|
||||||
|
'{"type":"assistant","message":{"model":"claude-sonnet-4-6","content":[{"type":"text","text":"ok"}]}}',
|
||||||
|
JSON.stringify({
|
||||||
|
type: 'result', subtype: 'success', total_cost_usd: 0.07,
|
||||||
|
num_turns: 1, result: 'Done.',
|
||||||
|
usage: { input_tokens: 8, output_tokens: 802 },
|
||||||
|
modelUsage: {
|
||||||
|
'claude-sonnet-4-6': {
|
||||||
|
inputTokens: 8, outputTokens: 802,
|
||||||
|
cacheReadInputTokens: 88133, cacheCreationInputTokens: 9223,
|
||||||
|
costUSD: 0.07308,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}),
|
||||||
|
];
|
||||||
|
const parsed = parseNDJSON(lines);
|
||||||
|
expect(parsed.resultLine).not.toBeNull();
|
||||||
|
expect(parsed.resultLine.modelUsage).toBeDefined();
|
||||||
|
const usage = parsed.resultLine.modelUsage['claude-sonnet-4-6'];
|
||||||
|
expect(usage.inputTokens).toBe(8);
|
||||||
|
expect(usage.outputTokens).toBe(802);
|
||||||
|
expect(usage.cacheReadInputTokens).toBe(88133);
|
||||||
|
expect(usage.costUSD).toBeCloseTo(0.07308);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('resultLine without modelUsage has undefined modelUsage', () => {
|
||||||
|
const parsed = parseNDJSON(FIXTURE_LINES);
|
||||||
|
// Original fixture has no modelUsage on result line
|
||||||
|
expect(parsed.resultLine?.modelUsage).toBeUndefined();
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,8 @@ import * as fs from 'fs';
|
||||||
import * as path from 'path';
|
import * as path from 'path';
|
||||||
import * as os from 'os';
|
import * as os from 'os';
|
||||||
import { atomicWriteSync, sanitizeForFilename, GSTACK_DEV_DIR } from '../../lib/util';
|
import { atomicWriteSync, sanitizeForFilename, GSTACK_DEV_DIR } from '../../lib/util';
|
||||||
|
import type { CostEntry } from '../../lib/eval-format';
|
||||||
|
import { resolveTier, tierToModel } from '../../lib/eval-tier';
|
||||||
|
|
||||||
const HEARTBEAT_PATH = path.join(GSTACK_DEV_DIR, 'e2e-live.json');
|
const HEARTBEAT_PATH = path.join(GSTACK_DEV_DIR, 'e2e-live.json');
|
||||||
|
|
||||||
|
|
@ -34,6 +36,7 @@ export interface SkillTestResult {
|
||||||
output: string;
|
output: string;
|
||||||
costEstimate: CostEstimate;
|
costEstimate: CostEstimate;
|
||||||
transcript: any[];
|
transcript: any[];
|
||||||
|
costs: CostEntry[];
|
||||||
}
|
}
|
||||||
|
|
||||||
const BROWSE_ERROR_PATTERNS = [
|
const BROWSE_ERROR_PATTERNS = [
|
||||||
|
|
@ -135,8 +138,11 @@ export async function runSkillTest(options: {
|
||||||
|
|
||||||
// Spawn claude -p with streaming NDJSON output. Prompt piped via stdin to
|
// Spawn claude -p with streaming NDJSON output. Prompt piped via stdin to
|
||||||
// avoid shell escaping issues. --verbose is required for stream-json mode.
|
// avoid shell escaping issues. --verbose is required for stream-json mode.
|
||||||
|
// Model pinned via EVAL_TIER env var (default: sonnet).
|
||||||
|
const evalModel = tierToModel(resolveTier());
|
||||||
const args = [
|
const args = [
|
||||||
'-p',
|
'-p',
|
||||||
|
'--model', evalModel,
|
||||||
'--output-format', 'stream-json',
|
'--output-format', 'stream-json',
|
||||||
'--verbose',
|
'--verbose',
|
||||||
'--dangerously-skip-permissions',
|
'--dangerously-skip-permissions',
|
||||||
|
|
@ -323,5 +329,21 @@ export async function runSkillTest(options: {
|
||||||
turnsUsed,
|
turnsUsed,
|
||||||
};
|
};
|
||||||
|
|
||||||
return { toolCalls, browseErrors, exitReason, duration, output: resultLine?.result || '', costEstimate, transcript };
|
// Extract per-model costs from resultLine.modelUsage (camelCase → snake_case)
|
||||||
|
const costs: CostEntry[] = [];
|
||||||
|
if (resultLine?.modelUsage) {
|
||||||
|
for (const [model, usage] of Object.entries(resultLine.modelUsage as Record<string, any>)) {
|
||||||
|
costs.push({
|
||||||
|
model,
|
||||||
|
calls: 1,
|
||||||
|
input_tokens: usage.inputTokens || 0,
|
||||||
|
output_tokens: usage.outputTokens || 0,
|
||||||
|
cache_read_input_tokens: usage.cacheReadInputTokens || 0,
|
||||||
|
cache_creation_input_tokens: usage.cacheCreationInputTokens || 0,
|
||||||
|
cost_usd: usage.costUSD,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return { toolCalls, browseErrors, exitReason, duration, output: resultLine?.result || '', costEstimate, transcript, costs };
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -41,6 +41,7 @@ function recordE2E(name: string, suite: string, result: SkillTestResult, extra?:
|
||||||
exit_reason: result.exitReason,
|
exit_reason: result.exitReason,
|
||||||
timeout_at_turn: result.exitReason === 'timeout' ? result.costEstimate.turnsUsed : undefined,
|
timeout_at_turn: result.exitReason === 'timeout' ? result.costEstimate.turnsUsed : undefined,
|
||||||
last_tool_call: lastTool,
|
last_tool_call: lastTool,
|
||||||
|
costs: result.costs,
|
||||||
...extra,
|
...extra,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue