feat: wire costs[] from modelUsage into eval results

Extract per-model token usage from resultLine.modelUsage (including cache tokens and the exact API cost), pass the resulting CostEntry[] through EvalCollector, and aggregate it per model in finalize(). Extend CostEntry with cache_read_input_tokens, cache_creation_input_tokens, and cost_usd. computeCosts() now prefers the exact cost_usd over the MODEL_PRICING estimate when it is available (~4x more accurate with prompt caching).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-15 16:47:27 -05:00 · 2026-03-15 16:47:27 -05:00 · 02925cfc7a
parent 4ad73f7362
commit 02925cfc7a
7 changed files with 170 additions and 7 deletions
--- a/lib/eval-cost.ts
+++ b/lib/eval-cost.ts
@ -55,6 +55,9 @@ function getPricing(model: string): { input: number; output: number } {
 export function computeCosts(costs: CostEntry[]): CostDashboard {
  const byModel = new Map<string, CostSummary>();
  // Track exact cost_usd sums per model (from API-provided costs)
  const exactCosts = new Map<string, number>();
  for (const entry of costs) {
    const existing = byModel.get(entry.model);
    if (existing) {
@ -70,9 +73,12 @@ export function computeCosts(costs: CostEntry[]): CostDashboard {
        estimated_cost_usd: 0,
      });
    }
    if (entry.cost_usd !== undefined) {
      exactCosts.set(entry.model, (exactCosts.get(entry.model) || 0) + entry.cost_usd);
    }
  }
-  // Calculate costs
+  // Calculate costs — prefer exact cost_usd (accounts for cache discounts)
  let total = 0;
  let atFast = 0;
  let atFull = 0;
@ -80,13 +86,18 @@ export function computeCosts(costs: CostEntry[]): CostDashboard {
  const fullPricing = MODEL_PRICING['claude-opus-4-6'] || FALLBACK_PRICING;
  for (const summary of byModel.values()) {
    const exact = exactCosts.get(summary.model);
    if (exact !== undefined) {
      summary.estimated_cost_usd = exact;
    } else {
      const pricing = getPricing(summary.model);
      summary.estimated_cost_usd =
        (summary.input_tokens / 1_000_000) * pricing.input +
        (summary.output_tokens / 1_000_000) * pricing.output;
    }
    total += summary.estimated_cost_usd;
-    // What-if at fast/full tiers
+    // What-if at fast/full tiers (always from token counts)
    atFast +=
      (summary.input_tokens / 1_000_000) * fastPricing.input +
      (summary.output_tokens / 1_000_000) * fastPricing.output;
--- a/lib/eval-format.ts
+++ b/lib/eval-format.ts
@ -15,6 +15,10 @@ export interface CostEntry {
  calls: number;
  input_tokens: number;
  output_tokens: number;
  cache_read_input_tokens?: number;
  cache_creation_input_tokens?: number;
  /** Exact cost from API when available (accounts for cache discounts). */
  cost_usd?: number;
 }
 export interface FailureEntry {
--- a/test/helpers/eval-store.test.ts
+++ b/test/helpers/eval-store.test.ts
@ -128,6 +128,74 @@ describe('EvalCollector', () => {
    expect(data.tests).toHaveLength(0);
    expect(data.tier).toBe('llm-judge');
  });
  // Three tests spread over two models: finalize() must merge the entries that
  // share a model (summing calls, tokens and cost_usd) and keep models separate.
  test('finalize aggregates per-test costs into result-level costs[]', async () => {
    const collector = new EvalCollector('e2e', tmpDir);

    // [test name, model, input tokens, output tokens, exact cost]
    const fixtures: Array<[string, string, number, number, number]> = [
      ['test-a', 'claude-sonnet-4-6', 100, 50, 0.01],
      ['test-b', 'claude-sonnet-4-6', 200, 100, 0.02],
      ['test-c', 'claude-haiku-4-5', 50, 25, 0.005],
    ];
    for (const [name, model, input_tokens, output_tokens, cost_usd] of fixtures) {
      collector.addTest(makeEntry({
        name,
        costs: [{ model, calls: 1, input_tokens, output_tokens, cost_usd }],
      }));
    }

    const resultPath = await collector.finalize();
    const parsed: EvalResult = JSON.parse(fs.readFileSync(resultPath, 'utf-8'));

    expect(parsed.costs).toBeDefined();
    expect(parsed.costs).toHaveLength(2); // two models

    const lookup = (model: string) => parsed.costs!.find(c => c.model === model);
    const sonnet = lookup('claude-sonnet-4-6');
    const haiku = lookup('claude-haiku-4-5');

    // Sonnet appears in two tests, so every numeric field is the sum of both.
    expect(sonnet).toBeDefined();
    expect(sonnet!.calls).toBe(2);
    expect(sonnet!.input_tokens).toBe(300);
    expect(sonnet!.output_tokens).toBe(150);
    expect(sonnet!.cost_usd).toBeCloseTo(0.03);

    // Haiku appears once and passes through unchanged.
    expect(haiku).toBeDefined();
    expect(haiku!.calls).toBe(1);
    expect(haiku!.cost_usd).toBeCloseTo(0.005);
  });
  // With no per-test cost data, the written result must not carry a costs field.
  test('finalize omits costs when no tests have cost data', async () => {
    const collector = new EvalCollector('e2e', tmpDir);
    collector.addTest(makeEntry({ name: 'no-costs' }));

    const written = fs.readFileSync(await collector.finalize(), 'utf-8');
    const { costs }: EvalResult = JSON.parse(written);

    expect(costs).toBeUndefined();
  });
  // Two entries for the same model: the optional cache token counters must be
  // summed just like the regular token counters.
  test('finalize aggregates cache token fields', async () => {
    const collector = new EvalCollector('e2e', tmpDir);

    const first = {
      model: 'claude-sonnet-4-6', calls: 1,
      input_tokens: 10, output_tokens: 50,
      cache_read_input_tokens: 5000, cache_creation_input_tokens: 1000,
      cost_usd: 0.01,
    };
    const second = {
      model: 'claude-sonnet-4-6', calls: 1,
      input_tokens: 20, output_tokens: 100,
      cache_read_input_tokens: 8000, cache_creation_input_tokens: 500,
      cost_usd: 0.02,
    };
    collector.addTest(makeEntry({ name: 'test-a', costs: [first] }));
    collector.addTest(makeEntry({ name: 'test-b', costs: [second] }));

    const resultPath = await collector.finalize();
    const parsed: EvalResult = JSON.parse(fs.readFileSync(resultPath, 'utf-8'));
    const sonnet = parsed.costs!.find(c => c.model === 'claude-sonnet-4-6')!;

    expect(sonnet.cache_read_input_tokens).toBe(13000);    // 5000 + 8000
    expect(sonnet.cache_creation_input_tokens).toBe(1500); // 1000 + 500
  });
 });
 // --- extractToolSummary tests ---
--- a/test/helpers/eval-store.ts
+++ b/test/helpers/eval-store.ts
@ -13,6 +13,7 @@ import * as path from 'path';
 import * as os from 'os';
 import { spawnSync } from 'child_process';
 import { getGitInfo as getGitInfoShared, getVersion as getVersionShared } from '../../lib/util';
 import type { CostEntry } from '../../lib/eval-format';
 const SCHEMA_VERSION = 1;
 const DEFAULT_EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');
@ -50,6 +51,9 @@ export interface EvalTestEntry {
  detected_bugs?: string[];
  missed_bugs?: string[];
  // Per-model cost breakdown
  costs?: CostEntry[];
  error?: string;
 }
@ -67,6 +71,7 @@ export interface EvalResult {
  total_cost_usd: number;
  total_duration_ms: number;
  tests: EvalTestEntry[];
  costs?: CostEntry[];  // aggregate per-model cost breakdown
  _partial?: boolean;  // true for incremental saves, absent in final
 }
@ -414,6 +419,25 @@ export class EvalCollector {
    const totalDuration = this.tests.reduce((s, t) => s + t.duration_ms, 0);
    const passed = this.tests.filter(t => t.passed).length;
    // Aggregate per-model costs across all tests: one CostEntry per model, with
    // calls, token counts and cache token counts summed over every contribution.
    const costMap = new Map<string, CostEntry>();
    // Models for which at least one contribution has no cost_usd.
    const missingExactCost = new Set<string>();
    for (const t of this.tests) {
      for (const c of t.costs || []) {
        if (c.cost_usd === undefined) missingExactCost.add(c.model);

        const existing = costMap.get(c.model);
        if (!existing) {
          // Copy so later accumulation never mutates the per-test entry.
          costMap.set(c.model, { ...c });
          continue;
        }
        existing.calls += c.calls;
        existing.input_tokens += c.input_tokens;
        existing.output_tokens += c.output_tokens;
        // Cache counters are optional on CostEntry; a missing value counts as 0.
        existing.cache_read_input_tokens = (existing.cache_read_input_tokens || 0) + (c.cache_read_input_tokens || 0);
        existing.cache_creation_input_tokens = (existing.cache_creation_input_tokens || 0) + (c.cache_creation_input_tokens || 0);
        if (c.cost_usd !== undefined) existing.cost_usd = (existing.cost_usd || 0) + c.cost_usd;
      }
    }
    // cost_usd is meant to be an exact figure. If any contribution for a model
    // lacked it, the running sum covers only part of that model's usage, so
    // remove it rather than let readers take a partial sum for the full cost.
    for (const model of missingExactCost) {
      const entry = costMap.get(model);
      if (entry) delete entry.cost_usd;
    }
    // Leave the field undefined (rather than an empty array) when no test
    // reported costs.
    const costs = costMap.size > 0 ? [...costMap.values()] : undefined;
    const result: EvalResult = {
      schema_version: SCHEMA_VERSION,
      version,
@ -428,6 +452,7 @@ export class EvalCollector {
      total_cost_usd: Math.round(totalCost * 100) / 100,
      total_duration_ms: totalDuration,
      tests: this.tests,
      costs,
    };
    // Write eval file
--- a/test/helpers/session-runner.test.ts
+++ b/test/helpers/session-runner.test.ts
@ -93,4 +93,36 @@ describe('parseNDJSON', () => {
    expect(parsed.turnCount).toBe(2);
    expect(parsed.toolCalls).toHaveLength(0);
  });
  // parseNDJSON must hand back the result event untouched, including the
  // per-model modelUsage record that cost extraction later reads.
  test('resultLine preserves modelUsage for cost extraction', () => {
    const sonnetUsage = {
      inputTokens: 8, outputTokens: 802,
      cacheReadInputTokens: 88133, cacheCreationInputTokens: 9223,
      costUSD: 0.07308,
    };
    const resultEvent = {
      type: 'result', subtype: 'success', total_cost_usd: 0.07,
      num_turns: 1, result: 'Done.',
      usage: { input_tokens: 8, output_tokens: 802 },
      modelUsage: { 'claude-sonnet-4-6': sonnetUsage },
    };

    const { resultLine } = parseNDJSON([
      '{"type":"assistant","message":{"model":"claude-sonnet-4-6","content":[{"type":"text","text":"ok"}]}}',
      JSON.stringify(resultEvent),
    ]);

    expect(resultLine).not.toBeNull();
    expect(resultLine.modelUsage).toBeDefined();

    const reported = resultLine.modelUsage['claude-sonnet-4-6'];
    expect(reported.inputTokens).toBe(8);
    expect(reported.outputTokens).toBe(802);
    expect(reported.cacheReadInputTokens).toBe(88133);
    expect(reported.costUSD).toBeCloseTo(0.07308);
  });
  // The shared fixture's result line has no modelUsage, so the parsed
  // resultLine must not report one.
  test('resultLine without modelUsage has undefined modelUsage', () => {
    const { resultLine } = parseNDJSON(FIXTURE_LINES);
    expect(resultLine?.modelUsage).toBeUndefined();
  });
 });
--- a/test/helpers/session-runner.ts
+++ b/test/helpers/session-runner.ts
@ -10,6 +10,8 @@ import * as fs from 'fs';
 import * as path from 'path';
 import * as os from 'os';
 import { atomicWriteSync, sanitizeForFilename, GSTACK_DEV_DIR } from '../../lib/util';
 import type { CostEntry } from '../../lib/eval-format';
 import { resolveTier, tierToModel } from '../../lib/eval-tier';
 const HEARTBEAT_PATH = path.join(GSTACK_DEV_DIR, 'e2e-live.json');
@ -34,6 +36,7 @@ export interface SkillTestResult {
  output: string;
  costEstimate: CostEstimate;
  transcript: any[];
  costs: CostEntry[];
 }
 const BROWSE_ERROR_PATTERNS = [
@ -135,8 +138,11 @@ export async function runSkillTest(options: {
  // Spawn claude -p with streaming NDJSON output. Prompt piped via stdin to
  // avoid shell escaping issues. --verbose is required for stream-json mode.
  // Model pinned via EVAL_TIER env var (default: sonnet).
  const evalModel = tierToModel(resolveTier());
  const args = [
    '-p',
    '--model', evalModel,
    '--output-format', 'stream-json',
    '--verbose',
    '--dangerously-skip-permissions',
@ -323,5 +329,21 @@ export async function runSkillTest(options: {
    turnsUsed,
  };
-  return { toolCalls, browseErrors, exitReason, duration, output: resultLine?.result || '', costEstimate, transcript };
+  // Extract per-model costs from resultLine.modelUsage (camelCase → snake_case)
  const costs: CostEntry[] = [];
  // Token counts must be finite numbers; anything else (missing, null, string)
  // is recorded as 0.
  const toCount = (value: unknown): number =>
    typeof value === 'number' && Number.isFinite(value) ? value : 0;
  if (resultLine?.modelUsage) {
    for (const [model, raw] of Object.entries(resultLine.modelUsage as Record<string, unknown>)) {
      // A non-object entry (e.g. JSON null) carries no usage data; skip it
      // instead of throwing on property access.
      if (typeof raw !== 'object' || raw === null) continue;
      const usage = raw as Record<string, unknown>;
      const exactCost = usage.costUSD;
      costs.push({
        model,
        // Each modelUsage record is counted as a single call for its model.
        calls: 1,
        input_tokens: toCount(usage.inputTokens),
        output_tokens: toCount(usage.outputTokens),
        cache_read_input_tokens: toCount(usage.cacheReadInputTokens),
        cache_creation_input_tokens: toCount(usage.cacheCreationInputTokens),
        // Forward only a real number as the exact cost. A null would pass a
        // `!== undefined` check and be summed as an exact cost of 0.
        cost_usd: typeof exactCost === 'number' && Number.isFinite(exactCost) ? exactCost : undefined,
      });
    }
  }
  return { toolCalls, browseErrors, exitReason, duration, output: resultLine?.result || '', costEstimate, transcript, costs };
 }
--- a/test/skill-e2e.test.ts
+++ b/test/skill-e2e.test.ts
@ -41,6 +41,7 @@ function recordE2E(name: string, suite: string, result: SkillTestResult, extra?:
    exit_reason: result.exitReason,
    timeout_at_turn: result.exitReason === 'timeout' ? result.costEstimate.turnsUsed : undefined,
    last_tool_call: lastTool,
    costs: result.costs,
    ...extra,
  });
 }