diff --git a/packages/cli/src/commands/variance-probe.test.ts b/packages/cli/src/commands/variance-probe.test.ts index 2cbf9ed..301f258 100644 --- a/packages/cli/src/commands/variance-probe.test.ts +++ b/packages/cli/src/commands/variance-probe.test.ts @@ -28,7 +28,12 @@ class FakeRunner implements AgentRunner { return Promise.resolve({ finalText: request.withPack ? "stable" : `wander-${this.i}`, diff: "", - tokens: { inputTokens: request.withPack ? 110 : 100, outputTokens: 10, costUsd: null }, + tokens: { + inputTokens: request.withPack ? 110 : 100, + outputTokens: 10, + cacheTokens: 0, + costUsd: null, + }, errored: false, }); } @@ -88,7 +93,7 @@ describe("runVarianceProbe (seamed)", () => { return Promise.resolve({ finalText: req.withPack ? "s" : `${Math.random()}`, diff: "", - tokens: { inputTokens: 1, outputTokens: 1, costUsd: null }, + tokens: { inputTokens: 1, outputTokens: 1, cacheTokens: 0, costUsd: null }, errored: false, }); }, @@ -102,6 +107,32 @@ describe("runVarianceProbe (seamed)", () => { }); assert.equal(withPackPrompts, 2, "both with-pack runs saw the assembled context"); }); + + it("builds a per-harness runner for each agent in the default set (Bug-2 routing)", async () => { + // With no --harness pin, the probe visits both agents; the default factory + // maps args.models[harness] to each. We assert the factory is invoked once + // per harness so a per-harness model would reach the right runner. + const seen: string[] = []; + await runVarianceProbe({ + taskFile, + runs: 1, + _assemblePackContext: async () => "PACK", + _runnerFor: (h) => { + seen.push(h); + return { + name: `fake:${h}`, + run: () => + Promise.resolve({ + finalText: "x", + diff: "", + tokens: { inputTokens: 1, outputTokens: 1, cacheTokens: 0, costUsd: null }, + errored: false, + }), + }; + }, + }); + assert.deepEqual([...seen].sort(), ["claude", "codex"], "one runner built per harness"); + }); }); describe("assemblePackContext", () => { diff --git a/packages/cli/src/commands/variance-probe.ts b/packages/cli/src/commands/variance-probe.ts index e439e78..b04305d 100644 --- a/packages/cli/src/commands/variance-probe.ts +++ b/packages/cli/src/commands/variance-probe.ts @@ -43,8 +43,14 @@ export interface VarianceProbeArgs { readonly harness?: Harness; /** AWS region for Bedrock inference; falls back to the inherited env. */ readonly awsRegion?: string; - /** Bedrock model / inference profile override (applies to every harness). */ - readonly model?: string; + /** + * Per-harness Bedrock model / inference-profile override. Claude and Codex + * take different model ids (a `us.`-prefixed Anthropic profile vs an + * `openai.*` Bedrock model), so one global value cannot serve both — each + * harness reads its own entry, falling back to the runner's per-harness + * default when absent. + */ + readonly models?: Partial>; /** * Test seam — inject a fake pack-context assembler so unit tests don't need a * real analyzed repo + pack on disk. @@ -96,12 +102,14 @@ export async function runVarianceProbe(args: VarianceProbeArgs): Promise - new CliAgentRunner({ + ((harness: Harness) => { + const model = args.models?.[harness]; + return new CliAgentRunner({ harness, - ...(args.model !== undefined ? { model: args.model } : {}), + ...(model !== undefined ? { model } : {}), ...(args.awsRegion !== undefined ? { awsRegion: args.awsRegion } : {}), - })); + }); + }); const options: ProbeOptions = { packContext, diff --git a/packages/cli/src/index.ts b/packages/cli/src/index.ts index 418b611..dab89fe 100644 --- a/packages/cli/src/index.ts +++ b/packages/cli/src/index.ts @@ -395,8 +395,13 @@ program "With --variance-probe: AWS region for Bedrock inference (default: inherited AWS_REGION)", ) .option( - "--model ", - "With --variance-probe: Bedrock model / inference-profile id (default per harness)", + "--model-claude ", + "With --variance-probe: Claude Code Bedrock model / inference-profile id " + + "(us.-prefixed; default us.anthropic.claude-sonnet-4-6)", + ) + .option( + "--model-codex ", + "With --variance-probe: Codex Bedrock model id (default openai.gpt-5.5)", ) .action(async (path: string | undefined, opts: Record) => { // --variance-probe short-circuits the normal pack path: it loads a task, @@ -408,12 +413,17 @@ program typeof opts["runs"] === "number" && Number.isFinite(opts["runs"]) ? opts["runs"] : undefined; + // Per-harness model overrides — Claude and Codex take different Bedrock + // ids, so they are separate flags rather than one global --model. + const models: Record = {}; + if (typeof opts["modelClaude"] === "string") models["claude"] = opts["modelClaude"]; + if (typeof opts["modelCodex"] === "string") models["codex"] = opts["modelCodex"]; const report = await probeMod.runVarianceProbe({ taskFile: opts["varianceProbe"], ...(runs !== undefined ? { runs } : {}), ...(harness !== undefined ? { harness } : {}), ...(typeof opts["awsRegion"] === "string" ? { awsRegion: opts["awsRegion"] } : {}), - ...(typeof opts["model"] === "string" ? { model: opts["model"] } : {}), + ...(Object.keys(models).length > 0 ? { models } : {}), }); probeMod.printVarianceReport(report, opts["json"] === true); return; diff --git a/packages/eval/src/cli-runner.test.ts b/packages/eval/src/cli-runner.test.ts index 08e744e..c32ca37 100644 --- a/packages/eval/src/cli-runner.test.ts +++ b/packages/eval/src/cli-runner.test.ts @@ -89,6 +89,15 @@ describe("buildArgv", () => { "PROMPT", ]); }); + + it("honors a per-harness model override (Bug-2 fix: codex gets a codex model)", () => { + // The bug was that one global --model handed Claude's id to Codex. Each + // harness must carry its own model into the argv. + const claude = buildArgv({ harness: "claude", model: "us.anthropic.claude-opus-4-8" }, "P"); + assert.equal(claude.argv[claude.argv.indexOf("--model") + 1], "us.anthropic.claude-opus-4-8"); + const codex = buildArgv({ harness: "codex", model: "openai.gpt-5.4" }, "P"); + assert.equal(codex.argv[codex.argv.indexOf("-m") + 1], "openai.gpt-5.4"); + }); }); describe("composePrompt", () => { @@ -114,24 +123,32 @@ describe("composePrompt", () => { }); describe("parseClaudeOutput", () => { - it("extracts result text + usage + cost from the JSON result object", () => { + it("extracts result text + usage + cache + cost from the JSON result object", () => { const stdout = JSON.stringify({ type: "result", subtype: "success", result: "Done — added the flag.", - usage: { input_tokens: 1234, output_tokens: 56 }, + usage: { + input_tokens: 1234, + output_tokens: 56, + cache_creation_input_tokens: 27406, + cache_read_input_tokens: 100, + }, total_cost_usd: 0.0123, }); const { finalText, tokens } = parseClaudeOutput(stdout); assert.equal(finalText, "Done — added the flag."); assert.equal(tokens.inputTokens, 1234); assert.equal(tokens.outputTokens, 56); + // cache = creation 27406 + read 100 (the Bug-1 fix — was silently dropped). + assert.equal(tokens.cacheTokens, 27506); assert.equal(tokens.costUsd, 0.0123); }); it("tolerates a missing usage block (zeros, null cost)", () => { const { tokens, finalText } = parseClaudeOutput(JSON.stringify({ result: "x" })); assert.equal(finalText, "x"); assert.equal(tokens.inputTokens, 0); + assert.equal(tokens.cacheTokens, 0); assert.equal(tokens.costUsd, null); }); it("throws on unparseable stdout", () => { @@ -168,6 +185,7 @@ describe("parseCodexOutput", () => { assert.equal(finalText, "Repo summary here."); assert.equal(tokens.inputTokens, 24763); assert.equal(tokens.outputTokens, 122); + assert.equal(tokens.cacheTokens, 24448, "codex cached_input_tokens captured (Bug-1 fix)"); assert.equal(tokens.costUsd, null, "codex exposes no per-invocation USD cost"); }); diff --git a/packages/eval/src/cli-runner.ts b/packages/eval/src/cli-runner.ts index fbbf0cf..6765f29 100644 --- a/packages/eval/src/cli-runner.ts +++ b/packages/eval/src/cli-runner.ts @@ -152,14 +152,23 @@ export function buildArgv( export function parseClaudeOutput(stdout: string): { finalText: string; tokens: RunTokens } { const doc = JSON.parse(stdout) as { result?: unknown; - usage?: { input_tokens?: unknown; output_tokens?: unknown }; + usage?: { + input_tokens?: unknown; + output_tokens?: unknown; + cache_creation_input_tokens?: unknown; + cache_read_input_tokens?: unknown; + }; total_cost_usd?: unknown; }; const finalText = typeof doc.result === "string" ? doc.result : ""; const inputTokens = num(doc.usage?.input_tokens); const outputTokens = num(doc.usage?.output_tokens); + // Claude Code injects a large cached system prompt per call; both the + // creation and read halves are real token cost the overhead headline needs. + const cacheTokens = + num(doc.usage?.cache_creation_input_tokens) + num(doc.usage?.cache_read_input_tokens); const costUsd = typeof doc.total_cost_usd === "number" ? doc.total_cost_usd : null; - return { finalText, tokens: { inputTokens, outputTokens, costUsd } }; + return { finalText, tokens: { inputTokens, outputTokens, cacheTokens, costUsd } }; } /** @@ -173,6 +182,7 @@ export function parseCodexOutput(stdout: string): { finalText: string; tokens: R let finalText = ""; let inputTokens = 0; let outputTokens = 0; + let cacheTokens = 0; for (const line of stdout.split("\n")) { const trimmed = line.trim(); if (trimmed.length === 0) continue; @@ -186,18 +196,19 @@ export function parseCodexOutput(stdout: string): { finalText: string; tokens: R const e = evt as { type?: unknown; item?: { type?: unknown; text?: unknown }; - usage?: { input_tokens?: unknown; output_tokens?: unknown }; + usage?: { input_tokens?: unknown; output_tokens?: unknown; cached_input_tokens?: unknown }; }; if (e.type === "item.completed" && e.item?.type === "agent_message") { if (typeof e.item.text === "string") finalText = e.item.text; } else if (e.type === "turn.completed" && e.usage !== undefined) { inputTokens = num(e.usage.input_tokens); outputTokens = num(e.usage.output_tokens); + cacheTokens = num(e.usage.cached_input_tokens); } } // Codex does not surface a per-invocation USD cost on the public event // schema, so cost is null (the report tolerates a null-cost arm). - return { finalText, tokens: { inputTokens, outputTokens, costUsd: null } }; + return { finalText, tokens: { inputTokens, outputTokens, cacheTokens, costUsd: null } }; } function num(v: unknown): number { @@ -285,7 +296,7 @@ function erroredOutcome(checkoutPath: string, finalText: string): RunOutcome { return { finalText, diff: "", - tokens: { inputTokens: 0, outputTokens: 0, costUsd: null }, + tokens: { inputTokens: 0, outputTokens: 0, cacheTokens: 0, costUsd: null }, checkoutPath, errored: true, }; diff --git a/packages/eval/src/oracle.test.ts b/packages/eval/src/oracle.test.ts index 6565e4c..f76b49b 100644 --- a/packages/eval/src/oracle.test.ts +++ b/packages/eval/src/oracle.test.ts @@ -10,7 +10,7 @@ import type { AssertionOracle, JudgeOracle, OutputHashOracle } from "./task.js"; const outcome = (over: Partial): RunOutcome => ({ finalText: "", diff: "", - tokens: { inputTokens: 0, outputTokens: 0, costUsd: null }, + tokens: { inputTokens: 0, outputTokens: 0, cacheTokens: 0, costUsd: null }, errored: false, ...over, }); diff --git a/packages/eval/src/probe.test.ts b/packages/eval/src/probe.test.ts index 13c73d0..1f4d884 100644 --- a/packages/eval/src/probe.test.ts +++ b/packages/eval/src/probe.test.ts @@ -34,6 +34,7 @@ class FakeRunner implements AgentRunner { tokens: { inputTokens: request.withPack ? 1100 : 1000, outputTokens: 100, + cacheTokens: 0, costUsd: null, }, errored: false, diff --git a/packages/eval/src/probe.ts b/packages/eval/src/probe.ts index 4432638..27542a9 100644 --- a/packages/eval/src/probe.ts +++ b/packages/eval/src/probe.ts @@ -70,15 +70,17 @@ export function resolveHarnesses(task: Task, options: ProbeOptions): readonly Ha function sumTokens(outcomes: readonly RunOutcome[]): ArmReport["tokens"] { let inputTokens = 0; let outputTokens = 0; + let cacheTokens = 0; let costUsd = 0; let everyRunHadCost = true; for (const o of outcomes) { inputTokens += o.tokens.inputTokens; outputTokens += o.tokens.outputTokens; + cacheTokens += o.tokens.cacheTokens; if (o.tokens.costUsd === null) everyRunHadCost = false; else costUsd += o.tokens.costUsd; } - return { inputTokens, outputTokens, costUsd: everyRunHadCost ? costUsd : null }; + return { inputTokens, outputTokens, cacheTokens, costUsd: everyRunHadCost ? costUsd : null }; } /** Run one arm: N invocations of the agent, then score + total tokens. */ diff --git a/packages/eval/src/report.test.ts b/packages/eval/src/report.test.ts index 81c887d..466021a 100644 --- a/packages/eval/src/report.test.ts +++ b/packages/eval/src/report.test.ts @@ -17,9 +17,9 @@ const assertionDispersion = (passRate: number, stddev: number): ArmDispersion => runs: 10, }); -const arm = (stddev: number, input: number, output: number): ArmReport => ({ +const arm = (stddev: number, input: number, output: number, cache = 0): ArmReport => ({ dispersion: assertionDispersion(0.5, stddev), - tokens: { inputTokens: input, outputTokens: output, costUsd: null }, + tokens: { inputTokens: input, outputTokens: output, cacheTokens: cache, costUsd: null }, }); describe("buildHarnessReport", () => { @@ -46,6 +46,22 @@ describe("buildHarnessReport", () => { assert.equal(report.tokenOverheadFlagged, false, "1.1× is under the 1.3× flag"); }); + it("counts cache tokens in the overhead total (the Bug-1 fix)", () => { + // Without the cache fix, both arms would read 1000 vs 1100 → 1.1×. The + // with-pack arm's large cached system prompt (8000) is real token cost and + // must push the overhead up, not be silently dropped. + const report = buildHarnessReport({ + harness: "claude", + runner: "cli:claude", + runs: 10, + without: arm(0.5, 1000, 0, 0), + with: arm(0.2, 1100, 0, 8000), + }); + // total = (1100 + 8000) / 1000 = 9.1× — the cache tokens dominate. + assert.ok(Math.abs(report.tokenOverhead - 9.1) < 1e-9, "cache tokens included in overhead"); + assert.equal(report.tokenOverheadFlagged, true); + }); + it("flags when token overhead exceeds the guardrail", () => { const report = buildHarnessReport({ harness: "claude", diff --git a/packages/eval/src/report.ts b/packages/eval/src/report.ts index ec31209..2e9b6a1 100644 --- a/packages/eval/src/report.ts +++ b/packages/eval/src/report.ts @@ -27,6 +27,12 @@ export const TOKEN_OVERHEAD_FLAG = 1.3; export interface ArmTokens { readonly inputTokens: number; readonly outputTokens: number; + /** + * Cached input tokens (Claude Code's per-call cached system prompt, Codex's + * `cached_input_tokens`). Counted in the token-overhead total — omitting it + * undercounts the cost the variance claim trades against. + */ + readonly cacheTokens: number; /** Sum of per-run cost when every run reported it; `null` otherwise. */ readonly costUsd: number | null; } @@ -69,7 +75,7 @@ export interface VarianceReport { /** Sum input+output tokens for an arm. */ function totalTokens(t: ArmTokens): number { - return t.inputTokens + t.outputTokens; + return t.inputTokens + t.outputTokens + t.cacheTokens; } /** @@ -129,10 +135,19 @@ export function formatReport(report: VarianceReport): string { ? ` [FLAG: > ${TOKEN_OVERHEAD_FLAG}× — stability bought expensively]` : ""), ); + // Surface the in/out/cache split so the overhead is auditable — cached + // tokens (the per-call system prompt) are part of the total, not hidden. + lines.push(` tokens without: ${fmtTokens(h.without.tokens)}`); + lines.push(` tokens with: ${fmtTokens(h.with.tokens)}`); } return lines.join("\n"); } +/** Render an arm's token split: input + output + cache (the overhead inputs). */ +function fmtTokens(t: ArmTokens): string { + return `in ${t.inputTokens} + out ${t.outputTokens} + cache ${t.cacheTokens}`; +} + function fmtDispersion(d: ArmDispersion): string { switch (d.kind) { case "output_hash": diff --git a/packages/eval/src/runner.ts b/packages/eval/src/runner.ts index d826284..1ff30e9 100644 --- a/packages/eval/src/runner.ts +++ b/packages/eval/src/runner.ts @@ -43,6 +43,16 @@ export interface RunRequest { export interface RunTokens { readonly inputTokens: number; readonly outputTokens: number; + /** + * Cached input tokens the harness billed/served separately from + * `inputTokens` — Claude Code's `cache_creation_input_tokens` + + * `cache_read_input_tokens`, or Codex's `cached_input_tokens`. Claude Code + * injects a large cached system prompt per call (~27K tokens observed), so + * omitting this materially undercounts the token-overhead headline (the + * "~10% more tokens" cost the variance claim rides on). 0 when the harness + * reports no cache usage. + */ + readonly cacheTokens: number; /** Total cost in USD when the harness reports it; `null` when unavailable. */ readonly costUsd: number | null; }