theagenticguy · theagenticguy · Jun 30, 2026 · Jun 30, 2026
@@ -28,7 +28,12 @@ class FakeRunner implements AgentRunner {
     return Promise.resolve({
       finalText: request.withPack ? "stable" : `wander-${this.i}`,
       diff: "",
-      tokens: { inputTokens: request.withPack ? 110 : 100, outputTokens: 10, costUsd: null },
+      tokens: {
+        inputTokens: request.withPack ? 110 : 100,
+        outputTokens: 10,
+        cacheTokens: 0,
+        costUsd: null,
+      },
       errored: false,
     });
   }
@@ -88,7 +93,7 @@ describe("runVarianceProbe (seamed)", () => {
         return Promise.resolve({
           finalText: req.withPack ? "s" : `${Math.random()}`,
           diff: "",
-          tokens: { inputTokens: 1, outputTokens: 1, costUsd: null },
+          tokens: { inputTokens: 1, outputTokens: 1, cacheTokens: 0, costUsd: null },
           errored: false,
         });
       },
@@ -102,6 +107,32 @@ describe("runVarianceProbe (seamed)", () => {
     });
     assert.equal(withPackPrompts, 2, "both with-pack runs saw the assembled context");
   });
+
+  it("builds a per-harness runner for each agent in the default set (Bug-2 routing)", async () => {
+    // With no --harness pin, the probe visits both agents; the default factory
+    // maps args.models[harness] to each. We assert the factory is invoked once
+    // per harness so a per-harness model would reach the right runner.
+    const seen: string[] = [];
+    await runVarianceProbe({
+      taskFile,
+      runs: 1,
+      _assemblePackContext: async () => "PACK",
+      _runnerFor: (h) => {
+        seen.push(h);
+        return {
+          name: `fake:${h}`,
+          run: () =>
+            Promise.resolve({
+              finalText: "x",
+              diff: "",
+              tokens: { inputTokens: 1, outputTokens: 1, cacheTokens: 0, costUsd: null },
+              errored: false,
+            }),
+        };
+      },
+    });
+    assert.deepEqual([...seen].sort(), ["claude", "codex"], "one runner built per harness");
+  });
 });
 
 describe("assemblePackContext", () => {

@@ -43,8 +43,14 @@ export interface VarianceProbeArgs {
   readonly harness?: Harness;
   /** AWS region for Bedrock inference; falls back to the inherited env. */
   readonly awsRegion?: string;
-  /** Bedrock model / inference profile override (applies to every harness). */
-  readonly model?: string;
+  /**
+   * Per-harness Bedrock model / inference-profile override. Claude and Codex
+   * take different model ids (a `us.`-prefixed Anthropic profile vs an
+   * `openai.*` Bedrock model), so one global value cannot serve both — each
+   * harness reads its own entry, falling back to the runner's per-harness
+   * default when absent.
+   */
+  readonly models?: Partial<Record<Harness, string>>;
   /**
    * Test seam — inject a fake pack-context assembler so unit tests don't need a
    * real analyzed repo + pack on disk.
@@ -96,12 +102,14 @@ export async function runVarianceProbe(args: VarianceProbeArgs): Promise<Varianc
   //    (spec 010 §4a). Tests inject a fake via `_runnerFor`.
   const runnerFor =
     args._runnerFor ??
-    ((harness: Harness) =>
-      new CliAgentRunner({
+    ((harness: Harness) => {
+      const model = args.models?.[harness];
+      return new CliAgentRunner({
         harness,
-        ...(args.model !== undefined ? { model: args.model } : {}),
+        ...(model !== undefined ? { model } : {}),
         ...(args.awsRegion !== undefined ? { awsRegion: args.awsRegion } : {}),
-      }));
+      });
+    });
 
   const options: ProbeOptions = {
     packContext,

@@ -395,8 +395,13 @@ program
     "With --variance-probe: AWS region for Bedrock inference (default: inherited AWS_REGION)",
   )
   .option(
-    "--model <id>",
-    "With --variance-probe: Bedrock model / inference-profile id (default per harness)",
+    "--model-claude <id>",
+    "With --variance-probe: Claude Code Bedrock model / inference-profile id " +
+      "(us.-prefixed; default us.anthropic.claude-sonnet-4-6)",
+  )
+  .option(
+    "--model-codex <id>",
+    "With --variance-probe: Codex Bedrock model id (default openai.gpt-5.5)",
   )
   .action(async (path: string | undefined, opts: Record<string, unknown>) => {
     // --variance-probe short-circuits the normal pack path: it loads a task,
@@ -408,12 +413,17 @@ program
         typeof opts["runs"] === "number" && Number.isFinite(opts["runs"])
           ? opts["runs"]
           : undefined;
+      // Per-harness model overrides — Claude and Codex take different Bedrock
+      // ids, so they are separate flags rather than one global --model.
+      const models: Record<string, string> = {};
+      if (typeof opts["modelClaude"] === "string") models["claude"] = opts["modelClaude"];
+      if (typeof opts["modelCodex"] === "string") models["codex"] = opts["modelCodex"];
       const report = await probeMod.runVarianceProbe({
         taskFile: opts["varianceProbe"],
         ...(runs !== undefined ? { runs } : {}),
         ...(harness !== undefined ? { harness } : {}),
         ...(typeof opts["awsRegion"] === "string" ? { awsRegion: opts["awsRegion"] } : {}),
-        ...(typeof opts["model"] === "string" ? { model: opts["model"] } : {}),
+        ...(Object.keys(models).length > 0 ? { models } : {}),
       });
       probeMod.printVarianceReport(report, opts["json"] === true);
       return;

@@ -89,6 +89,15 @@ describe("buildArgv", () => {
       "PROMPT",
     ]);
   });
+
+  it("honors a per-harness model override (Bug-2 fix: codex gets a codex model)", () => {
+    // The bug was that one global --model handed Claude's id to Codex. Each
+    // harness must carry its own model into the argv.
+    const claude = buildArgv({ harness: "claude", model: "us.anthropic.claude-opus-4-8" }, "P");
+    assert.equal(claude.argv[claude.argv.indexOf("--model") + 1], "us.anthropic.claude-opus-4-8");
+    const codex = buildArgv({ harness: "codex", model: "openai.gpt-5.4" }, "P");
+    assert.equal(codex.argv[codex.argv.indexOf("-m") + 1], "openai.gpt-5.4");
+  });
 });
 
 describe("composePrompt", () => {
@@ -114,24 +123,32 @@ describe("composePrompt", () => {
 });
 
 describe("parseClaudeOutput", () => {
-  it("extracts result text + usage + cost from the JSON result object", () => {
+  it("extracts result text + usage + cache + cost from the JSON result object", () => {
     const stdout = JSON.stringify({
       type: "result",
       subtype: "success",
       result: "Done — added the flag.",
-      usage: { input_tokens: 1234, output_tokens: 56 },
+      usage: {
+        input_tokens: 1234,
+        output_tokens: 56,
+        cache_creation_input_tokens: 27406,
+        cache_read_input_tokens: 100,
+      },
       total_cost_usd: 0.0123,
     });
     const { finalText, tokens } = parseClaudeOutput(stdout);
     assert.equal(finalText, "Done — added the flag.");
     assert.equal(tokens.inputTokens, 1234);
     assert.equal(tokens.outputTokens, 56);
+    // cache = creation 27406 + read 100 (the Bug-1 fix — was silently dropped).
+    assert.equal(tokens.cacheTokens, 27506);
     assert.equal(tokens.costUsd, 0.0123);
   });
   it("tolerates a missing usage block (zeros, null cost)", () => {
     const { tokens, finalText } = parseClaudeOutput(JSON.stringify({ result: "x" }));
     assert.equal(finalText, "x");
     assert.equal(tokens.inputTokens, 0);
+    assert.equal(tokens.cacheTokens, 0);
     assert.equal(tokens.costUsd, null);
   });
   it("throws on unparseable stdout", () => {
@@ -168,6 +185,7 @@ describe("parseCodexOutput", () => {
     assert.equal(finalText, "Repo summary here.");
     assert.equal(tokens.inputTokens, 24763);
     assert.equal(tokens.outputTokens, 122);
+    assert.equal(tokens.cacheTokens, 24448, "codex cached_input_tokens captured (Bug-1 fix)");
     assert.equal(tokens.costUsd, null, "codex exposes no per-invocation USD cost");
   });
 

@@ -152,14 +152,23 @@ export function buildArgv(
 export function parseClaudeOutput(stdout: string): { finalText: string; tokens: RunTokens } {
   const doc = JSON.parse(stdout) as {
     result?: unknown;
-    usage?: { input_tokens?: unknown; output_tokens?: unknown };
+    usage?: {
+      input_tokens?: unknown;
+      output_tokens?: unknown;
+      cache_creation_input_tokens?: unknown;
+      cache_read_input_tokens?: unknown;
+    };
     total_cost_usd?: unknown;
   };
   const finalText = typeof doc.result === "string" ? doc.result : "";
   const inputTokens = num(doc.usage?.input_tokens);
   const outputTokens = num(doc.usage?.output_tokens);
+  // Claude Code injects a large cached system prompt per call; both the
+  // creation and read halves are real token cost the overhead headline needs.
+  const cacheTokens =
+    num(doc.usage?.cache_creation_input_tokens) + num(doc.usage?.cache_read_input_tokens);
   const costUsd = typeof doc.total_cost_usd === "number" ? doc.total_cost_usd : null;
-  return { finalText, tokens: { inputTokens, outputTokens, costUsd } };
+  return { finalText, tokens: { inputTokens, outputTokens, cacheTokens, costUsd } };
 }
 
 /**
@@ -173,6 +182,7 @@ export function parseCodexOutput(stdout: string): { finalText: string; tokens: R
   let finalText = "";
   let inputTokens = 0;
   let outputTokens = 0;
+  let cacheTokens = 0;
   for (const line of stdout.split("\n")) {
     const trimmed = line.trim();
     if (trimmed.length === 0) continue;
@@ -186,18 +196,19 @@ export function parseCodexOutput(stdout: string): { finalText: string; tokens: R
     const e = evt as {
       type?: unknown;
       item?: { type?: unknown; text?: unknown };
-      usage?: { input_tokens?: unknown; output_tokens?: unknown };
+      usage?: { input_tokens?: unknown; output_tokens?: unknown; cached_input_tokens?: unknown };
     };
     if (e.type === "item.completed" && e.item?.type === "agent_message") {
       if (typeof e.item.text === "string") finalText = e.item.text;
     } else if (e.type === "turn.completed" && e.usage !== undefined) {
       inputTokens = num(e.usage.input_tokens);
       outputTokens = num(e.usage.output_tokens);
+      cacheTokens = num(e.usage.cached_input_tokens);
     }
   }
   // Codex does not surface a per-invocation USD cost on the public event
   // schema, so cost is null (the report tolerates a null-cost arm).
-  return { finalText, tokens: { inputTokens, outputTokens, costUsd: null } };
+  return { finalText, tokens: { inputTokens, outputTokens, cacheTokens, costUsd: null } };
 }
 
 function num(v: unknown): number {
@@ -285,7 +296,7 @@ function erroredOutcome(checkoutPath: string, finalText: string): RunOutcome {
   return {
     finalText,
     diff: "",
-    tokens: { inputTokens: 0, outputTokens: 0, costUsd: null },
+    tokens: { inputTokens: 0, outputTokens: 0, cacheTokens: 0, costUsd: null },
     checkoutPath,
     errored: true,
   };

@@ -10,7 +10,7 @@ import type { AssertionOracle, JudgeOracle, OutputHashOracle } from "./task.js";
 const outcome = (over: Partial<RunOutcome>): RunOutcome => ({
   finalText: "",
   diff: "",
-  tokens: { inputTokens: 0, outputTokens: 0, costUsd: null },
+  tokens: { inputTokens: 0, outputTokens: 0, cacheTokens: 0, costUsd: null },
   errored: false,
   ...over,
 });

@@ -34,6 +34,7 @@ class FakeRunner implements AgentRunner {
       tokens: {
         inputTokens: request.withPack ? 1100 : 1000,
         outputTokens: 100,
+        cacheTokens: 0,
         costUsd: null,
       },
       errored: false,

@@ -70,15 +70,17 @@ export function resolveHarnesses(task: Task, options: ProbeOptions): readonly Ha
 function sumTokens(outcomes: readonly RunOutcome[]): ArmReport["tokens"] {
   let inputTokens = 0;
   let outputTokens = 0;
+  let cacheTokens = 0;
   let costUsd = 0;
   let everyRunHadCost = true;
   for (const o of outcomes) {
     inputTokens += o.tokens.inputTokens;
     outputTokens += o.tokens.outputTokens;
+    cacheTokens += o.tokens.cacheTokens;
     if (o.tokens.costUsd === null) everyRunHadCost = false;
     else costUsd += o.tokens.costUsd;
   }
-  return { inputTokens, outputTokens, costUsd: everyRunHadCost ? costUsd : null };
+  return { inputTokens, outputTokens, cacheTokens, costUsd: everyRunHadCost ? costUsd : null };
 }
 
 /** Run one arm: N invocations of the agent, then score + total tokens. */

@@ -17,9 +17,9 @@ const assertionDispersion = (passRate: number, stddev: number): ArmDispersion =>
   runs: 10,
 });
 
-const arm = (stddev: number, input: number, output: number): ArmReport => ({
+const arm = (stddev: number, input: number, output: number, cache = 0): ArmReport => ({
   dispersion: assertionDispersion(0.5, stddev),
-  tokens: { inputTokens: input, outputTokens: output, costUsd: null },
+  tokens: { inputTokens: input, outputTokens: output, cacheTokens: cache, costUsd: null },
 });
 
 describe("buildHarnessReport", () => {
@@ -46,6 +46,22 @@ describe("buildHarnessReport", () => {
     assert.equal(report.tokenOverheadFlagged, false, "1.1× is under the 1.3× flag");
   });
 
+  it("counts cache tokens in the overhead total (the Bug-1 fix)", () => {
+    // Without the cache fix, both arms would read 1000 vs 1100 → 1.1×. The
+    // with-pack arm's large cached system prompt (8000) is real token cost and
+    // must push the overhead up, not be silently dropped.
+    const report = buildHarnessReport({
+      harness: "claude",
+      runner: "cli:claude",
+      runs: 10,
+      without: arm(0.5, 1000, 0, 0),
+      with: arm(0.2, 1100, 0, 8000),
+    });
+    // total = (1100 + 8000) / 1000 = 9.1× — the cache tokens dominate.
+    assert.ok(Math.abs(report.tokenOverhead - 9.1) < 1e-9, "cache tokens included in overhead");
+    assert.equal(report.tokenOverheadFlagged, true);
+  });
+
   it("flags when token overhead exceeds the guardrail", () => {
     const report = buildHarnessReport({
       harness: "claude",

@@ -27,6 +27,12 @@ export const TOKEN_OVERHEAD_FLAG = 1.3;
 export interface ArmTokens {
   readonly inputTokens: number;
   readonly outputTokens: number;
+  /**
+   * Cached input tokens (Claude Code's per-call cached system prompt, Codex's
+   * `cached_input_tokens`). Added to input + output in the token-overhead
+   * total — omitting it undercounts the cost the variance claim trades against.
+   */
+  readonly cacheTokens: number;
   /** Sum of per-run cost when every run reported it; `null` otherwise. */
   readonly costUsd: number | null;
 }
@@ -69,7 +75,7 @@ export interface VarianceReport {
 
-/** Sum input+output tokens for an arm. */
+/** Sum input + output + cache tokens for an arm (the token-overhead total). */
 function totalTokens(t: ArmTokens): number {
-  return t.inputTokens + t.outputTokens;
+  return t.inputTokens + t.outputTokens + t.cacheTokens;
 }
 
 /**
@@ -129,10 +135,19 @@ export function formatReport(report: VarianceReport): string {
           ? `  [FLAG: > ${TOKEN_OVERHEAD_FLAG}× — stability bought expensively]`
           : ""),
     );
+    // Surface the in/out/cache split so the overhead is auditable — cached
+    // tokens (the per-call system prompt) are part of the total, not hidden.
+    lines.push(`    tokens without:          ${fmtTokens(h.without.tokens)}`);
+    lines.push(`    tokens with:             ${fmtTokens(h.with.tokens)}`);
   }
   return lines.join("\n");
 }
 
+/** Render an arm's token split: input + output + cache (the overhead inputs). */
+function fmtTokens(t: ArmTokens): string {
+  return `in ${t.inputTokens} + out ${t.outputTokens} + cache ${t.cacheTokens}`;
+}
+
 function fmtDispersion(d: ArmDispersion): string {
   switch (d.kind) {
     case "output_hash":

@@ -43,6 +43,16 @@ export interface RunRequest {
 export interface RunTokens {
   readonly inputTokens: number;
   readonly outputTokens: number;
+  /**
+   * Cached input tokens — Claude Code's `cache_creation_input_tokens` +
+   * `cache_read_input_tokens`, or Codex's `cached_input_tokens`; 0 when the
+   * harness reports none. Claude Code injects a large cached system prompt per
+   * call (~27K tokens observed), so omitting this materially undercounts the
+   * token-overhead headline (the "~10% more tokens" cost the claim rides on).
+   * NOTE(review): Codex's `cached_input_tokens` looks like a subset of its
+   * `input_tokens`, not a separate bucket — confirm, else totals double-count.
+   */
+  readonly cacheTokens: number;
   /** Total cost in USD when the harness reports it; `null` when unavailable. */
   readonly costUsd: number | null;
 }