Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 33 additions & 2 deletions packages/cli/src/commands/variance-probe.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,12 @@ class FakeRunner implements AgentRunner {
return Promise.resolve({
finalText: request.withPack ? "stable" : `wander-${this.i}`,
diff: "",
tokens: { inputTokens: request.withPack ? 110 : 100, outputTokens: 10, costUsd: null },
tokens: {
inputTokens: request.withPack ? 110 : 100,
outputTokens: 10,
cacheTokens: 0,
costUsd: null,
},
errored: false,
});
}
Expand Down Expand Up @@ -88,7 +93,7 @@ describe("runVarianceProbe (seamed)", () => {
return Promise.resolve({
finalText: req.withPack ? "s" : `${Math.random()}`,
diff: "",
tokens: { inputTokens: 1, outputTokens: 1, costUsd: null },
tokens: { inputTokens: 1, outputTokens: 1, cacheTokens: 0, costUsd: null },
errored: false,
});
},
Expand All @@ -102,6 +107,32 @@ describe("runVarianceProbe (seamed)", () => {
});
assert.equal(withPackPrompts, 2, "both with-pack runs saw the assembled context");
});

it("builds a per-harness runner for each agent in the default set (Bug-2 routing)", async () => {
  // No harness is pinned, so the probe is expected to walk both agents. The
  // injected factory records every harness it is asked to build a runner for;
  // seeing each harness exactly once shows a per-harness model would be routed
  // to the matching runner.
  const builtFor: string[] = [];
  const cannedOutcome = () => ({
    finalText: "x",
    diff: "",
    tokens: { inputTokens: 1, outputTokens: 1, cacheTokens: 0, costUsd: null },
    errored: false,
  });

  await runVarianceProbe({
    taskFile,
    runs: 1,
    _assemblePackContext: () => Promise.resolve("PACK"),
    _runnerFor: (harness) => {
      builtFor.push(harness);
      return {
        name: `fake:${harness}`,
        run: async () => cannedOutcome(),
      };
    },
  });

  // Sort a copy so the assertion does not depend on visit order.
  const visited = builtFor.slice().sort();
  assert.deepEqual(visited, ["claude", "codex"], "one runner built per harness");
});
});

describe("assemblePackContext", () => {
Expand Down
20 changes: 14 additions & 6 deletions packages/cli/src/commands/variance-probe.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,14 @@ export interface VarianceProbeArgs {
readonly harness?: Harness;
/** AWS region for Bedrock inference; falls back to the inherited env. */
readonly awsRegion?: string;
/** Bedrock model / inference profile override (applies to every harness). */
readonly model?: string;
/**
* Per-harness Bedrock model / inference-profile override. Claude and Codex
* take different model ids (a `us.`-prefixed Anthropic profile vs an
* `openai.*` Bedrock model), so one global value cannot serve both — each
* harness reads its own entry, falling back to the runner's per-harness
* default when absent.
*/
readonly models?: Partial<Record<Harness, string>>;
/**
* Test seam — inject a fake pack-context assembler so unit tests don't need a
* real analyzed repo + pack on disk.
Expand Down Expand Up @@ -96,12 +102,14 @@ export async function runVarianceProbe(args: VarianceProbeArgs): Promise<Varianc
// (spec 010 §4a). Tests inject a fake via `_runnerFor`.
const runnerFor =
  args._runnerFor ??
  ((harness: Harness) => {
    // Each harness reads only its own entry from the per-harness override map;
    // an absent entry leaves `model` out of the options entirely.
    const model = args.models?.[harness];
    return new CliAgentRunner({
      harness,
      // Conditional spreads keep the keys absent (rather than present-but-
      // undefined) when no override was supplied.
      ...(model !== undefined ? { model } : {}),
      ...(args.awsRegion !== undefined ? { awsRegion: args.awsRegion } : {}),
    });
  });

const options: ProbeOptions = {
packContext,
Expand Down
16 changes: 13 additions & 3 deletions packages/cli/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -395,8 +395,13 @@ program
"With --variance-probe: AWS region for Bedrock inference (default: inherited AWS_REGION)",
)
.option(
"--model <id>",
"With --variance-probe: Bedrock model / inference-profile id (default per harness)",
"--model-claude <id>",
"With --variance-probe: Claude Code Bedrock model / inference-profile id " +
"(us.-prefixed; default us.anthropic.claude-sonnet-4-6)",
)
.option(
"--model-codex <id>",
"With --variance-probe: Codex Bedrock model id (default openai.gpt-5.5)",
)
.action(async (path: string | undefined, opts: Record<string, unknown>) => {
// --variance-probe short-circuits the normal pack path: it loads a task,
Expand All @@ -408,12 +413,17 @@ program
typeof opts["runs"] === "number" && Number.isFinite(opts["runs"])
? opts["runs"]
: undefined;
// Per-harness model overrides — Claude and Codex take different Bedrock
// ids, so they are separate flags rather than one global --model.
const models: Record<string, string> = {};
if (typeof opts["modelClaude"] === "string") models["claude"] = opts["modelClaude"];
if (typeof opts["modelCodex"] === "string") models["codex"] = opts["modelCodex"];
const report = await probeMod.runVarianceProbe({
taskFile: opts["varianceProbe"],
...(runs !== undefined ? { runs } : {}),
...(harness !== undefined ? { harness } : {}),
...(typeof opts["awsRegion"] === "string" ? { awsRegion: opts["awsRegion"] } : {}),
...(typeof opts["model"] === "string" ? { model: opts["model"] } : {}),
...(Object.keys(models).length > 0 ? { models } : {}),
});
probeMod.printVarianceReport(report, opts["json"] === true);
return;
Expand Down
22 changes: 20 additions & 2 deletions packages/eval/src/cli-runner.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,15 @@ describe("buildArgv", () => {
"PROMPT",
]);
});

it("honors a per-harness model override (Bug-2 fix: codex gets a codex model)", () => {
  // Regression guard: a single global model flag used to hand Claude's id to
  // Codex. The value following each harness's own model flag must be the model
  // that harness was configured with.
  const claudeModel = "us.anthropic.claude-opus-4-8";
  const { argv: claudeArgv } = buildArgv({ harness: "claude", model: claudeModel }, "P");
  const claudeFlagAt = claudeArgv.indexOf("--model");
  assert.equal(claudeArgv[claudeFlagAt + 1], claudeModel);

  const codexModel = "openai.gpt-5.4";
  const { argv: codexArgv } = buildArgv({ harness: "codex", model: codexModel }, "P");
  const codexFlagAt = codexArgv.indexOf("-m");
  assert.equal(codexArgv[codexFlagAt + 1], codexModel);
});
});

describe("composePrompt", () => {
Expand All @@ -114,24 +123,32 @@ describe("composePrompt", () => {
});

describe("parseClaudeOutput", () => {
it("extracts result text + usage + cache + cost from the JSON result object", () => {
  // Fixture mirrors the fields parseClaudeOutput reads: result text, the usage
  // counters (including both cache counters), and the total cost.
  const stdout = JSON.stringify({
    type: "result",
    subtype: "success",
    result: "Done — added the flag.",
    usage: {
      input_tokens: 1234,
      output_tokens: 56,
      cache_creation_input_tokens: 27406,
      cache_read_input_tokens: 100,
    },
    total_cost_usd: 0.0123,
  });
  const { finalText, tokens } = parseClaudeOutput(stdout);
  assert.equal(finalText, "Done — added the flag.");
  assert.equal(tokens.inputTokens, 1234);
  assert.equal(tokens.outputTokens, 56);
  // cache = creation 27406 + read 100 (the Bug-1 fix — was silently dropped).
  assert.equal(tokens.cacheTokens, 27506);
  assert.equal(tokens.costUsd, 0.0123);
});
it("tolerates a missing usage block (zeros, null cost)", () => {
  // Only `result` is present: the token counters asserted below must come back
  // as 0 and the cost as null rather than the parser throwing.
  const parsed = parseClaudeOutput(JSON.stringify({ result: "x" }));
  const usage = parsed.tokens;
  assert.equal(parsed.finalText, "x");
  assert.equal(usage.inputTokens, 0);
  assert.equal(usage.cacheTokens, 0);
  assert.equal(usage.costUsd, null);
});
it("throws on unparseable stdout", () => {
Expand Down Expand Up @@ -168,6 +185,7 @@ describe("parseCodexOutput", () => {
assert.equal(finalText, "Repo summary here.");
assert.equal(tokens.inputTokens, 24763);
assert.equal(tokens.outputTokens, 122);
assert.equal(tokens.cacheTokens, 24448, "codex cached_input_tokens captured (Bug-1 fix)");
assert.equal(tokens.costUsd, null, "codex exposes no per-invocation USD cost");
});

Expand Down
21 changes: 16 additions & 5 deletions packages/eval/src/cli-runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -152,14 +152,23 @@ export function buildArgv(
export function parseClaudeOutput(stdout: string): { finalText: string; tokens: RunTokens } {
  // Parses the single JSON document in `stdout`. JSON.parse throws on
  // unparseable input and that error is deliberately left to propagate.
  // Every field is typed `unknown` and narrowed before use.
  const doc = JSON.parse(stdout) as {
    result?: unknown;
    usage?: {
      input_tokens?: unknown;
      output_tokens?: unknown;
      cache_creation_input_tokens?: unknown;
      cache_read_input_tokens?: unknown;
    };
    total_cost_usd?: unknown;
  };
  // A non-string `result` degrades to the empty string.
  const finalText = typeof doc.result === "string" ? doc.result : "";
  const inputTokens = num(doc.usage?.input_tokens);
  const outputTokens = num(doc.usage?.output_tokens);
  // Claude Code injects a large cached system prompt per call; both the
  // creation and read halves are real token cost the overhead headline needs.
  const cacheTokens =
    num(doc.usage?.cache_creation_input_tokens) + num(doc.usage?.cache_read_input_tokens);
  // Cost is reported only when it is an actual number; anything else is null.
  const costUsd = typeof doc.total_cost_usd === "number" ? doc.total_cost_usd : null;
  return { finalText, tokens: { inputTokens, outputTokens, cacheTokens, costUsd } };
}

/**
Expand All @@ -173,6 +182,7 @@ export function parseCodexOutput(stdout: string): { finalText: string; tokens: R
let finalText = "";
let inputTokens = 0;
let outputTokens = 0;
let cacheTokens = 0;
for (const line of stdout.split("\n")) {
const trimmed = line.trim();
if (trimmed.length === 0) continue;
Expand All @@ -186,18 +196,19 @@ export function parseCodexOutput(stdout: string): { finalText: string; tokens: R
const e = evt as {
type?: unknown;
item?: { type?: unknown; text?: unknown };
usage?: { input_tokens?: unknown; output_tokens?: unknown };
usage?: { input_tokens?: unknown; output_tokens?: unknown; cached_input_tokens?: unknown };
};
if (e.type === "item.completed" && e.item?.type === "agent_message") {
if (typeof e.item.text === "string") finalText = e.item.text;
} else if (e.type === "turn.completed" && e.usage !== undefined) {
inputTokens = num(e.usage.input_tokens);
outputTokens = num(e.usage.output_tokens);
cacheTokens = num(e.usage.cached_input_tokens);
}
}
// Codex does not surface a per-invocation USD cost on the public event
// schema, so cost is null (the report tolerates a null-cost arm).
return { finalText, tokens: { inputTokens, outputTokens, costUsd: null } };
return { finalText, tokens: { inputTokens, outputTokens, cacheTokens, costUsd: null } };
}

function num(v: unknown): number {
Expand Down Expand Up @@ -285,7 +296,7 @@ function erroredOutcome(checkoutPath: string, finalText: string): RunOutcome {
return {
finalText,
diff: "",
tokens: { inputTokens: 0, outputTokens: 0, costUsd: null },
tokens: { inputTokens: 0, outputTokens: 0, cacheTokens: 0, costUsd: null },
checkoutPath,
errored: true,
};
Expand Down
2 changes: 1 addition & 1 deletion packages/eval/src/oracle.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import type { AssertionOracle, JudgeOracle, OutputHashOracle } from "./task.js";
// Fixture builder: a non-errored, empty run outcome with zeroed token counters;
// any field supplied in `over` replaces the default (spread last).
const outcome = (over: Partial<RunOutcome>): RunOutcome => ({
  finalText: "",
  diff: "",
  tokens: { inputTokens: 0, outputTokens: 0, cacheTokens: 0, costUsd: null },
  errored: false,
  ...over,
});
Expand Down
1 change: 1 addition & 0 deletions packages/eval/src/probe.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ class FakeRunner implements AgentRunner {
tokens: {
inputTokens: request.withPack ? 1100 : 1000,
outputTokens: 100,
cacheTokens: 0,
costUsd: null,
},
errored: false,
Expand Down
4 changes: 3 additions & 1 deletion packages/eval/src/probe.ts
Original file line number Diff line number Diff line change
Expand Up @@ -70,15 +70,17 @@ export function resolveHarnesses(task: Task, options: ProbeOptions): readonly Ha
function sumTokens(outcomes: readonly RunOutcome[]): ArmReport["tokens"] {
  // Totals the per-run token counters of one arm. Input, output and cache
  // counts are plain sums over every outcome.
  let inputTokens = 0;
  let outputTokens = 0;
  let cacheTokens = 0;
  // Cost is only meaningful when every run reported one: a single null-cost
  // run makes the arm's total cost null rather than a partial sum.
  let costUsd = 0;
  let everyRunHadCost = true;
  for (const { tokens } of outcomes) {
    inputTokens += tokens.inputTokens;
    outputTokens += tokens.outputTokens;
    cacheTokens += tokens.cacheTokens;
    if (tokens.costUsd === null) everyRunHadCost = false;
    else costUsd += tokens.costUsd;
  }
  return { inputTokens, outputTokens, cacheTokens, costUsd: everyRunHadCost ? costUsd : null };
}

/** Run one arm: N invocations of the agent, then score + total tokens. */
Expand Down
20 changes: 18 additions & 2 deletions packages/eval/src/report.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ const assertionDispersion = (passRate: number, stddev: number): ArmDispersion =>
runs: 10,
});

// Fixture builder for one arm: a fixed 0.5 pass rate with the given stddev,
// plus the given token counts. `cache` defaults to 0 so call sites that pass
// only three arguments keep working.
const arm = (stddev: number, input: number, output: number, cache = 0): ArmReport => ({
  dispersion: assertionDispersion(0.5, stddev),
  tokens: { inputTokens: input, outputTokens: output, cacheTokens: cache, costUsd: null },
});

describe("buildHarnessReport", () => {
Expand All @@ -46,6 +46,22 @@ describe("buildHarnessReport", () => {
assert.equal(report.tokenOverheadFlagged, false, "1.1× is under the 1.3× flag");
});

it("counts cache tokens in the overhead total (the Bug-1 fix)", () => {
  // If cache tokens were ignored the arms would compare as 1000 vs 1100, i.e.
  // 1.1×. The with-pack arm also carries 8000 cached tokens; those are real
  // cost and have to raise the ratio instead of vanishing.
  const withoutPack = arm(0.5, 1000, 0, 0);
  const withPack = arm(0.2, 1100, 0, 8000);
  const report = buildHarnessReport({
    harness: "claude",
    runner: "cli:claude",
    runs: 10,
    without: withoutPack,
    with: withPack,
  });
  // Expected ratio: (1100 + 8000) / 1000 = 9.1× — dominated by the cache tokens.
  const expectedOverhead = 9.1;
  const drift = Math.abs(report.tokenOverhead - expectedOverhead);
  assert.ok(drift < 1e-9, "cache tokens included in overhead");
  assert.equal(report.tokenOverheadFlagged, true);
});

it("flags when token overhead exceeds the guardrail", () => {
const report = buildHarnessReport({
harness: "claude",
Expand Down
17 changes: 16 additions & 1 deletion packages/eval/src/report.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,12 @@ export const TOKEN_OVERHEAD_FLAG = 1.3;
export interface ArmTokens {
  /** Input-token count for the arm. */
  readonly inputTokens: number;
  /** Output-token count for the arm. */
  readonly outputTokens: number;
  /**
   * Cached-input-token count: Claude Code's cached per-call system prompt, or
   * Codex's `cached_input_tokens`. It is part of the token-overhead total;
   * leaving it out would understate the cost that the variance claim is
   * weighed against.
   */
  readonly cacheTokens: number;
  /** Summed per-run cost when every run reported one; otherwise `null`. */
  readonly costUsd: number | null;
}
Expand Down Expand Up @@ -69,7 +75,7 @@ export interface VarianceReport {

/**
 * Sum input + output + cache tokens for an arm.
 *
 * Cache tokens are included on purpose: they are part of the token-overhead
 * total, so dropping them would understate an arm's cost.
 */
function totalTokens(t: ArmTokens): number {
  return t.inputTokens + t.outputTokens + t.cacheTokens;
}

/**
Expand Down Expand Up @@ -129,10 +135,19 @@ export function formatReport(report: VarianceReport): string {
? ` [FLAG: > ${TOKEN_OVERHEAD_FLAG}× — stability bought expensively]`
: ""),
);
// Surface the in/out/cache split so the overhead is auditable — cached
// tokens (the per-call system prompt) are part of the total, not hidden.
lines.push(` tokens without: ${fmtTokens(h.without.tokens)}`);
lines.push(` tokens with: ${fmtTokens(h.with.tokens)}`);
}
return lines.join("\n");
}

/** Format one arm's token breakdown as `in <n> + out <n> + cache <n>`. */
function fmtTokens(t: ArmTokens): string {
  const { inputTokens, outputTokens, cacheTokens } = t;
  const parts = ["in " + inputTokens, "out " + outputTokens, "cache " + cacheTokens];
  return parts.join(" + ");
}

function fmtDispersion(d: ArmDispersion): string {
switch (d.kind) {
case "output_hash":
Expand Down
10 changes: 10 additions & 0 deletions packages/eval/src/runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,16 @@ export interface RunRequest {
export interface RunTokens {
  /** Input tokens reported for the run. */
  readonly inputTokens: number;
  /** Output tokens reported for the run. */
  readonly outputTokens: number;
  /**
   * Cached input tokens, which the harness reports apart from `inputTokens`:
   * for Claude Code the sum of `cache_creation_input_tokens` and
   * `cache_read_input_tokens`, for Codex `cached_input_tokens`. Claude Code
   * sends a large cached system prompt on every call (~27K tokens observed),
   * so leaving this out materially understates the token-overhead headline
   * (the "~10% more tokens" cost the variance claim rides on). The value is 0
   * when the harness reports no cache usage.
   */
  readonly cacheTokens: number;
  /** Total USD cost if the harness reports one; `null` when it does not. */
  readonly costUsd: number | null;
}
Expand Down
Loading