setSelectedModel(null)}
/>
)}
diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.prompts.$promptSlug/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.prompts.$promptSlug/route.tsx
index 29753dd1133..b81ca842c47 100644
--- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.prompts.$promptSlug/route.tsx
+++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.prompts.$promptSlug/route.tsx
@@ -1626,7 +1626,7 @@ function MetricsTab({
return (
{/* Summary big numbers */}
-
+
{/* Version performance */}
@@ -1808,7 +1822,7 @@ function VersionPerformanceSection({
0 GROUP BY timeBucket ORDER BY timeBucket`}
+ query={`SELECT timeBucket(), prettyFormat(quantile(0.5)((input_cost + cached_read_cost + cache_creation_cost) / input_tokens * 1000), 'costInDollars') AS p50, prettyFormat(quantile(0.95)((input_cost + cached_read_cost + cache_creation_cost) / input_tokens * 1000), 'costInDollars') AS p95 FROM llm_metrics WHERE input_tokens > 0 GROUP BY timeBucket ORDER BY timeBucket`}
config={{
type: "chart",
chartType: "line",
@@ -1862,6 +1876,45 @@ function VersionPerformanceSection({
{...widgetProps}
/>
+ {/* Row 4: Caching */}
+
+
+
+
+
+
diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.spans.$spanId.ts b/apps/webapp/app/routes/api.v1.runs.$runId.spans.$spanId.ts
index a5250e5b850..c38206473cb 100644
--- a/apps/webapp/app/routes/api.v1.runs.$runId.spans.$spanId.ts
+++ b/apps/webapp/app/routes/api.v1.runs.$runId.spans.$spanId.ts
@@ -170,6 +170,8 @@ export const loader = createLoaderApiRoute(
inputCost: aiData.inputCost,
outputCost: aiData.outputCost,
totalCost: aiData.totalCost,
+ cachedCost: aiData.cachedCost,
+ cacheCreationCost: aiData.cacheCreationCost,
tokensPerSecond: aiData.tokensPerSecond,
msToFirstChunk: aiData.msToFirstChunk,
durationMs: aiData.durationMs,
diff --git a/apps/webapp/app/v3/utils/enrichCreatableEvents.server.ts b/apps/webapp/app/v3/utils/enrichCreatableEvents.server.ts
index 94e1539bb44..024ac75a940 100644
--- a/apps/webapp/app/v3/utils/enrichCreatableEvents.server.ts
+++ b/apps/webapp/app/v3/utils/enrichCreatableEvents.server.ts
@@ -110,6 +110,8 @@ function enrichLlmMetrics(event: CreateEventInput): void {
"trigger.llm.input_cost": cost.inputCost,
"trigger.llm.output_cost": cost.outputCost,
"trigger.llm.total_cost": cost.totalCost,
+ "trigger.llm.cached_cost": cost.costDetails["input_cached_tokens"] ?? 0,
+ "trigger.llm.cache_creation_cost": cost.costDetails["cache_creation_input_tokens"] ?? 0,
"trigger.llm.matched_model": cost.matchedModelName,
"trigger.llm.matched_model_id": cost.matchedModelId,
"trigger.llm.pricing_tier": cost.pricingTierName,
diff --git a/internal-packages/llm-model-catalog/src/registry.test.ts b/internal-packages/llm-model-catalog/src/registry.test.ts
index 679c8c4cfcf..349ba2622e6 100644
--- a/internal-packages/llm-model-catalog/src/registry.test.ts
+++ b/internal-packages/llm-model-catalog/src/registry.test.ts
@@ -69,12 +69,59 @@ const claudeSonnet: LlmModelWithPricing = {
],
};
+// Fixture: prices cache reads under the Anthropic-style alias `cache_read_input_tokens` (not the
+// normalized `input_cached_tokens` usage key) plus a cache-creation price, to exercise alias resolution.
+const claudeWithCache: LlmModelWithPricing = {
+ id: "model-claude-with-cache",
+ friendlyId: "llm_model_claude_with_cache",
+ modelName: "claude-with-cache",
+ matchPattern: "^claude-with-cache$",
+ startDate: null,
+ pricingTiers: [
+ {
+ id: "tier-claude-with-cache",
+ name: "Standard",
+ isDefault: true,
+ priority: 0,
+ conditions: [],
+ prices: [
+ { usageType: "input", price: 0.000003 },
+ { usageType: "output", price: 0.000015 },
+ { usageType: "cache_read_input_tokens", price: 0.0000003 }, // 0.1x the input price
+ { usageType: "cache_creation_input_tokens", price: 0.00000375 }, // 1.25x the input price
+ ],
+ },
+ ],
+};
+
+// Fixture with only input/output prices (no cache prices at all) — cached tokens should fall back to the input price.
+const noCachePrice: LlmModelWithPricing = {
+ id: "model-no-cache-price",
+ friendlyId: "llm_model_no_cache_price",
+ modelName: "no-cache-price",
+ matchPattern: "^no-cache-price$",
+ startDate: null,
+ pricingTiers: [
+ {
+ id: "tier-no-cache-price",
+ name: "Standard",
+ isDefault: true,
+ priority: 0,
+ conditions: [],
+ prices: [
+ { usageType: "input", price: 0.000003 }, // the only price cache tokens can fall back to
+ { usageType: "output", price: 0.000015 },
+ ],
+ },
+ ],
+};
+
describe("ModelPricingRegistry", () => {
let registry: TestableRegistry;
beforeEach(() => {
registry = new TestableRegistry(null as any);
- registry.loadPatterns([gpt4o, claudeSonnet]);
+ registry.loadPatterns([gpt4o, claudeSonnet, claudeWithCache, noCachePrice]);
});
describe("match", () => {
@@ -129,7 +176,10 @@ describe("ModelPricingRegistry", () => {
expect(result!.totalCost).toBeCloseTo(0.0035);
});
- it("should include cached token costs", () => {
+ it("should include cached token costs and charge input only on the fresh portion", () => {
+ // input_tokens (500) is inclusive of the 200 cached read tokens, so the input price
+ // applies to the 300 fresh tokens and the cache price to the 200 cached tokens — the
+ // cached tokens must not be billed twice. Explicit precision: toBeCloseTo's 2-digit default (±0.005) would also accept the old 0.00125 / 0.002.
const result = registry.calculateCost("gpt-4o", {
input: 500,
output: 50,
@@ -137,10 +187,57 @@ describe("ModelPricingRegistry", () => {
});
expect(result).not.toBeNull();
- expect(result!.costDetails["input"]).toBeCloseTo(0.00125); // 500 * 0.0000025
+ expect(result!.costDetails["input"]).toBeCloseTo(0.00075, 9); // (500 - 200) * 0.0000025
expect(result!.costDetails["output"]).toBeCloseTo(0.0005); // 50 * 0.00001
expect(result!.costDetails["input_cached_tokens"]).toBeCloseTo(0.00025); // 200 * 0.00000125
- expect(result!.totalCost).toBeCloseTo(0.002);
+ expect(result!.totalCost).toBeCloseTo(0.0015, 9);
+ });
+
+ it("should not double-charge cache creation tokens (subset of input)", () => {
+ // input (1000) is inclusive of both the 400 cache-read and 300 cache-creation tokens.
+ const result = registry.calculateCost("claude-with-cache", {
+ input: 1000,
+ output: 100,
+ input_cached_tokens: 400,
+ cache_creation_input_tokens: 300,
+ });
+
+ expect(result).not.toBeNull();
+ // fresh input = 1000 - 400 - 300 = 300
+ expect(result!.costDetails["input"]).toBeCloseTo(0.0009, 9); // 300 * 0.000003
+ expect(result!.costDetails["input_cached_tokens"]).toBeCloseTo(0.00012, 9); // 400 * 0.0000003
+ expect(result!.costDetails["cache_creation_input_tokens"]).toBeCloseTo(0.001125, 9); // 300 * 0.00000375
+ expect(result!.costDetails["output"]).toBeCloseTo(0.0015, 9); // 100 * 0.000015
+ // 0.0009 + 0.00012 + 0.001125 + 0.0015 (explicit precision: the 2-digit default tolerance of ±0.005 exceeds every cost asserted here)
+ expect(result!.totalCost).toBeCloseTo(0.003645, 9);
+ });
+
+ it("should apply the cache-read discount when priced under a provider alias key", () => {
+ // The usage is normalized to `input_cached_tokens` but this model prices cache reads
+ // under `cache_read_input_tokens` — the discount must still apply. Explicit precision: the 2-digit default (±0.005) could not tell the discounted price from the input price.
+ const result = registry.calculateCost("claude-with-cache", {
+ input: 1000,
+ input_cached_tokens: 400,
+ });
+
+ expect(result).not.toBeNull();
+ expect(result!.costDetails["input"]).toBeCloseTo(0.0018, 9); // (1000 - 400) * 0.000003
+ expect(result!.costDetails["input_cached_tokens"]).toBeCloseTo(0.00012, 9); // 400 * 0.0000003
+ expect(result!.totalCost).toBeCloseTo(0.00192, 9);
+ });
+
+ it("should fall back to the input price for cache tokens when no cache price exists", () => {
+ // no-cache-price model has only input/output prices; cached tokens must still be billed
+ // (at the input price) — never free, never double-charged. Total equals input * price. Explicit precision: the 2-digit default (±0.005) would accept a free or doubled charge.
+ const result = registry.calculateCost("no-cache-price", {
+ input: 1000,
+ input_cached_tokens: 400,
+ });
+
+ expect(result).not.toBeNull();
+ expect(result!.costDetails["input"]).toBeCloseTo(0.0018, 9); // (1000 - 400) * 0.000003
+ expect(result!.costDetails["input_cached_tokens"]).toBeCloseTo(0.0012, 9); // 400 * 0.000003
+ expect(result!.totalCost).toBeCloseTo(0.003, 9); // 1000 * 0.000003 — unchanged from no-cache behavior
});
it("should return null for unknown model", () => {
diff --git a/internal-packages/llm-model-catalog/src/registry.ts b/internal-packages/llm-model-catalog/src/registry.ts
index 80da40ba980..6a3d52814cb 100644
--- a/internal-packages/llm-model-catalog/src/registry.ts
+++ b/internal-packages/llm-model-catalog/src/registry.ts
@@ -147,7 +147,70 @@ export class ModelPricingRegistry {
const costDetails: Record = {};
let totalCost = 0;
+ // `input_tokens` (the "input" usage value) is the TOTAL prompt token count and is
+ // inclusive of cache-read and cache-creation tokens — providers report it that way and
+ // the AI SDK passes it through (verified: total_tokens == input + output, never the
+ // sum of the decomposed parts). Cache reads/writes are therefore a SUBSET of input, not
+ // additional to it. Charging the full input count at the input price AND charging a
+ // separate cache line double-counts those tokens, so the input price must apply only to
+ // the fresh (non-cached) remainder.
+ const priceByType = new Map(tier.prices.map((p) => [p.usageType, p.price]));
+ const resolvePrice = (aliases: string[]): number | undefined => {
+ for (const alias of aliases) {
+ const price = priceByType.get(alias);
+ if (price !== undefined) return price;
+ }
+ return undefined;
+ };
+
+ const inputPrice = resolvePrice(["input", "input_tokens"]) ?? 0;
+ const cacheReadTokens = usageDetails["input_cached_tokens"] ?? 0;
+ const cacheCreationTokens = usageDetails["cache_creation_input_tokens"] ?? 0;
+
+ // Providers price cache reads/writes under provider-specific keys, but our usage details
+ // normalize them to `input_cached_tokens` / `cache_creation_input_tokens`. Resolve the price
+ // from the known aliases (the first alias that has a price wins, so list order matters), falling
+ // back to the input price so cache tokens are never free and never dropped when a model lacks a cache price.
+ const cacheReadPrice =
+ resolvePrice(["input_cached_tokens", "input_cache_read", "cache_read_input_tokens"]) ??
+ inputPrice;
+ const cacheCreationPrice =
+ resolvePrice([
+ "cache_creation_input_tokens",
+ "input_cache_creation",
+ "input_cache_creation_5m",
+ "input_cache_creation_1h",
+ ]) ?? inputPrice;
+
+ const totalInputTokens = usageDetails["input"] ?? usageDetails["input_tokens"] ?? 0;
+ const freshInputTokens = Math.max(0, totalInputTokens - cacheReadTokens - cacheCreationTokens);
+
+ const addCost = (usageType: string, tokenCount: number, price: number) => {
+ if (tokenCount <= 0 || price <= 0) return;
+ const cost = tokenCount * price;
+ costDetails[usageType] = (costDetails[usageType] ?? 0) + cost;
+ totalCost += cost;
+ };
+
+ addCost("input", freshInputTokens, inputPrice);
+ addCost("input_cached_tokens", cacheReadTokens, cacheReadPrice);
+ addCost("cache_creation_input_tokens", cacheCreationTokens, cacheCreationPrice);
+
+ // Charge every remaining usage type generically. The input + cache types are handled
+ // above (and their alias keys skipped here) so they are never charged twice.
+ const handledUsageTypes = new Set([
+ "input",
+ "input_tokens",
+ "input_cached_tokens",
+ "input_cache_read",
+ "cache_read_input_tokens",
+ "cache_creation_input_tokens",
+ "input_cache_creation",
+ "input_cache_creation_5m",
+ "input_cache_creation_1h",
+ ]);
for (const priceEntry of tier.prices) {
+ if (handledUsageTypes.has(priceEntry.usageType)) continue;
const tokenCount = usageDetails[priceEntry.usageType] ?? 0;
if (tokenCount === 0) continue;
const cost = tokenCount * priceEntry.price;
diff --git a/packages/core/src/v3/schemas/api.ts b/packages/core/src/v3/schemas/api.ts
index a8a379f9307..155bb77b7a0 100644
--- a/packages/core/src/v3/schemas/api.ts
+++ b/packages/core/src/v3/schemas/api.ts
@@ -1993,6 +1993,8 @@ export const RetrieveSpanDetailResponseBody = z.object({
inputCost: z.number().optional(),
outputCost: z.number().optional(),
totalCost: z.number().optional(),
+ cachedCost: z.number().optional(),
+ cacheCreationCost: z.number().optional(),
tokensPerSecond: z.number().optional(),
msToFirstChunk: z.number().optional(),
durationMs: z.number(),