;
@@ -1875,6 +2129,9 @@ const ScatterGraph = React.memo(
chartDefinition.chartType,
xScaleConfig._isLog,
yScaleConfig.type,
+ optimalPointKeys,
+ getCssColor,
+ resolveColor,
],
);
@@ -2067,6 +2324,17 @@ const ScatterGraph = React.memo(
setHideNonOptimal(checked);
track('latency_hide_non_optimal_toggled', { enabled: checked });
},
+ // On agentic + non-e2e chart, "optimal" means "on the
+ // e2e-latency Pareto frontier" (not a per-axis Pareto on the
+ // current x metric). Explain that so users don't wonder why
+ // a point sitting above the line is still considered
+ // dominated.
+ ...(selectedSequence === Sequence.AgenticTraces && selectedXAxisMode !== 'e2e'
+ ? {
+ infoTooltip:
+ "On agentic, optimal = on the end-to-end latency Pareto frontier, so a config can't win this axis by tanking e2e. Off-frontier points may appear above the line.",
+ }
+ : {}),
},
{
id: 'scatter-hide-point-labels',
diff --git a/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx b/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx
index f9b1b3c8..73018483 100644
--- a/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx
@@ -194,9 +194,7 @@ export function UnofficialChartDisplay() {
`${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition
]
}{' '}
- {graph.chartDefinition[
- `${selectedYAxisMetric}_heading` as keyof typeof graph.chartDefinition
- ] || graph.chartDefinition.heading}
+ {graph.chartDefinition.heading}
{graph.model} • {selectedPrecisions.join(', ')} • {graph.sequence}
diff --git a/packages/app/src/components/inference/utils.test.ts b/packages/app/src/components/inference/utils.test.ts
index 8f8705e1..589ba580 100644
--- a/packages/app/src/components/inference/utils.test.ts
+++ b/packages/app/src/components/inference/utils.test.ts
@@ -157,12 +157,12 @@ describe('processOverlayChartData', () => {
});
it('remaps x to config override for input metrics on interactivity chart', () => {
- // inputTputPerGpu has x override to p99_ttft on interactivity chart
+ // inputTputPerGpu has x override to p90_ttft on interactivity chart
const data = [
pt({
x: 100,
inputTputPerGpu: { y: 5, roof: false },
- p99_ttft: 0.25,
+ p90_ttft: 0.25,
median_intvty: 50,
} as any),
];
@@ -176,16 +176,11 @@ describe('processOverlayChartData', () => {
pt({
x: 100,
inputTputPerGpu: { y: 5, roof: false },
- median_ttft: 0.1,
+ p90_ttft: 0.1,
median_intvty: 50,
} as any),
];
- const result = processOverlayChartData(
- data,
- 'interactivity',
- 'y_inputTputPerGpu',
- 'median_ttft',
- );
+ const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p90_ttft');
expect(result).toHaveLength(1);
expect(result[0].x).toBe(0.1);
});
@@ -195,76 +190,62 @@ describe('processOverlayChartData', () => {
pt({
x: 100,
inputTputPerGpu: { y: 5, roof: false },
- p99_ttft: 0.25,
+ p90_ttft: 0.25,
median_e2el: 2.5,
} as any),
];
const result = processOverlayChartData(data, 'e2e', 'y_inputTputPerGpu', null);
expect(result).toHaveLength(1);
- // e2e uses median_e2el as x (from chart config default), not p99_ttft
+ // e2e uses median_e2el as x (from chart config default), not p90_ttft
expect(result[0].x).toBe(2.5);
});
- it('remaps x to TTFT for e2e chart when selectedXAxisMetric is p99_ttft', () => {
- const data = [
- pt({
- x: 100,
- tpPerGpu: { y: 42, roof: false },
- p99_ttft: 0.35,
- median_e2el: 2.5,
- } as any),
- ];
- const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p99_ttft');
- expect(result).toHaveLength(1);
- expect(result[0].x).toBe(0.35);
- });
-
- it('remaps x to TTFT for e2e chart when selectedXAxisMetric is median_ttft', () => {
+ it('remaps x to TTFT for e2e chart when selectedXAxisMetric is p90_ttft', () => {
const data = [
pt({
x: 100,
tpPerGpu: { y: 42, roof: false },
- median_ttft: 0.12,
+ p90_ttft: 0.12,
median_e2el: 2.5,
} as any),
];
- const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'median_ttft');
+ const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p90_ttft');
expect(result).toHaveLength(1);
expect(result[0].x).toBe(0.12);
});
it('filters e2e TTFT outliers exceeding y_latency_limit', () => {
const data = [
- pt({ tpPerGpu: { y: 10, roof: false }, p99_ttft: 0.5, median_e2el: 1 } as any),
- pt({ tpPerGpu: { y: 5, roof: false }, p99_ttft: 999, median_e2el: 2 } as any),
+ pt({ tpPerGpu: { y: 10, roof: false }, p90_ttft: 0.5, median_e2el: 1 } as any),
+ pt({ tpPerGpu: { y: 5, roof: false }, p90_ttft: 999, median_e2el: 2 } as any),
];
- const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p99_ttft');
+ const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p90_ttft');
// y_latency_limit is 60 in the e2e chart config — the 999 outlier should be filtered
expect(result).toHaveLength(1);
expect(result[0].x).toBe(0.5);
});
it('does not filter interactivity points by latency limit when x-axis is default', () => {
- // Regression: selectedXAxisMetric defaults to 'p99_ttft' but the interactivity
+ // Regression: selectedXAxisMetric defaults to 'p90_ttft' but the interactivity
// chart's x-axis stays median_intvty for non-input metrics. The latency limit
// (60) must NOT apply to median_intvty values.
const data = [
pt({ tpPerGpu: { y: 42, roof: false }, median_intvty: 200 } as any),
pt({ tpPerGpu: { y: 10, roof: false }, median_intvty: 30 } as any),
];
- const result = processOverlayChartData(data, 'interactivity', 'y_tpPerGpu', 'p99_ttft');
+ const result = processOverlayChartData(data, 'interactivity', 'y_tpPerGpu', 'p90_ttft');
expect(result).toHaveLength(2);
});
it('applies latency limit on interactivity only when x-axis is actually overridden', () => {
- // When an input metric IS selected and x-axis overrides to p99_ttft,
+ // When an input metric IS selected and x-axis overrides to p90_ttft,
// the latency limit should apply.
const data = [
- pt({ inputTputPerGpu: { y: 5, roof: false }, p99_ttft: 0.5, median_intvty: 10 } as any),
- pt({ inputTputPerGpu: { y: 3, roof: false }, p99_ttft: 999, median_intvty: 20 } as any),
+ pt({ inputTputPerGpu: { y: 5, roof: false }, p90_ttft: 0.5, median_intvty: 10 } as any),
+ pt({ inputTputPerGpu: { y: 3, roof: false }, p90_ttft: 999, median_intvty: 20 } as any),
];
- const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p99_ttft');
- // x-axis is overridden to p99_ttft for input metric — latency limit SHOULD filter 999
+ const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p90_ttft');
+ // x-axis is overridden to p90_ttft for input metric — latency limit SHOULD filter 999
expect(result).toHaveLength(1);
expect(result[0].x).toBe(0.5);
});
diff --git a/packages/app/src/components/inference/utils.ts b/packages/app/src/components/inference/utils.ts
index 4b5335b6..4876c614 100644
--- a/packages/app/src/components/inference/utils.ts
+++ b/packages/app/src/components/inference/utils.ts
@@ -75,11 +75,13 @@ export function processOverlayChartData(
chartType: 'e2e' | 'interactivity',
selectedYAxisMetric: string,
selectedXAxisMetric: string | null,
+ options?: { isAgentic?: boolean },
): InferenceData[] {
const chartDef = (chartDefinitions as ChartDefinition[]).find((d) => d.chartType === chartType);
if (!chartDef) return [];
const metricKey = selectedYAxisMetric.replace('y_', '') as YAxisMetricKey;
+ const isAgentic = options?.isAgentic === true;
// Resolve x-axis field (must match useChartData logic)
const metricTitle =
@@ -87,9 +89,11 @@ export function processOverlayChartData(
const isInputMetric = metricTitle.toLowerCase().includes('input');
let xAxisField: string = chartDef.x;
// selectedXAxisMetric is already the effective metric for this chart type
- // (interactivity uses selectedXAxisMetric, e2e uses selectedE2eXAxisMetric)
+ // (interactivity uses selectedXAxisMetric, e2e uses selectedE2eXAxisMetric).
+ // Match any *_ttft metric — the x-axis-mode picker can now select any
+ // percentile (median/p75/p90/p99) depending on sequence kind.
const isTtftOverride =
- selectedXAxisMetric === 'p99_ttft' || selectedXAxisMetric === 'median_ttft';
+ typeof selectedXAxisMetric === 'string' && selectedXAxisMetric.endsWith('_ttft');
if (selectedXAxisMetric && chartDef.chartType === 'interactivity' && isInputMetric) {
xAxisField = selectedXAxisMetric;
@@ -109,7 +113,12 @@ export function processOverlayChartData(
})
.filter(
(d) =>
- xAxisField === chartDef.x || !chartDef.y_latency_limit || d.x <= chartDef.y_latency_limit,
+ // Skip the latency limit for the natural x-axis or for agentic
+ // (long TTFTs are normal there, not overload outliers).
+ xAxisField === chartDef.x ||
+ isAgentic ||
+ !chartDef.y_latency_limit ||
+ d.x <= chartDef.y_latency_limit,
);
return filterDataByCostLimit(processedData, chartDef, selectedYAxisMetric);
diff --git a/packages/app/src/components/inference/utils/paretoLabels.ts b/packages/app/src/components/inference/utils/paretoLabels.ts
index 6e57d499..8f4f9702 100644
--- a/packages/app/src/components/inference/utils/paretoLabels.ts
+++ b/packages/app/src/components/inference/utils/paretoLabels.ts
@@ -46,7 +46,7 @@ export const parseLabelComponents = (label: string): string[] => {
const parts = label.split('+');
return parts.map((p) => {
// Strip the leading "NxNNN" multiplier (e.g., "1x" or "3x")
- const match = p.match(/^\d+x(.+)$/u);
+ const match = p.match(/^\d+x(?<rest>.+)$/u);
return match ? match[1] : p;
});
};
diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts
index 4c56d217..ecf0b99d 100644
--- a/packages/app/src/components/inference/utils/tooltipUtils.ts
+++ b/packages/app/src/components/inference/utils/tooltipUtils.ts
@@ -19,6 +19,14 @@ export interface TooltipConfig {
isTracked?: boolean;
/** URL to the GitHub Actions workflow run */
runUrl?: string;
+ /**
+ * Whether this agentic point has a stored trace_replay blob. Controls
+ * visibility of the "View charts" button — the actual distributions are
+ * rendered on the detail page, not inline, so all the tooltip needs is a
+ * presence boolean (sourced from the bulk `/api/v1/trace-availability`
+ * call so we don't ship megabytes of profile JSONL just for this check).
+ */
+ hasTrace?: boolean;
}
export interface OverlayTooltipConfig extends TooltipConfig {
@@ -88,7 +96,76 @@ const runLinkHTML = (runUrl?: string) =>
const tooltipLine = (label: string, value: string | number) =>
`${label}: ${value}
`;
-const shortenSha = (image: string) => image.replaceAll(/(sha256:[a-f0-9]{7})[a-f0-9]+/giu, '$1…');
+const formatPct = (v: number | undefined): string | null =>
+ v === undefined || v === null || Number.isNaN(v) ? null : `${(v * 100).toFixed(1)}%`;
+
+/** Tooltip numeric values are capped at 3 decimal places (trailing zeros stripped). */
+const fmt = (v: number): string => {
+ if (!Number.isFinite(v)) return String(v);
+ const rounded = parseFloat(v.toFixed(3));
+ if (Math.abs(rounded) >= 10000) return new Intl.NumberFormat('en-US').format(rounded);
+ return String(rounded);
+};
+
+/**
+ * Agentic-only tooltip rows: offload mode, KV cache hit rates, request
+ * success, token totals. Returns an empty string for non-agentic rows.
+ */
+const generateAgenticHTML = (d: InferenceData): string => {
+ if (d.benchmark_type !== 'agentic_traces') return '';
+
+ const parts: string[] = [];
+ if (d.offload_mode) {
+ parts.push(tooltipLine('Offload Mode', d.offload_mode.toUpperCase()));
+ }
+
+ const gpuHit = formatPct(d.server_gpu_cache_hit_rate);
+ const cpuHit = formatPct(d.server_cpu_cache_hit_rate);
+ const theoHit = formatPct(d.theoretical_cache_hit_rate);
+ if (gpuHit) parts.push(tooltipLine('GPU Cache Hit Rate', gpuHit));
+ if (cpuHit) parts.push(tooltipLine('CPU Cache Hit Rate', cpuHit));
+ if (theoHit) parts.push(tooltipLine('Theoretical Cache Hit Rate', theoHit));
+
+ if (d.num_requests_total !== undefined && d.num_requests_successful !== undefined) {
+ const successPct =
+ d.num_requests_total > 0
+ ? ` (${((d.num_requests_successful / d.num_requests_total) * 100).toFixed(0)}%)`
+ : '';
+ parts.push(
+ tooltipLine(
+ 'Requests',
+ `${d.num_requests_successful} / ${d.num_requests_total}${successPct}`,
+ ),
+ );
+ }
+
+ if (d.total_prompt_tokens !== undefined) {
+ parts.push(tooltipLine('Prompt Tokens', formatNumber(d.total_prompt_tokens)));
+ }
+ if (d.total_generation_tokens !== undefined) {
+ parts.push(tooltipLine('Generated Tokens', formatNumber(d.total_generation_tokens)));
+ }
+
+ // Histograms + time-series live on the dedicated detail page now; the
+ // "View charts" button (rendered by the wrapper when pinned + has trace
+ // data) takes the user there.
+
+ return parts.join('');
+};
+
+/** "View charts" button — only visible when the tooltip is pinned and the
+ * point has stored trace data. Wired up by the ScatterGraph click handler. */
+const viewChartsButtonHTML = (isPinned: boolean, hasTraceData: boolean): string => {
+ if (!isPinned || !hasTraceData) return '';
+ return `View charts → `;
+};
+
+const shortenSha = (image: string) =>
+ image.replaceAll(/(?<prefix>sha256:[a-f0-9]{7})[a-f0-9]+/giu, '$1…');
const imageTooltipLine = (image: string) =>
`
@@ -138,7 +215,16 @@ const generateParallelismHTML = (d: InferenceData): string => {
* @returns HTML string for the tooltip content
*/
export const generateTooltipContent = (config: TooltipConfig): string => {
- const { data: d, isPinned, xLabel, yLabel, selectedYAxisMetric, hardwareConfig, runUrl } = config;
+ const {
+ data: d,
+ isPinned,
+ xLabel,
+ yLabel,
+ selectedYAxisMetric,
+ hardwareConfig,
+ runUrl,
+ hasTrace,
+ } = config;
return `
@@ -156,16 +242,16 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
: ''
}
- ${xLabel}: ${formatNumber(d.x)}
+ ${xLabel}: ${fmt(d.x)}
- ${yLabel}: ${formatNumber(d.y)}
+ ${yLabel}: ${fmt(d.y)}
${
selectedYAxisMetric === 'y_tpPerGpu' && d['inputTputPerGpu']
? `
- Input Token Throughput per GPU: ${formatNumber(d['inputTputPerGpu'].y)}
+ Input Token Throughput per GPU: ${fmt(d['inputTputPerGpu'].y)}
`
: ''
}
@@ -173,7 +259,7 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
selectedYAxisMetric === 'y_tpPerGpu' && d['outputTputPerGpu']
? `
- Output Token Throughput per GPU: ${formatNumber(d['outputTputPerGpu'].y)}
+ Output Token Throughput per GPU: ${fmt(d['outputTputPerGpu'].y)}
`
: ''
}
@@ -182,10 +268,12 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
Concurrency: ${d.conc}
-
+
Precision: ${d.precision.toUpperCase()}
+ ${generateAgenticHTML(d)}
${runLinkHTML(runUrl)}
+ ${viewChartsButtonHTML(isPinned, Boolean(hasTrace))}
${
isPinned
? `
- ${xLabel}: ${formatNumber(d.x)}
+ ${xLabel}: ${fmt(d.x)}
- ${yLabel}: ${formatNumber(d.y)}
+ ${yLabel}: ${fmt(d.y)}
${tooltipLine('Total GPUs', d.tp)}
${generateParallelismHTML(d)}
Concurrency: ${d.conc}
-
+
Precision: ${d.precision.toUpperCase()}
+ ${generateAgenticHTML(d)}
`;
};
@@ -271,16 +360,16 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string =>
: ''
}
- ${xLabel}: ${formatNumber(d.x)}
+ ${xLabel}: ${fmt(d.x)}
- ${yLabel}: ${formatNumber(d.y)}
+ ${yLabel}: ${fmt(d.y)}
${
selectedYAxisMetric === 'y_tpPerGpu' && d['inputTputPerGpu']
? `
- Input Token Throughput per GPU: ${formatNumber(d['inputTputPerGpu'].y)}
+ Input Token Throughput per GPU: ${fmt(d['inputTputPerGpu'].y)}
`
: ''
}
@@ -288,7 +377,7 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string =>
selectedYAxisMetric === 'y_tpPerGpu' && d['outputTputPerGpu']
? `
- Output Token Throughput per GPU: ${formatNumber(d['outputTputPerGpu'].y)}
+ Output Token Throughput per GPU: ${fmt(d['outputTputPerGpu'].y)}
`
: ''
}
@@ -297,9 +386,10 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string =>
Concurrency: ${d.conc}
-
+
Precision: ${d.precision.toUpperCase()}
+ ${generateAgenticHTML(d)}
${runLinkHTML(runUrl)}
`;
diff --git a/packages/app/src/components/json-ld.test.ts b/packages/app/src/components/json-ld.test.ts
index 34feaa4b..ee085700 100644
--- a/packages/app/src/components/json-ld.test.ts
+++ b/packages/app/src/components/json-ld.test.ts
@@ -9,7 +9,7 @@ function render(data: object): string {
}
function scriptBody(html: string): string {
- const match = html.match(/