From 1b1b7c2212ffa3a846b8837ea9808c77b72094fc Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Sun, 14 Jun 2026 16:11:39 -0400 Subject: [PATCH 1/2] minimaxm3 MI300X/MI325X non-MTP: start TP-only latency rows at conc 1 Drop the conc-start of the TP-only (latency) search-space rows from 4 to 1 for minimaxm3-fp8-mi300x-vllm and minimaxm3-fp8-mi325x-vllm, capturing the single-request latency point. TEP/DEP rows keep their higher concurrency starts. Mirrors the H100/H200 conc-1 change (#1743). Co-Authored-By: Claude Fable 5 --- .github/configs/amd-master.yaml | 12 ++++++------ perf-changelog.yaml | 7 +++++++ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 3609ddab8..84ee0f9ca 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2864,12 +2864,12 @@ minimaxm3-fp8-mi300x-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 128 } + - { tp: 8, conc-start: 1, conc-end: 128 } - { tp: 8, ep: 8, conc-start: 256, conc-end: 256 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 1, conc-end: 64 } - { tp: 8, ep: 8, conc-start: 128, conc-end: 256 } # EAGLE3 speculative-decoding (spec-decoding: mtp) variant of @@ -2918,16 +2918,16 @@ minimaxm3-fp8-mi325x-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 1, conc-end: 64 } - { tp: 4, ep: 4, conc-start: 128, conc-end: 256 } - - { tp: 8, conc-start: 4, conc-end: 128 } + - { tp: 8, conc-start: 1, conc-end: 128 } - { tp: 8, ep: 8, conc-start: 256, conc-end: 512 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 } - isl: 8192 osl: 1024 search-space: - - { tp: 4, conc-start: 4, conc-end: 32 } - - { tp: 8, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 1, conc-end: 32 } + - { tp: 8, conc-start: 1, conc-end: 128 } - { tp: 8, ep: 8, conc-start: 256, conc-end: 256 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 } diff --git a/perf-changelog.yaml b/perf-changelog.yaml index eda79758b..15348a8e5 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3800,3 +3800,10 @@ - "H200-style search space (TP4/TP8 latency, TP4+EP4/TP8+EP8 TEP, TP8+EP8 dp-attn DEP) trimmed at the extreme-concurrency end with TP-only latency rows started at conc 1" - "[AI generated draft test] The shipped ROCm image's AMD MiniMax-M3 model lacks SupportsEagle3, so the recipe patches it in-place at runtime (functionstackx/vllm#1, upstream vllm-project/vllm#45546; validated green on MI355X/MI300X) before serving; also adds SPEC_SUFFIX to launch_mi325x-amds.sh so spec-decoding=mtp routes to the _mtp script" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1759 + +- config-keys: + - minimaxm3-fp8-mi300x-vllm + - minimaxm3-fp8-mi325x-vllm + description: + - "Extend the MiniMax-M3 MXFP8 MI300X and MI325X non-MTP sweeps down to concurrency 1 on the TP-only latency rows (was conc 4), to capture the single-request latency point; TEP/DEP rows keep their higher concurrency starts" + pr-link: TBD From 0c221e5f6c298e81f46d3576c54f699e183f0789 Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Sun, 14 Jun 2026 16:11:50 -0400 Subject: [PATCH 2/2] perf-changelog: fill in PR link for mi300x/mi325x non-MTP conc-1 Co-Authored-By: Claude Fable 5 --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 15348a8e5..a9ec67931 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3806,4 +3806,4 @@ - minimaxm3-fp8-mi325x-vllm description: - "Extend the MiniMax-M3 MXFP8 MI300X and MI325X non-MTP sweeps down to concurrency 1 on the TP-only latency rows (was conc 4), to capture the single-request latency point; TEP/DEP rows keep their higher concurrency starts" - pr-link: TBD + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1760