From f9df800b366bc6a01c2697404fdd88ac03416120 Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Sat, 13 Jun 2026 14:49:39 -0400 Subject: [PATCH 1/2] minimaxm3 H200/H100 MTP: start TP-only latency rows at conc 1 Drop the conc-start of the TP-only (latency) search-space rows from 4 to 1 for minimaxm3-fp8-h200-vllm-mtp and minimaxm3-fp8-h100-vllm-mtp, matching the conc-1 start used on the non-MTP day-zero recipes so the sweeps capture the single-request latency point. TEP/DEP rows keep their higher concurrency starts (128/256). Follow-up to #1739 (6e8ebf56). Co-Authored-By: Claude Fable 5 --- .github/configs/nvidia-master.yaml | 12 ++++++------ perf-changelog.yaml | 7 +++++++ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 6229cf455..2b2848714 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11934,16 +11934,16 @@ minimaxm3-fp8-h200-vllm-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 4, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp } - { tp: 4, ep: 4, conc-start: 128, conc-end: 256, spec-decoding: mtp } - - { tp: 8, conc-start: 4, conc-end: 128, spec-decoding: mtp } + - { tp: 8, conc-start: 1, conc-end: 128, spec-decoding: mtp } - { tp: 8, ep: 8, conc-start: 256, conc-end: 256, spec-decoding: mtp } - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - - { tp: 4, conc-start: 4, conc-end: 32, spec-decoding: mtp } - - { tp: 8, conc-start: 4, conc-end: 128, spec-decoding: mtp } + - { tp: 4, conc-start: 1, conc-end: 32, spec-decoding: mtp } + - { tp: 8, conc-start: 1, conc-end: 128, spec-decoding: mtp } - { tp: 8, ep: 8, conc-start: 256, conc-end: 256, spec-decoding: mtp } - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp } @@ -11966,10 +11966,10 @@ minimaxm3-fp8-h100-vllm-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 128, spec-decoding: mtp } + - { tp: 8, conc-start: 1, conc-end: 128, spec-decoding: mtp } - { tp: 8, ep: 8, conc-start: 256, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp } - { tp: 8, ep: 8, conc-start: 128, conc-end: 256, spec-decoding: mtp } diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 155da5483..55f34b749 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3717,3 +3717,10 @@ - "B300-parity layouts and concurrency ranges: TP8, TP8+EP8, TP4, TP4+EP4, TP2+EP2, and TP8+EP8 dp-attn (DEP) across 1k1k and 8k1k" - "launch_mi355x-amds.sh routes M3 weights to NFS /it-share/hf-hub-cache instead of node-local /var/lib NVMe" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1725 + +- config-keys: + - minimaxm3-fp8-h200-vllm-mtp + - minimaxm3-fp8-h100-vllm-mtp + description: + - "Start the TP-only latency rows of the MiniMax-M3 EAGLE3 MTP sweeps (H200, H100) at concurrency 1 instead of 4, matching the conc-1 start used on the non-MTP day-zero recipes — captures the single-request latency point. TEP/DEP rows keep their higher concurrency starts." + pr-link: TBD From c8615e80608aafccdcbdc9c463425de247a2f501 Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Sat, 13 Jun 2026 14:49:53 -0400 Subject: [PATCH 2/2] perf-changelog: fill in PR link for minimaxm3 H200/H100 MTP conc-1 Co-Authored-By: Claude Fable 5 --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 55f34b749..c6b069a68 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3723,4 +3723,4 @@ - minimaxm3-fp8-h100-vllm-mtp description: - "Start the TP-only latency rows of the MiniMax-M3 EAGLE3 MTP sweeps (H200, H100) at concurrency 1 instead of 4, matching the conc-1 start used on the non-MTP day-zero recipes — captures the single-request latency point. TEP/DEP rows keep their higher concurrency starts." - pr-link: TBD + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1743