diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 6229cf455..2b2848714 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11934,16 +11934,16 @@ minimaxm3-fp8-h200-vllm-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 4, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp } - { tp: 4, ep: 4, conc-start: 128, conc-end: 256, spec-decoding: mtp } - - { tp: 8, conc-start: 4, conc-end: 128, spec-decoding: mtp } + - { tp: 8, conc-start: 1, conc-end: 128, spec-decoding: mtp } - { tp: 8, ep: 8, conc-start: 256, conc-end: 256, spec-decoding: mtp } - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - - { tp: 4, conc-start: 4, conc-end: 32, spec-decoding: mtp } - - { tp: 8, conc-start: 4, conc-end: 128, spec-decoding: mtp } + - { tp: 4, conc-start: 1, conc-end: 32, spec-decoding: mtp } + - { tp: 8, conc-start: 1, conc-end: 128, spec-decoding: mtp } - { tp: 8, ep: 8, conc-start: 256, conc-end: 256, spec-decoding: mtp } - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp } @@ -11966,10 +11966,10 @@ minimaxm3-fp8-h100-vllm-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 128, spec-decoding: mtp } + - { tp: 8, conc-start: 1, conc-end: 128, spec-decoding: mtp } - { tp: 8, ep: 8, conc-start: 256, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp } - { tp: 8, ep: 8, conc-start: 128, conc-end: 256, spec-decoding: mtp } diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 155da5483..c6b069a68 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3717,3 +3717,10 @@ - "B300-parity layouts and concurrency ranges: TP8, TP8+EP8, TP4, TP4+EP4, TP2+EP2, and 
TP8+EP8 dp-attn (DEP) across 1k1k and 8k1k" - "launch_mi355x-amds.sh routes M3 weights to NFS /it-share/hf-hub-cache instead of node-local /var/lib NVMe" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1725 + +- config-keys: + - minimaxm3-fp8-h200-vllm-mtp + - minimaxm3-fp8-h100-vllm-mtp + description: + - "Start the TP-only latency rows of the MiniMax-M3 EAGLE3 MTP sweeps (H200, H100) at concurrency 1 instead of 4, matching the conc-1 start used on the non-MTP day-zero recipes, so that the sweeps capture the single-request latency point. TEP/DEP rows keep their higher concurrency starts." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1743