diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 3609ddab8..84ee0f9ca 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2864,12 +2864,12 @@ minimaxm3-fp8-mi300x-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 128 } + - { tp: 8, conc-start: 1, conc-end: 128 } - { tp: 8, ep: 8, conc-start: 256, conc-end: 256 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 1, conc-end: 64 } - { tp: 8, ep: 8, conc-start: 128, conc-end: 256 } # EAGLE3 speculative-decoding (spec-decoding: mtp) variant of @@ -2918,16 +2918,16 @@ minimaxm3-fp8-mi325x-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 1, conc-end: 64 } - { tp: 4, ep: 4, conc-start: 128, conc-end: 256 } - - { tp: 8, conc-start: 4, conc-end: 128 } + - { tp: 8, conc-start: 1, conc-end: 128 } - { tp: 8, ep: 8, conc-start: 256, conc-end: 512 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 } - isl: 8192 osl: 1024 search-space: - - { tp: 4, conc-start: 4, conc-end: 32 } - - { tp: 8, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 1, conc-end: 32 } + - { tp: 8, conc-start: 1, conc-end: 128 } - { tp: 8, ep: 8, conc-start: 256, conc-end: 256 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 } diff --git a/perf-changelog.yaml b/perf-changelog.yaml index eda79758b..a9ec67931 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3800,3 +3800,10 @@ - "H200-style search space (TP4/TP8 latency, TP4+EP4/TP8+EP8 TEP, TP8+EP8 dp-attn DEP) trimmed at the extreme-concurrency end with TP-only latency rows started at conc 1" - "[AI generated draft test] The shipped ROCm image's AMD MiniMax-M3 model lacks SupportsEagle3, so the recipe patches it in-place at runtime (functionstackx/vllm#1, upstream vllm-project/vllm#45546; validated green on MI355X/MI300X) before serving; also adds SPEC_SUFFIX to launch_mi325x-amds.sh so spec-decoding=mtp routes to the _mtp script" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1759 + +- config-keys: + - minimaxm3-fp8-mi300x-vllm + - minimaxm3-fp8-mi325x-vllm + description: + - "Extend the MiniMax-M3 MXFP8 MI300X and MI325X non-MTP sweeps down to concurrency 1 on the TP-only latency rows (was conc 4), to capture the single-request latency point; TEP/DEP rows keep their higher concurrency starts" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1760