Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
965b046
feat: MiniMax-M3 MXFP8 full sweep config for GB300
Oseltamivir Jun 13, 2026
e3fa89f
chore: update perf-changelog pr-link to #1735
Oseltamivir Jun 13, 2026
b915c89
Merge branch 'main' into feat/minimax-m3-gb300-sweep
Oseltamivir Jun 13, 2026
afc3f92
Update runner name in nvidia-master.yaml
Oseltamivir Jun 13, 2026
99a075b
fix: add sbatch_directives mem=0 + cpus-per-task=72 to M3 GB300 recipes
Oseltamivir Jun 13, 2026
26e2005
fix: run M3 GB300 workers cache-only (HF_HUB_OFFLINE=1) to avoid fetc…
Oseltamivir Jun 13, 2026
ce76bd7
Merge branch 'main' into feat/minimax-m3-gb300-sweep
Oseltamivir Jun 13, 2026
b660ddd
fix: re-pin utils/aiperf to live cjq/agentx-v0.3 tip (ff2b646c)
Oseltamivir Jun 13, 2026
7ea8b0b
Merge branch 'main' into feat/minimax-m3-gb300-sweep
Oseltamivir Jun 13, 2026
ef7c650
MiniMax-M3 GB300: disagg-only sweep + multi-node-NVLink KV transfer
Oseltamivir Jun 13, 2026
c94bf9f
Merge branch 'main' into feat/minimax-m3-gb300-sweep
Oseltamivir Jun 13, 2026
7fd8904
M3 GB300: add 8k1k disagg sweep; drop unschedulable collocated-1n
Oseltamivir Jun 13, 2026
5df0669
M3 GB300: add rack-saturating balanced-ratio TP-ep1 max-throughput di…
Oseltamivir Jun 14, 2026
88e99ce
Merge remote-tracking branch 'origin/main' into feat/minimax-m3-gb300…
Oseltamivir Jun 14, 2026
62fe18d
M3 GB300: replace dep16dec with 1P4D TP4-ep1; add prefill-heavy 10P7D…
Oseltamivir Jun 14, 2026
f4c6384
Merge branch 'main' into feat/minimax-m3-gb300-sweep
Oseltamivir Jun 14, 2026
1d71f49
[Klaud Cold]minimaxm3-fp8-mi300x-vllm-mtp: day-zero MiniMax-M3 EAGLE3…
functionstackx Jun 14, 2026
2bf5851
[AMD] perf: enable MiniMax M3 CUDA graphs on MI300X (#1750)
cquil11 Jun 14, 2026
fd922a6
[Klaud Cold] minimaxm3-fp8-mi300x-vllm-mtp: run with CUDA graphs (dro…
functionstackx Jun 14, 2026
805dc1c
M3 GB300: drop dominated configs, restore 1P1D full range
Oseltamivir Jun 14, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2871,3 +2871,33 @@ minimaxm3-fp8-mi300x-vllm:
search-space:
- { tp: 8, conc-start: 4, conc-end: 64 }
- { tp: 8, ep: 8, conc-start: 128, conc-end: 256 }

# EAGLE3 speculative-decoding (spec-decoding: mtp) variant of
# minimaxm3-fp8-mi300x-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the
# Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens). Same TP8-only
# search space as the non-MTP MI300X entry (gfx942 192 GB is memory-tight, like
# H100), with the TP8 latency rows started at conc 1 to capture single-request
# latency — matching the H100/MI355X MTP recipes. The shipped ROCm image lacks
# SupportsEagle3 on the AMD MiniMax-M3 model, so the recipe applies that fix
# in-place at runtime (functionstackx/vllm#1, upstream vllm-project/vllm#45546;
# validated green on MI355X) before serving.
minimaxm3-fp8-mi300x-vllm-mtp:
image: vllm/vllm-openai-rocm:minimax-m3
model: MiniMaxAI/MiniMax-M3-MXFP8
model-prefix: minimaxm3
runner: mi300x
precision: fp8
framework: vllm
multinode: false
scenarios:
fixed-seq-len:
# 1k1k: the TP8 row covers conc 1 through 128; the TP8+EP8 row is a single
# point at conc 256 (conc-start == conc-end). Every row carries
# spec-decoding: mtp, the label this entry uses for the EAGLE3 draft head.
- isl: 1024
osl: 1024
search-space:
- { tp: 8, conc-start: 1, conc-end: 128, spec-decoding: mtp }
- { tp: 8, ep: 8, conc-start: 256, conc-end: 256, spec-decoding: mtp }
# 8k1k: the TP8 row covers conc 1 through 64; the TP8+EP8 row covers
# conc 128 through 256.
- isl: 8192
osl: 1024
search-space:
- { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 8, ep: 8, conc-start: 128, conc-end: 256, spec-decoding: mtp }
95 changes: 95 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11773,6 +11773,101 @@ minimaxm2.5-fp8-gb300-dynamo-vllm:
ep: 4
dp-attn: true

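# MiniMax-M3 GB300 disagg-only sweep — safetensors-load-strategy removed from
# all GB300 recipes (host-memory OOM with prefetch on CW Grace Blackwell nodes).
# Recipes need sbatch-level mem=0 in addition to srun_options mem=0: without
# it the job allocation falls back to DefMemPerCPU=4096 and worker cgroups
# OOM-kill; srun_options mem=0 alone only grants a step what the job holds.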
minimaxm3-fp8-gb300-dynamo-vllm:
image: vllm/vllm-openai:minimax-m3
model: MiniMaxAI/MiniMax-M3-MXFP8
model-prefix: minimaxm3
runner: gb300-cw
precision: fp8
framework: dynamo-vllm
multinode: true
disagg: true
scenarios:
fixed-seq-len:
- isl: 1024
osl: 1024
search-space:
# Disagg-only. Every recipe enables NixlConnector KV transfer over
# multi-node NVLink (UCX_CUDA_IPC_ENABLE_MNNVL=y + --enable-cumem-allocator),
# which moved the cross-node prefill->decode KV handoff off the RDMA/TCP
# fallback (~268 MB/s) onto the NVL fabric (~1.4-1.7 GB/s): +17/+23/+49%
# out tok/s/gpu @ conc 64/128/256 (1P1D). GB300-only win — B300's 8-GPU
# IB islands cannot move KV over multi-node NVLink. GB300-cw is 4 GPU/node,
# so prefill and decode each take whole 4-GPU nodes (no 8-GPU collocation).
# 1P+1D TP4+EP4 split, 2 nodes (8 GPU). Best per-GPU efficiency:
# peaks at ~1646 out/s/gpu @ conc 256 (1k1k). Covers latency-to-mid.
# NOTE(review): multinode runs reportedly sweep the recipe's
# benchmark.concurrencies (via srtctl) rather than this conc-list — keep the
# CONFIG_FILE recipe's list identical to this one; confirm.
- conc-list: [8, 16, 32, 64, 128, 256, 512]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: false
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml"
Comment thread
cursor[bot] marked this conversation as resolved.
decode:
num-worker: 1
tp: 4
ep: 4
dp-attn: false

# Rack-saturating max throughput: 5 prefill + 12 decode, TP4 ep1 (17 nodes).
# Balanced prefill:decode ratio (single prefill starved the decode pool)
# + TP-only decode (Qwen3.5-A17B pattern; M3 wide-EP all-to-all was slower).
# (5 + 12) workers x TP4 = 68 GPUs.
- conc-list: [2048, 4096, 8192]
prefill:
num-worker: 5
tp: 4
ep: 1
dp-attn: false
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml"
decode:
num-worker: 12
tp: 4
ep: 1
dp-attn: false

- isl: 8192
osl: 1024
search-space:
# 8k1k long context: prefill is heavier and KV is larger, so concurrency
# is lower than 1k1k. Same disagg shapes + multi-node-NVLink KV transfer.
# 1P+1D TP4+EP4 split, 2 nodes (8 GPU). Best per-GPU efficiency:
# peaks at ~1209 out/s/gpu @ conc 256 (8k1k). Covers latency-to-mid.
# NOTE(review): as for 1k1k, the 8k1k recipe's benchmark.concurrencies
# should list the same points as this conc-list — confirm.
- conc-list: [8, 16, 32, 64, 128, 256]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: false
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml"
decode:
num-worker: 1
tp: 4
ep: 4
dp-attn: false

# Prefill-heavy rack-saturating: 10P + 7D, TP4 ep1 (17 nodes).
# At 8k context, prefill is 8x heavier — 5P:12D showed 616-req prefill
# backlog. DSR1/DSV4 GB300 patterns use 6-10 prefill workers for 8k1k.
- conc-list: [1024, 2048, 4096]
prefill:
num-worker: 10
tp: 4
ep: 1
dp-attn: false
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-10p7d-tp4ep1-17n.yaml"
decode:
num-worker: 7
tp: 4
ep: 1
dp-attn: false

# MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
# 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
# (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor
Expand Down
5 changes: 5 additions & 0 deletions .github/workflows/benchmark-multinode-tmpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,11 @@ on:

env:
RANDOM_RANGE_RATIO: 0.8
# Day-zero models resolved via hf: ids download from the Hub inside the
# slurm job (srtctl pre-download + dynamo hub fetch). Anonymous requests
# get 429-rate-limited when several workers pull a 444 GB snapshot at
# once; sbatch/srun inherit this env so the token reaches the workers.
HF_TOKEN: ${{ secrets.INFERENCEX_OFFICIAL_RO_HF_TOKEN }}
EXP_NAME: ${{ inputs.exp-name }}
IMAGE: ${{ inputs.image }}
MODEL_PREFIX: ${{ inputs.model-prefix }}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
name: "minimax-m3-vllm-disagg-gb300-1p1d-tp4ep4-2n-1k1k"

# MiniMax-M3 disaggregated 1P+1D recipe for GB300, 2 nodes.
# 1 node prefill (4 GPUs, TP4+EP4) → NixlConnector → 1 node decode (4 GPUs, TP4+EP4).
# --block-size 128 is mandatory (MSA sparse/index cache alignment).
# safetensors-load-strategy omitted — prefetch OOMs on CW GB300 host memory.

# Model snapshot, container image and precision used by every worker.
model:
path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
container: "vllm/vllm-openai:minimax-m3"
precision: "fp8"

# Install the pinned dynamo dev wheel.
dynamo:
install: true
wheel: "1.2.0.dev20260526"

# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the
# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups
# OOM-kill during engine init; srun_options.mem=0 alone only grants a
# step what the job already holds. cpus-per-task=72 (one NUMA socket)
# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default.
# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml.
sbatch_directives:
mem: "0"
cpus-per-task: "72"
srun_options:
mem: "0"

slurm:
time_limit: "8:00:00"

# 720 attempts x 10 s interval = up to 2 h of health checking.
health_check:
max_attempts: 720
interval_seconds: 10

# __M3_HF_HOME__ is a placeholder (presumably substituted by the launcher —
# confirm); the same path is used on both sides of the mount and as HF_HOME.
extra_mount:
- "__M3_HF_HOME__:__M3_HF_HOME__"

# One 4-GPU prefill worker on one node plus one 4-GPU decode worker on a
# second node: 2 nodes, 8 GPUs in total.
resources:
gpu_type: "gb300"
gpus_per_node: 4
prefill_nodes: 1
decode_nodes: 1
prefill_workers: 1
decode_workers: 1
gpus_per_prefill: 4
gpus_per_decode: 4

frontend:
type: dynamo
enable_multiple_frontends: false

# NOTE(review): connector is null while the KV connector is configured
# explicitly through kv-transfer-config (NixlConnector) in vllm_config —
# presumably intentional; confirm.
backend:
type: vllm
connector: null

prefill_environment:
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
HF_HOME: "__M3_HF_HOME__"
# Cache-only at runtime: the launcher pre-stages the full snapshot and
# verifies it offline before submit, so workers must NOT re-fetch. Without
# this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache;
# co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
# loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
HF_HUB_OFFLINE: "1"
# Multi-node-NVLink KV transfer: keep cross-node prefill->decode KV on the
# NVL fabric (5-6x vs RDMA/TCP fallback, +17-49% tok/s/gpu). Needs VMM-
# registered KV (enable-cumem-allocator below).
UCX_CUDA_IPC_ENABLE_MNNVL: "y"

decode_environment:
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
HF_HOME: "__M3_HF_HOME__"
# Cache-only at runtime: the launcher pre-stages the full snapshot and
# verifies it offline before submit, so workers must NOT re-fetch. Without
# this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache;
# co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
# loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
HF_HUB_OFFLINE: "1"
# Multi-node-NVLink KV transfer: keep cross-node prefill->decode KV on the
# NVL fabric (5-6x vs RDMA/TCP fallback, +17-49% tok/s/gpu). Needs VMM-
# registered KV (enable-cumem-allocator below).
UCX_CUDA_IPC_ENABLE_MNNVL: "y"

vllm_config:
# Prefill worker: TP4 with expert parallelism, eager mode (enforce-eager),
# up to 16 sequences and a 16384-token batch budget.
prefill:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
tensor-parallel-size: 4
pipeline-parallel-size: 1
enable-expert-parallel: true
enable-cumem-allocator: true
enforce-eager: true
# 2304 = 1024 (isl) + 1024 (osl) + 256 tokens of headroom.
max-model-len: 2304
max-num-seqs: 16
max-num-batched-tokens: 16384
# block-size 128 is mandatory (MSA sparse/index cache alignment).
block-size: 128
language-model-only: true
gpu-memory-utilization: 0.9
trust-remote-code: true
no-enable-prefix-caching: true
stream-interval: 32

# Decode worker: same parallel layout, no enforce-eager; the batched-token
# budget equals max-num-seqs (256).
# NOTE(review): the benchmark section runs up to concurrency 512 against this
# single decode worker capped at 256 sequences — confirm the excess is
# meant to queue.
decode:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
tensor-parallel-size: 4
pipeline-parallel-size: 1
enable-expert-parallel: true
enable-cumem-allocator: true
max-model-len: 2304
max-num-seqs: 256
max-num-batched-tokens: 256
max-cudagraph-capture-size: 512
block-size: 128
language-model-only: true
gpu-memory-utilization: 0.9
trust-remote-code: true
no-enable-prefix-caching: true
stream-interval: 32

# sa-bench sweep for the 1k1k 1P+1D shape. The concurrency points mirror the
# conc-list declared for this recipe in .github/configs/nvidia-master.yaml
# ([8, 16, 32, 64, 128, 256, 512]): multinode jobs reportedly sweep this
# recipe list (via srtctl) rather than the matrix CONC_LIST, so any point
# missing here never runs. Keep both lists identical.
benchmark:
type: "sa-bench"
isl: 1024
osl: 1024
concurrencies: "8x16x32x64x128x256x512"

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Recipe omits low concurrencies

Medium Severity

The 1k1k 1P+1D sweep declares conc-list values 8, 16, and 32 in nvidia-master.yaml, but the recipe’s benchmark.concurrencies only runs 64 through 512. Multinode jobs use the recipe list via srtctl, not matrix CONC_LIST, so those low-concurrency points never execute.

Additional Locations (1)
Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 88e99ce. Configure here.

Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
# Recipe name includes the node count (17n), matching this recipe's file name
# (disagg-gb300-5p12d-tp4ep1-17n.yaml) and the -2n- token in the 1P1D recipe name.
name: "minimax-m3-vllm-disagg-gb300-5p12d-tp4ep1-17n-1k1k"

# MiniMax-M3 GB300 max-throughput disagg, rack-saturating (17 nodes / 68 GPU).
# Lessons baked in (validated on gb300-cw): (1) TP-only decode (ep1, NO expert
# parallelism) per Qwen3.5-397B-A17B recipes — M3's MoE all-to-all overhead
# made wide-EP/DP-attention slower; (2) PREFILL is the bottleneck, not decode
# or KV transfer — a single prefill starved 64 decode GPUs (3967 reqs queued),
# so this balances 5 prefill : 12 decode TP4 workers (prefill backlog -> 0,
# +57% tok/s/gpu vs the 1-prefill ratio at conc 2048). NixlConnector KV
# transfer stays on multi-node NVLink (MNNVL + cumem). --block-size 128
# mandatory (MSA cache); text-only -> language-model-only.

# Model snapshot, container image and precision used by every worker.
model:
path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
container: "vllm/vllm-openai:minimax-m3"
precision: "fp8"
# Install the pinned dynamo dev wheel.
dynamo:
install: true
wheel: "1.2.0.dev20260526"
# sbatch-level mem=0 is needed on CW gb300: without it the job allocation is
# ntasks x DefMemPerCPU = 4 GB/node and worker cgroups OOM-kill during engine
# init; srun_options mem=0 alone only grants a step what the job already
# holds. cpus-per-task=72 (one NUMA socket) keeps the gpu-less infra step
# (etcd/nats) off the 1-CPU default.
sbatch_directives:
mem: "0"
cpus-per-task: "72"
srun_options:
mem: "0"
slurm:
time_limit: "8:00:00"
# 720 attempts x 10 s interval = up to 2 h of health checking.
health_check:
max_attempts: 720
interval_seconds: 10
# __M3_HF_HOME__ is a placeholder (presumably substituted by the launcher —
# confirm); the same path is used on both sides of the mount and as HF_HOME.
extra_mount:
- "__M3_HF_HOME__:__M3_HF_HOME__"

# 5 prefill + 12 decode workers, one 4-GPU TP4 worker per node:
# 17 nodes / 68 GPUs.
resources:
gpu_type: "gb300"
gpus_per_node: 4
prefill_nodes: 5
decode_nodes: 12
prefill_workers: 5
decode_workers: 12
gpus_per_prefill: 4
gpus_per_decode: 4

frontend:
type: dynamo
enable_multiple_frontends: false

# NOTE(review): connector is null while the KV connector is configured
# explicitly through kv-transfer-config (NixlConnector) in vllm_config —
# presumably intentional; confirm.
backend:
type: vllm
connector: null

prefill_environment:
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
HF_HOME: "__M3_HF_HOME__"
# Cache-only at runtime: the snapshot is pre-staged, so workers must not
# re-fetch it (co-fetching workers collide on per-blob locks in the shared
# cache and the loser dies with "Lock acquisition failed").
HF_HUB_OFFLINE: "1"
# Keep cross-node prefill->decode KV transfer on multi-node NVLink; needs
# VMM-registered KV (enable-cumem-allocator in vllm_config).
UCX_CUDA_IPC_ENABLE_MNNVL: "y"

decode_environment:
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
HF_HOME: "__M3_HF_HOME__"
# Cache-only at runtime, same reason as for the prefill workers.
HF_HUB_OFFLINE: "1"
# Keep cross-node prefill->decode KV transfer on multi-node NVLink; needs
# VMM-registered KV (enable-cumem-allocator in vllm_config).
UCX_CUDA_IPC_ENABLE_MNNVL: "y"

vllm_config:
# Prefill worker: TP4 only — enable-expert-parallel is deliberately absent
# (ep1, no expert parallelism). Runs eager with up to 16 sequences and a
# 16384-token batch budget.
prefill:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
tensor-parallel-size: 4
pipeline-parallel-size: 1
enable-cumem-allocator: true
enforce-eager: true
max-num-seqs: 16
max-num-batched-tokens: 16384
# 2304 = 1024 (isl) + 1024 (osl) + 256 tokens of headroom.
max-model-len: 2304
# block-size 128 is mandatory (MSA cache).
block-size: 128
# Text-only benchmark, hence language-model-only.
language-model-only: true
gpu-memory-utilization: 0.9
trust-remote-code: true
no-enable-prefix-caching: true
stream-interval: 32

# Decode worker: TP4 only (ep1), no enforce-eager; the batched-token budget
# equals max-num-seqs (256).
decode:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
tensor-parallel-size: 4
pipeline-parallel-size: 1
enable-cumem-allocator: true
max-num-seqs: 256
max-num-batched-tokens: 256
max-cudagraph-capture-size: 512
max-model-len: 2304
block-size: 128
language-model-only: true
gpu-memory-utilization: 0.9
trust-remote-code: true
no-enable-prefix-caching: true
stream-interval: 32

# sa-bench sweep; these points match the 5P12D conc-list in
# .github/configs/nvidia-master.yaml ([2048, 4096, 8192]).
benchmark:
type: "sa-bench"
isl: 1024
osl: 1024
concurrencies: "2048x4096x8192"
Loading
Loading