Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
191 changes: 191 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11773,6 +11773,197 @@ minimaxm2.5-fp8-gb300-dynamo-vllm:
ep: 4
dp-attn: true

# MiniMax-M3 (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
# 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
# (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX
# tensor cores on Blackwell. Image is the multi-arch m3_release vLLM build
# (vllm/vllm-openai:minimax-m3, vllm-project/vllm#45381); recipes set
# dynamo.install=true + wheel 1.2.0.dev20260526 so the dynamo runtime AND
# NIXL are layered in at job start (the ai-dynamo vllm-runtime dev image
# shipped without NIXL, so disagg workers crashed at NixlConnector init).
# block-size 128 mandatory (MSA index-cache alignment); FLASHINFER
# (trtllm-gen) attention to exploit Blackwell — needs vllm#45381 @ 022448dd
# (m3_release HEAD: gates page>=128 on trtllm-gen GQA), so rebuild the image
# from m3_release before running. Fully disaggregated, rack-scale wide-EP
# GB200 sweep (NixlConnector P/D split over the NVL72 NVLink fabric). Mirrors
# the deepseek-v4 "megamoe" ladder: DEP unit = DP-attn + expert-parallel
# (DEP8 = 8 GPU / 2 nodes, DEP16 = 16 GPU / 4 nodes), with prefill workers
# scaled 1P->4P. EP8/EP16 vs B200's 8-GPU NVLink island is the GB200 edge.
# 1P1D TP4 (low conc), 1P1D DEP8 (mid), 1P1D DEP8->DEP16 (wide decode),
# 2P1D / 4P1D DEP8->DEP16 (prefill-scaled max throughput). M3 = 128 experts.
minimaxm3-fp8-gb200-dynamo-vllm:
  image: vllm/vllm-openai:minimax-m3
  model: MiniMaxAI/MiniMax-M3-MXFP8
  model-prefix: minimaxm3
  runner: gb200
  precision: fp8
  framework: dynamo-vllm
  multinode: true
  disagg: true
  # Two fixed-sequence-length scenarios (1k/1k and 8k/1k). Each one walks the
  # same five-rung topology ladder, ordered by increasing concurrency; every
  # rung passes its recipe path through CONFIG_FILE in additional-settings.
  scenarios:
    fixed-seq-len:
      - isl: 1024
        osl: 1024
        search-space:
          # Low latency: 1P+1D TP4 (pure TP, no EP), 2 nodes (4 GPU each).
          # Wide EP would idle DP ranks at small concurrencies, so stay
          # narrow here.
          - conc-list: [1, 2, 4, 8, 16, 32, 64]
            prefill:
              num-worker: 1
              tp: 4
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml"
            decode:
              num-worker: 1
              tp: 4
              ep: 1
              dp-attn: false

          # Mid curve: 1P+1D DEP8 (DP-attn + EP8), 2 nodes prefill + 2 nodes
          # decode = 4 nodes. First rung of rack-scale EP (16 experts/rank).
          - conc-list: [128, 256, 512]
            prefill:
              num-worker: 1
              tp: 8
              ep: 8
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml"
            decode:
              num-worker: 1
              tp: 8
              ep: 8
              dp-attn: true

          # Wide decode: 1P+1D DEP8 prefill -> DEP16 decode (EP16 across
          # 16 GPU / 4 nodes) = 6 nodes. Rack-scale decode throughput over
          # NVL72 NVLink.
          - conc-list: [512, 1024, 2048]
            prefill:
              num-worker: 1
              tp: 8
              ep: 8
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml"
            decode:
              num-worker: 1
              tp: 16
              ep: 16
              dp-attn: true

          # Prefill-scaled: 2P+1D, 2x DEP8 prefill (4 nodes) -> DEP16 decode
          # (4 nodes) = 8 nodes.
          - conc-list: [2048, 4096]
            prefill:
              num-worker: 2
              tp: 8
              ep: 8
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml"
            decode:
              num-worker: 1
              tp: 16
              ep: 16
              dp-attn: true

          # Max throughput: 4P+1D, 4x DEP8 prefill (8 nodes) -> DEP16 decode
          # (4 nodes) = 12 nodes within one NVL72 rack.
          - conc-list: [4096, 8192]
            prefill:
              num-worker: 4
              tp: 8
              ep: 8
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml"
            decode:
              num-worker: 1
              tp: 16
              ep: 16
              dp-attn: true

      - isl: 8192
        osl: 1024
        search-space:
          # Low latency 8k1k: 1P+1D TP4, 2 nodes. stream-interval 1 +
          # cudagraph cap 128 for best interactivity at small concurrencies.
          - conc-list: [1, 2, 4, 8, 16]
            prefill:
              num-worker: 1
              tp: 4
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml"
            decode:
              num-worker: 1
              tp: 4
              ep: 1
              dp-attn: false

          # Mid curve 8k1k: 1P+1D DEP8, 4 nodes.
          - conc-list: [32, 64, 128]
            prefill:
              num-worker: 1
              tp: 8
              ep: 8
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml"
            decode:
              num-worker: 1
              tp: 8
              ep: 8
              dp-attn: true

          # Wide decode 8k1k: DEP8 prefill -> DEP16 decode, 6 nodes.
          - conc-list: [128, 256, 512]
            prefill:
              num-worker: 1
              tp: 8
              ep: 8
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml"
            decode:
              num-worker: 1
              tp: 16
              ep: 16
              dp-attn: true

          # Prefill-scaled 8k1k: 2P+1D, 8 nodes. Double prefill absorbs
          # 8k ISL.
          - conc-list: [512, 1024]
            prefill:
              num-worker: 2
              tp: 8
              ep: 8
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml"
            decode:
              num-worker: 1
              tp: 16
              ep: 16
              dp-attn: true

          # Max throughput 8k1k: 4P+1D, 12 nodes.
          - conc-list: [1024, 2048]
            prefill:
              num-worker: 4
              tp: 8
              ep: 8
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml"
            decode:
              num-worker: 1
              tp: 16
              ep: 16
              dp-attn: true

# MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
# 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
# (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor
Expand Down
5 changes: 5 additions & 0 deletions .github/workflows/benchmark-multinode-tmpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,11 @@ on:

env:
RANDOM_RANGE_RATIO: 0.8
# Day-zero models resolved via hf: ids are downloaded from the Hub inside
# the slurm job (srtctl pre-download + dynamo hub fetch). Anonymous requests
# get rate-limited (HTTP 429) when several workers pull a 444 GB snapshot at
# once; sbatch/srun inherit this env so the token reaches the workers.
HF_TOKEN: ${{ secrets.INFERENCEX_OFFICIAL_RO_HF_TOKEN }}
EXP_NAME: ${{ inputs.exp-name }}
IMAGE: ${{ inputs.image }}
MODEL_PREFIX: ${{ inputs.model-prefix }}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-1k1k"

# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (mid curve, wide EP).
# Prefill DEP8 (DP-attn + EP across 8 GPU / 2 nodes) -> NixlConnector ->
# Decode DEP8 (8 GPU / 2 nodes) = 4 nodes. Rack-scale expert parallel
# over the NVL72 NVLink fabric -- the regime where GB200 pulls ahead of
# B200 (capped at an 8-GPU NVLink island). M3 has 128 routed experts so
# EP8 shards 16 experts/rank. FLASHINFER attention, block-size 128.

model:
  path: "minimax-m3-mxfp8"
  container: "vllm/vllm-openai:minimax-m3"
  precision: "fp8"

# Install the dynamo runtime from the pinned dev wheel.
dynamo:
  install: true
  wheel: "1.2.0.dev20260526"

slurm:
  # Quoted so the colon-separated value stays a string.
  time_limit: "8:00:00"

# 720 attempts x 10 s interval = up to 2 h of readiness polling.
health_check:
  max_attempts: 720
  interval_seconds: 10

# 2 prefill nodes + 2 decode nodes at 4 GPUs per node; each side runs a
# single worker that spans all 8 of its GPUs.
resources:
  gpu_type: "gb200"
  gpus_per_node: 4
  prefill_nodes: 2
  decode_nodes: 2
  prefill_workers: 1
  decode_workers: 1
  gpus_per_prefill: 8
  gpus_per_decode: 8

frontend:
  type: dynamo
  enable_multiple_frontends: false

backend:
  type: vllm
  # No connector is selected at this level; the KV connector is set
  # explicitly through kv-transfer-config in vllm_config below.
  connector: null

  prefill_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
    UCX_MEMTYPE_CACHE: "n"
    UCX_MEMTYPE_REG_WHOLE: "n"
    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
    NCCL_CUMEM_ENABLE: "1"

  decode_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
    UCX_MEMTYPE_CACHE: "n"
    UCX_MEMTYPE_REG_WHOLE: "n"
    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
    NCCL_CUMEM_ENABLE: "1"

  vllm_config:
    # Prefill worker: TP1 x DP8 with expert parallelism (the DEP8 layout),
    # eager mode only.
    prefill:
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      tensor-parallel-size: 1
      pipeline-parallel-size: 1
      data-parallel-size: 8
      # Differs from the decode worker's RPC port (13345).
      data-parallel-rpc-port: 13346
      enable-expert-parallel: true
      enforce-eager: true
      # Covers 1024 ISL + 1024 OSL with 256 tokens to spare.
      max-model-len: 2304
      max-num-seqs: 16
      max-num-batched-tokens: 16384
      block-size: 128
      attention-backend: FLASHINFER
      language-model-only: true
      gpu-memory-utilization: 0.9
      safetensors-load-strategy: "prefetch"
      trust-remote-code: true
      no-enable-prefix-caching: true
      numa-bind: true
      enable-sleep-mode: true
      stream-interval: 32

    # Decode worker: same TP1 x DP8 + EP layout. enforce-eager is not set
    # here; batch size, batched tokens and the CUDA-graph capture size are
    # all capped at 512.
    decode:
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      tensor-parallel-size: 1
      pipeline-parallel-size: 1
      data-parallel-size: 8
      # Differs from the prefill worker's RPC port (13346).
      data-parallel-rpc-port: 13345
      enable-expert-parallel: true
      # Covers 1024 ISL + 1024 OSL with 256 tokens to spare.
      max-model-len: 2304
      max-num-seqs: 512
      max-num-batched-tokens: 512
      max-cudagraph-capture-size: 512
      block-size: 128
      attention-backend: FLASHINFER
      language-model-only: true
      gpu-memory-utilization: 0.9
      safetensors-load-strategy: "prefetch"
      trust-remote-code: true
      no-enable-prefix-caching: true
      numa-bind: true
      enable-sleep-mode: true
      stream-interval: 128

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  # Concurrency sweep, "x"-separated: 128, 256, 512.
  concurrencies: "128x256x512"
Loading
Loading