-
Notifications
You must be signed in to change notification settings - Fork 194
MiniMax-M3 MXFP8 full sweep config for GB300 #1735
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
965b046
e3fa89f
b915c89
afc3f92
99a075b
26e2005
ce76bd7
b660ddd
7ea8b0b
ef7c650
c94bf9f
7fd8904
5df0669
88e99ce
62fe18d
f4c6384
1d71f49
2bf5851
fd922a6
805dc1c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,126 @@ | ||
| name: "minimax-m3-vllm-disagg-gb300-1p1d-tp4ep4-2n-1k1k" | ||
|
|
||
| # MiniMax-M3 disaggregated 1P+1D recipe for GB300, 2 nodes. | ||
| # 1 node prefill (4 GPUs, TP4+EP4) → NixlConnector → 1 node decode (4 GPUs, TP4+EP4). | ||
| # --block-size 128 is mandatory (MSA sparse/index cache alignment). | ||
| # safetensors-load-strategy omitted — prefetch OOMs on CW GB300 host memory. | ||
|
|
||
| model: | ||
| path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" | ||
| container: "vllm/vllm-openai:minimax-m3" | ||
| precision: "fp8" | ||
|
|
||
| dynamo: | ||
| install: true | ||
| wheel: "1.2.0.dev20260526" | ||
|
|
||
| # CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the | ||
| # job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups | ||
| # OOM-kill during engine init; srun_options.mem=0 alone only grants a | ||
| # step what the job already holds. cpus-per-task=72 (one NUMA socket) | ||
| # keeps the gpu-less infra step (etcd/nats) off the 1-CPU default. | ||
| # Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml. | ||
| sbatch_directives: | ||
| mem: "0" | ||
| cpus-per-task: "72" | ||
| srun_options: | ||
| mem: "0" | ||
|
|
||
| slurm: | ||
| time_limit: "8:00:00" | ||
|
|
||
| health_check: | ||
| max_attempts: 720 | ||
| interval_seconds: 10 | ||
|
|
||
| extra_mount: | ||
| - "__M3_HF_HOME__:__M3_HF_HOME__" | ||
|
|
||
| resources: | ||
| gpu_type: "gb300" | ||
| gpus_per_node: 4 | ||
| prefill_nodes: 1 | ||
| decode_nodes: 1 | ||
| prefill_workers: 1 | ||
| decode_workers: 1 | ||
| gpus_per_prefill: 4 | ||
| gpus_per_decode: 4 | ||
|
|
||
| frontend: | ||
| type: dynamo | ||
| enable_multiple_frontends: false | ||
|
|
||
| backend: | ||
| type: vllm | ||
| connector: null | ||
|
|
||
| prefill_environment: | ||
| VLLM_ENGINE_READY_TIMEOUT_S: "3600" | ||
| VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" | ||
| HF_HOME: "__M3_HF_HOME__" | ||
| # Cache-only at runtime: the launcher pre-stages the full snapshot and | ||
| # verifies it offline before submit, so workers must NOT re-fetch. Without | ||
| # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache; | ||
| # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the | ||
| # loser dies with "Lock acquisition failed" (unlike Python hf_hub, it does not retry). | ||
| HF_HUB_OFFLINE: "1" | ||
| # Multi-node-NVLink KV transfer: keep cross-node prefill->decode KV on the | ||
| # NVL fabric (5-6x vs RDMA/TCP fallback, +17-49% tok/s/gpu). Needs VMM- | ||
| # registered KV (enable-cumem-allocator below). | ||
| UCX_CUDA_IPC_ENABLE_MNNVL: "y" | ||
|
|
||
| decode_environment: | ||
| VLLM_ENGINE_READY_TIMEOUT_S: "3600" | ||
| VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" | ||
| HF_HOME: "__M3_HF_HOME__" | ||
| # Cache-only at runtime: the launcher pre-stages the full snapshot and | ||
| # verifies it offline before submit, so workers must NOT re-fetch. Without | ||
| # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache; | ||
| # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the | ||
| # loser dies with "Lock acquisition failed" (unlike Python hf_hub, it does not retry). | ||
| HF_HUB_OFFLINE: "1" | ||
| # Multi-node-NVLink KV transfer: keep cross-node prefill->decode KV on the | ||
| # NVL fabric (5-6x vs RDMA/TCP fallback, +17-49% tok/s/gpu). Needs VMM- | ||
| # registered KV (enable-cumem-allocator below). | ||
| UCX_CUDA_IPC_ENABLE_MNNVL: "y" | ||
|
|
||
| vllm_config: | ||
| prefill: | ||
| kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' | ||
| tensor-parallel-size: 4 | ||
| pipeline-parallel-size: 1 | ||
| enable-expert-parallel: true | ||
| enable-cumem-allocator: true | ||
| enforce-eager: true | ||
| max-model-len: 2304 | ||
| max-num-seqs: 16 | ||
| max-num-batched-tokens: 16384 | ||
| block-size: 128 | ||
| language-model-only: true | ||
| gpu-memory-utilization: 0.9 | ||
| trust-remote-code: true | ||
| no-enable-prefix-caching: true | ||
| stream-interval: 32 | ||
|
|
||
| decode: | ||
| kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' | ||
| tensor-parallel-size: 4 | ||
| pipeline-parallel-size: 1 | ||
| enable-expert-parallel: true | ||
| enable-cumem-allocator: true | ||
| max-model-len: 2304 | ||
| max-num-seqs: 256 | ||
| max-num-batched-tokens: 256 | ||
| max-cudagraph-capture-size: 512 | ||
| block-size: 128 | ||
| language-model-only: true | ||
| gpu-memory-utilization: 0.9 | ||
| trust-remote-code: true | ||
| no-enable-prefix-caching: true | ||
| stream-interval: 32 | ||
|
|
||
| benchmark: | ||
| type: "sa-bench" | ||
| isl: 1024 | ||
| osl: 1024 | ||
| concurrencies: "64x128x256x512" | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Recipe omits low concurrencies. Medium Severity. The 1k1k … Additional Locations (1). Reviewed by Cursor Bugbot for commit 88e99ce. Configure here. |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,102 @@ | ||
| name: "minimax-m3-vllm-disagg-gb300-5p12d-tp4ep1-1k1k" | ||
|
|
||
| # MiniMax-M3 GB300 max-throughput disagg, rack-saturating (17 nodes / 68 GPU). | ||
| # Lessons baked in (validated on gb300-cw): (1) TP-only decode (ep1, NO expert | ||
| # parallelism) per Qwen3.5-397B-A17B recipes — M3's MoE all-to-all overhead | ||
| # made wide-EP/DP-attention slower; (2) PREFILL is the bottleneck, not decode | ||
| # or KV transfer — a single prefill starved 64 decode GPUs (3967 reqs queued), | ||
| # so this balances 5 prefill : 12 decode TP4 workers (prefill backlog -> 0, | ||
| # +57% tok/s/gpu vs the 1-prefill ratio at conc 2048). NixlConnector KV | ||
| # transfer stays on multi-node NVLink (MNNVL + cumem). --block-size 128 | ||
| # mandatory (MSA cache); text-only -> language-model-only. | ||
|
|
||
| model: | ||
| path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" | ||
| container: "vllm/vllm-openai:minimax-m3" | ||
| precision: "fp8" | ||
| dynamo: | ||
| install: true | ||
| wheel: "1.2.0.dev20260526" | ||
| sbatch_directives: | ||
| mem: "0" | ||
| cpus-per-task: "72" | ||
| srun_options: | ||
| mem: "0" | ||
| slurm: | ||
| time_limit: "8:00:00" | ||
| health_check: | ||
| max_attempts: 720 | ||
| interval_seconds: 10 | ||
| extra_mount: | ||
| - "__M3_HF_HOME__:__M3_HF_HOME__" | ||
|
|
||
| resources: | ||
| gpu_type: "gb300" | ||
| gpus_per_node: 4 | ||
| prefill_nodes: 5 | ||
| decode_nodes: 12 | ||
| prefill_workers: 5 | ||
| decode_workers: 12 | ||
| gpus_per_prefill: 4 | ||
| gpus_per_decode: 4 | ||
|
|
||
| frontend: | ||
| type: dynamo | ||
| enable_multiple_frontends: false | ||
|
|
||
| backend: | ||
| type: vllm | ||
| connector: null | ||
|
|
||
| prefill_environment: | ||
| VLLM_ENGINE_READY_TIMEOUT_S: "3600" | ||
| VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" | ||
| HF_HOME: "__M3_HF_HOME__" | ||
| HF_HUB_OFFLINE: "1" | ||
| UCX_CUDA_IPC_ENABLE_MNNVL: "y" | ||
|
|
||
| decode_environment: | ||
| VLLM_ENGINE_READY_TIMEOUT_S: "3600" | ||
| VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" | ||
| HF_HOME: "__M3_HF_HOME__" | ||
| HF_HUB_OFFLINE: "1" | ||
| UCX_CUDA_IPC_ENABLE_MNNVL: "y" | ||
|
|
||
| vllm_config: | ||
| prefill: | ||
| kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' | ||
| tensor-parallel-size: 4 | ||
| pipeline-parallel-size: 1 | ||
| enable-cumem-allocator: true | ||
| enforce-eager: true | ||
| max-num-seqs: 16 | ||
| max-num-batched-tokens: 16384 | ||
| max-model-len: 2304 | ||
| block-size: 128 | ||
| language-model-only: true | ||
| gpu-memory-utilization: 0.9 | ||
| trust-remote-code: true | ||
| no-enable-prefix-caching: true | ||
| stream-interval: 32 | ||
|
|
||
| decode: | ||
| kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' | ||
| tensor-parallel-size: 4 | ||
| pipeline-parallel-size: 1 | ||
| enable-cumem-allocator: true | ||
| max-num-seqs: 256 | ||
| max-num-batched-tokens: 256 | ||
| max-cudagraph-capture-size: 512 | ||
| max-model-len: 2304 | ||
| block-size: 128 | ||
| language-model-only: true | ||
| gpu-memory-utilization: 0.9 | ||
| trust-remote-code: true | ||
| no-enable-prefix-caching: true | ||
| stream-interval: 32 | ||
|
|
||
| benchmark: | ||
| type: "sa-bench" | ||
| isl: 1024 | ||
| osl: 1024 | ||
| concurrencies: "2048x4096x8192" |


Uh oh!
There was an error while loading. Please reload this page.