From bbdd923d53275ffd09b195baa7d2113da8fb521c Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 12 Jun 2026 17:59:43 -0700 Subject: [PATCH 1/8] feat: MiniMax-M3 MXFP8 full sweep config for GB200 Add minimaxm3-fp8-gb200-dynamo-vllm to nvidia-master.yaml with 6 topologies covering the full concurrency range: - TP4/TP8 (low latency, conc 4-64) - TP4+EP4 agg + 1P+1D disagg (mid curve, conc 64-512) - DEP4/DEP8 (high throughput, conc 256-2048) All recipe YAMLs included under minimax-m3-gb200-fp8/{1k1k,8k1k}/. --- .github/configs/nvidia-master.yaml | 111 ++++++++++++++++++ .../workflows/benchmark-multinode-tmpl.yml | 5 + .../1k1k/agg-gb200-dep4-1n.yaml | 74 ++++++++++++ .../1k1k/agg-gb200-dep8-2n.yaml | 74 ++++++++++++ .../1k1k/agg-gb200-tp4-1n.yaml | 71 +++++++++++ .../1k1k/agg-gb200-tp4ep4-1n.yaml | 72 ++++++++++++ .../1k1k/agg-gb200-tp8-2n.yaml | 71 +++++++++++ .../1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml | 89 ++++++++++++++ .../8k1k/agg-gb200-dep8-2n.yaml | 74 ++++++++++++ .../8k1k/agg-gb200-tp4-1n.yaml | 71 +++++++++++ .../8k1k/agg-gb200-tp4ep4-1n.yaml | 72 ++++++++++++ .../8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml | 89 ++++++++++++++ perf-changelog.yaml | 12 ++ runners/launch_gb200-nv.sh | 50 ++++++-- 14 files changed, 927 insertions(+), 8 deletions(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml create mode 100644 
benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 187824347..e68adb5f4 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11679,6 +11679,117 @@ minimaxm2.5-fp8-gb300-dynamo-vllm: ep: 4 dp-attn: true +# MiniMax-M3 (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). +# 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint +# (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX +# tensor cores on Blackwell. M3 has not shipped in a stable vLLM release; +# vllm/vllm-openai:minimax-m3 is the dedicated multi-arch (arm64+amd64) +# image built from the m3_release branch (vllm-project/vllm#45381). +# GB200 full sweep: TP4/TP8 (low conc), TEP (mid), disagg (mid), DEP (high). +minimaxm3-fp8-gb200-dynamo-vllm: + image: vllm/vllm-openai:minimax-m3 + model: MiniMaxAI/MiniMax-M3-MXFP8 + model-prefix: minimaxm3 + runner: gb200 + precision: fp8 + framework: dynamo-vllm + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # Low latency: TP=4 aggregated, 1 node (4 GPU).
+ - conc-list: [4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml" + decode: + num-worker: 0 + tp: 4 + ep: 1 + dp-attn: false + + # Low latency: TP=8 aggregated, 2 nodes (8 GPU). + - conc-list: [4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml" + decode: + num-worker: 0 + tp: 8 + ep: 1 + dp-attn: false + + # Mid curve: TP4+EP4 aggregated, 1 node (4 GPU). + - conc-list: [128, 256, 512] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml" + decode: + num-worker: 0 + tp: 4 + ep: 4 + dp-attn: false + + # Mid curve: 1P+1D disagg TP4+EP4, 2 nodes (4 GPU each). + - conc-list: [64, 128, 256, 512] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + + # High throughput: DEP=4 aggregated, 1 node (4 GPU). + - conc-list: [256, 512, 1024] + prefill: + num-worker: 1 + tp: 1 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml" + decode: + num-worker: 0 + tp: 1 + ep: 4 + dp-attn: true + + # Max throughput: DEP=8 aggregated, 2 nodes (8 GPU). + - conc-list: [512, 1024, 2048] + prefill: + num-worker: 1 + tp: 1 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml" + decode: + num-worker: 0 + tp: 1 + ep: 8 + dp-attn: true + # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). 
# 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 81727ef39..85b399e6c 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -123,6 +123,11 @@ on: env: RANDOM_RANGE_RATIO: 0.8 + # Day-zero models resolved via hf: ids download from the Hub inside the + # slurm job (srtctl pre-download + dynamo hub fetch). Anonymous requests + # get 429-rate-limited when several workers pull a 444 GB snapshot at + # once; sbatch/srun inherit this env so the token reaches the workers. + HF_TOKEN: ${{ secrets.INFERENCEX_OFFICIAL_RO_HF_TOKEN }} EXP_NAME: ${{ inputs.exp-name }} IMAGE: ${{ inputs.image }} MODEL_PREFIX: ${{ inputs.model-prefix }} diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml new file mode 100644 index 000000000..a95d2df41 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml @@ -0,0 +1,74 @@ +name: "minimax-m3-vllm-agg-gb200-dep4-1n-1k1k" + +# MiniMax-M3 day-zero aggregated recipe +# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). This file is the +# aggregated shape only; the NixlConnector-based 1P+1D disaggregated +# shape for the same sequence lengths is defined alongside it in +# disagg-gb200-1p1d-tp4ep4-2n.yaml. model.path uses a staged-model alias — srtctl resolves it via +# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. +# --block-size 128 is mandatory (MSA sparse/index cache alignment); the +# benchmark is text-only, so language-model-only skips the vision +# encoder.
max-model-len / cudagraph capture / batched tokens are +# trimmed to the fixed-seq-len scenario instead of the 1M default. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +# 444 GB of weights off shared FS (cold HF cache on the first run): +# allow up to 2 h for engine readiness. +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + gpus_per_agg: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + aggregated_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + aggregated: + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 2304 + max-num-batched-tokens: 2048 + max-cudagraph-capture-size: 512 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "256x512x1024" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml new file mode 100644 index 000000000..ab231e733 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml @@ -0,0 +1,74 @@ +name: "minimax-m3-vllm-agg-gb200-dep8-2n-1k1k" + +# MiniMax-M3 day-zero aggregated recipe +# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). 
This file is the +# aggregated shape only; the NixlConnector-based 1P+1D disaggregated +# shape for the same sequence lengths is defined alongside it in +# disagg-gb200-1p1d-tp4ep4-2n.yaml. model.path uses a staged-model alias — srtctl resolves it via +# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. +# --block-size 128 is mandatory (MSA sparse/index cache alignment); the +# benchmark is text-only, so language-model-only skips the vision +# encoder. max-model-len / cudagraph capture / batched tokens are +# trimmed to the fixed-seq-len scenario instead of the 1M default. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +# 444 GB of weights off shared FS (cold HF cache on the first run): +# allow up to 2 h for engine readiness. +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 2 + agg_workers: 1 + gpus_per_agg: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + aggregated_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + aggregated: + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 2304 + max-num-batched-tokens: 2048 + max-cudagraph-capture-size: 512 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "512x1024x2048" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml
b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml new file mode 100644 index 000000000..ce431c3c0 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml @@ -0,0 +1,71 @@ +name: "minimax-m3-vllm-agg-gb200-tp4-1n-1k1k" + +# MiniMax-M3 day-zero aggregated recipe +# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). This file is the +# aggregated shape only; the NixlConnector-based 1P+1D disaggregated +# shape for the same sequence lengths is defined alongside it in +# disagg-gb200-1p1d-tp4ep4-2n.yaml. model.path uses a staged-model alias — srtctl resolves it via +# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. +# --block-size 128 is mandatory (MSA sparse/index cache alignment); the +# benchmark is text-only, so language-model-only skips the vision +# encoder. max-model-len / cudagraph capture / batched tokens are +# trimmed to the fixed-seq-len scenario instead of the 1M default. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +# 444 GB of weights off shared FS (cold HF cache on the first run): +# allow up to 2 h for engine readiness.
+health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + gpus_per_agg: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + aggregated_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + aggregated: + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 2304 + max-num-batched-tokens: 2048 + max-cudagraph-capture-size: 64 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml new file mode 100644 index 000000000..29efa7ecc --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml @@ -0,0 +1,72 @@ +name: "minimax-m3-vllm-agg-gb200-tp4ep4-1n-1k1k" + +# MiniMax-M3 day-zero aggregated recipe +# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). This file is the +# aggregated shape only; the NixlConnector-based 1P+1D disaggregated +# shape for the same sequence lengths is defined alongside it in +# disagg-gb200-1p1d-tp4ep4-2n.yaml. model.path uses a staged-model alias — srtctl resolves it via +# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. +# --block-size 128 is mandatory (MSA sparse/index cache alignment); the +# benchmark is text-only, so language-model-only skips the vision +# encoder. max-model-len / cudagraph capture / batched tokens are +# trimmed to the fixed-seq-len scenario instead of the 1M default.
+ +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +# 444 GB of weights off shared FS (cold HF cache on the first run): +# allow up to 2 h for engine readiness. +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + gpus_per_agg: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + aggregated_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + aggregated: + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 2304 + max-num-batched-tokens: 2048 + max-cudagraph-capture-size: 512 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "128x256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml new file mode 100644 index 000000000..29a5934bd --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml @@ -0,0 +1,71 @@ +name: "minimax-m3-vllm-agg-gb200-tp8-2n-1k1k" + +# MiniMax-M3 day-zero aggregated recipe +# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). This file is the +# aggregated shape only; the NixlConnector-based 1P+1D disaggregated +# shape for the same sequence lengths is defined alongside it in +# disagg-gb200-1p1d-tp4ep4-2n.yaml.
model.path uses a staged-model alias — srtctl resolves it via +# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. +# --block-size 128 is mandatory (MSA sparse/index cache alignment); the +# benchmark is text-only, so language-model-only skips the vision +# encoder. max-model-len / cudagraph capture / batched tokens are +# trimmed to the fixed-seq-len scenario instead of the 1M default. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +# 444 GB of weights off shared FS (cold HF cache on the first run): +# allow up to 2 h for engine readiness. +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 2 + agg_workers: 1 + gpus_per_agg: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + aggregated_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + aggregated: + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + max-model-len: 2304 + max-num-batched-tokens: 2048 + max-cudagraph-capture-size: 64 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml new file mode 100644 index 000000000..17769abf3 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml @@ -0,0 +1,89 @@ +name: 
"minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-1k1k" + +# MiniMax-M3 disaggregated 1P+1D recipe for GB200. +# Prefill (TP4+EP4, 1 node) → NixlConnector → Decode (TP4+EP4, 1 node). +# --block-size 128 is mandatory (MSA sparse/index cache alignment). + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 2304 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 2304 + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 512 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 
+ osl: 1024 + concurrencies: "64x128x256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml new file mode 100644 index 000000000..db729764a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml @@ -0,0 +1,74 @@ +name: "minimax-m3-vllm-agg-gb200-dep8-2n-8k1k" + +# MiniMax-M3 day-zero aggregated recipe +# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). This file is the +# aggregated shape only; the NixlConnector-based 1P+1D disaggregated +# shape for the same sequence lengths is defined alongside it in +# disagg-gb200-1p1d-tp4ep4-2n.yaml. model.path uses a staged-model alias — srtctl resolves it via +# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. +# --block-size 128 is mandatory (MSA sparse/index cache alignment); the +# benchmark is text-only, so language-model-only skips the vision +# encoder. max-model-len / cudagraph capture / batched tokens are +# trimmed to the fixed-seq-len scenario instead of the 1M default. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +# 444 GB of weights off shared FS (cold HF cache on the first run): +# allow up to 2 h for engine readiness.
+health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 2 + agg_workers: 1 + gpus_per_agg: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + aggregated_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + aggregated: + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 9472 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 128 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml new file mode 100644 index 000000000..8c7ecbe17 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml @@ -0,0 +1,71 @@ +name: "minimax-m3-vllm-agg-gb200-tp4-1n-8k1k" + +# MiniMax-M3 day-zero aggregated recipe +# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). This file is the +# aggregated shape only; the NixlConnector-based 1P+1D disaggregated +# shape for the same sequence lengths is defined alongside it in +# disagg-gb200-1p1d-tp4ep4-2n.yaml. model.path uses a staged-model alias — srtctl resolves it via +# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. +# --block-size 128 is mandatory (MSA sparse/index cache alignment); the +# benchmark is text-only, so language-model-only skips the vision +# encoder.
max-model-len / cudagraph capture / batched tokens are +# trimmed to the fixed-seq-len scenario instead of the 1M default. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +# 444 GB of weights off shared FS (cold HF cache on the first run): +# allow up to 2 h for engine readiness. +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + gpus_per_agg: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + aggregated_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + aggregated: + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 9472 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 64 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml new file mode 100644 index 000000000..3e146af8b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml @@ -0,0 +1,72 @@ +name: "minimax-m3-vllm-agg-gb200-tp4ep4-1n-8k1k" + +# MiniMax-M3 day-zero aggregated recipe +# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). This file is the +# aggregated shape only; the NixlConnector-based 1P+1D disaggregated +# shape for the same sequence lengths is defined alongside it in +# disagg-gb200-1p1d-tp4ep4-2n.yaml.
model.path uses a staged-model alias — srtctl resolves it via +# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. +# --block-size 128 is mandatory (MSA sparse/index cache alignment); the +# benchmark is text-only, so language-model-only skips the vision +# encoder. max-model-len / cudagraph capture / batched tokens are +# trimmed to the fixed-seq-len scenario instead of the 1M default. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +# 444 GB of weights off shared FS (cold HF cache on the first run): +# allow up to 2 h for engine readiness. +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + gpus_per_agg: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + aggregated_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + aggregated: + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 9472 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 256 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "128x256" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml new file mode 100644 index 000000000..54980f7d3 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml @@ -0,0 +1,89 @@ +name: 
"minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-8k1k" + +# MiniMax-M3 disaggregated 1P+1D recipe for GB200, 8k1k. +# Prefill (TP4+EP4, 1 node) → NixlConnector → Decode (TP4+EP4, 1 node). +# --block-size 128 is mandatory (MSA sparse/index cache alignment). + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 9472 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 9472 + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 512 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + 
isl: 8192 + osl: 1024 + concurrencies: "64x128x256" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index d29c9a5d3..647121c12 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3646,3 +3646,15 @@ - "Layouts: TP8 and TP4 (latency), TP4+EP4 / TP8+EP8 (TEP throughput), tp2-ep2, TP8+EP8 dp-attn (DEP) across 1k1k and 8k1k" - "Serves from the launch_b300-nv.sh MODEL/MODEL_PATH split (model not in the SRE-staged /scratch/models list -> writable /data/models)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1724 + +- config-keys: + - minimaxm3-fp8-gb200-dynamo-vllm + description: + - "Initial submission: MiniMax-M3 MXFP8 day-zero vLLM sweep for GB200 via Dynamo" + - "Model: MiniMaxAI/MiniMax-M3-MXFP8 (427B total / 26B active MoE, MSA sparse attention, ~444 GB MXFP8 checkpoint)" + - "Image: vllm/vllm-openai:minimax-m3 (multi-arch arm64+amd64 from m3_release branch, vllm-project/vllm#45381)" + - "Dynamo orchestration with NixlConnector for disaggregated prefill/decode" + - "6 topologies: TP4 (1n), TP8 (2n), TP4+EP4 (1n), 1P+1D disagg TP4+EP4 (2n), DEP4 (1n), DEP8 (2n)" + - "Concurrency sweep: TP 4-64, TEP 128-512, disagg 64-512, DEP4 256-1024, DEP8 512-2048" + - "--block-size 128 mandatory everywhere (MSA sparse/index cache alignment); --language-model-only for text-only benchmarks" + pr-link: TBD diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 36c8af203..9c3430289 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -60,8 +60,11 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp8" ]]; then export MODEL_PATH="/mnt/lustre01/models/MiniMax-M2.5" export SRT_SLURM_MODEL_PREFIX="minimax-m2.5-fp8" + elif [[ $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp8" ]]; then + export MODEL_PATH="/mnt/lustre01/models/MiniMax-M3-MXFP8" + export SRT_SLURM_MODEL_PREFIX="minimax-m3-mxfp8" else - echo "Unsupported model prefix/precision combination: 
$MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4, minimaxm2.5/fp4, minimaxm2.5/fp8" + echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4, minimaxm2.5/fp4, minimaxm2.5/fp8, minimaxm3/fp8" exit 1 fi else @@ -81,7 +84,7 @@ NGINX_IMAGE="nginx:1.27.4" # squash dir on a path that's also visible to compute nodes. Falls # back to the legacy sa-shared path so other configs are untouched. SQUASH_DIR="/mnt/lustre01/users-public/sa-shared" -if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then +if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "minimaxm3" ]]; then echo "=== cluster diagnostic (minimax sweep) ===" echo "USER=$(id -un) UID=$(id -u) GID=$(id -g) GROUPS=$(id -Gn)" echo "HOME=$HOME" @@ -128,8 +131,32 @@ fi SQUASH_FILE="${SQUASH_DIR}/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" NGINX_SQUASH_FILE="${SQUASH_DIR}/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" -enroot import -o $SQUASH_FILE docker://$IMAGE -enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE +# Concurrent matrix jobs (three gb200-nv runners) all import to the same +# shared-FS squash path. An unsynchronized `enroot import -o` onto an +# existing file APPENDS to it (mksquashfs default), corrupting the image +# while other jobs' pyxis extractions are reading it — observed on the +# minimaxm3 day-zero sweep (R1: an eval job appended to the live squash +# mid-run). Serialize with a lock, skip when the existing file is valid, +# and build to a temp path + atomic mv so readers never see a half-written +# file. Mirrors the import_squash pattern in launch_gb300-nv.sh. 
+import_squash() { + local squash="$1" image="$2" + local lock="${squash}.lock" + ( + exec 9>"$lock" + flock -w 1800 9 || { echo "Failed to acquire lock for $squash" >&2; exit 1; } + if unsquashfs -l "$squash" > /dev/null 2>&1; then + echo "Squash file already exists and is valid, skipping import: $squash" + else + rm -f "$squash" "$squash".tmp.* + enroot import -o "${squash}.tmp.$$" "docker://$image" + mv -f "${squash}.tmp.$$" "$squash" + fi + ) || exit 1 +} + +import_squash "$SQUASH_FILE" "$IMAGE" +import_squash "$NGINX_SQUASH_FILE" "$NGINX_IMAGE" export EVAL_ONLY="${EVAL_ONLY:-false}" @@ -202,7 +229,7 @@ SRT_REPO_DIR="srt-slurm" # cross-mounted to compute nodes. Put the srt-slurm workspace and staged # InferenceX checkout on a writable shared-FS path that compute can see. # Per-run-unique paths avoid races between parallel sweep jobs. -if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then +if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "minimaxm3" ]]; then SHARED_BASE="" for cand in \ /mnt/lustre01/users-public/sa-shared/gha-runs \ @@ -269,6 +296,12 @@ elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" ]]; then echo "Unsupported minimaxm2.5 precision for GB200 dynamo-vllm: $PRECISION" >&2 exit 1 fi +elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm3" ]]; then + git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" || exit 1 + cd "$SRT_REPO_DIR" || exit 1 + git checkout main || exit 1 + mkdir -p recipes/vllm/minimax-m3-gb200-fp8 || exit 1 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8" recipes/vllm/minimax-m3-gb200-fp8 || exit 1 elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" @@ -292,7 +325,7 @@ source $HOME/.local/bin/env # under a head-node-only path, .venv/bin/python3 becomes a broken # symlink on compute. 
Pin the venv to /usr/bin/python3 — a system # path that exists at the same location on both head and compute. -if [[ $MODEL_PREFIX == "minimaxm2.5" && -x /usr/bin/python3 ]]; then +if [[ ( $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "minimaxm3" ) && -x /usr/bin/python3 ]]; then uv venv --seed --python /usr/bin/python3 else uv venv --seed @@ -312,7 +345,7 @@ SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm" # Minimax on watchtower: SRT_REPO_DIR was moved to a shared-FS path # above so srtctl's outputs/ directory (which lives under # SRTCTL_ROOT) is visible to compute nodes. -if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then +if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "minimaxm3" ]]; then SRTCTL_ROOT="$SRT_REPO_DIR" fi echo "Creating srtslurm.yaml configuration..." @@ -354,7 +387,7 @@ export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" # can't see. Stage the relevant subset to shared FS and repoint # INFMAX_WORKSPACE there. rsync excludes the srt-slurm clone (already # on shared FS) and .git (not needed in container) for speed. -if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then +if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "minimaxm3" ]]; then SHARED_INFMAX_WORKSPACE="${SHARED_BASE}/infmax-workspace-${RUN_KEY}" mkdir -p "$SHARED_INFMAX_WORKSPACE" || exit 1 rsync -a --delete \ @@ -379,6 +412,7 @@ if [[ ! 
-f "$CONFIG_PATH" ]]; then fi sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_PATH" + if [[ "$FRAMEWORK" == "dynamo-sglang" ]]; then SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_PATH" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" --setup-script install-torchao.sh 2>&1) else From dbf5135c0299f26b19ff814519651f17efdc68e8 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 12 Jun 2026 18:06:32 -0700 Subject: [PATCH 2/8] chore: update perf-changelog pr-link to #1734 --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 647121c12..e1d38dd9f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3657,4 +3657,4 @@ - "6 topologies: TP4 (1n), TP8 (2n), TP4+EP4 (1n), 1P+1D disagg TP4+EP4 (2n), DEP4 (1n), DEP8 (2n)" - "Concurrency sweep: TP 4-64, TEP 128-512, disagg 64-512, DEP4 256-1024, DEP8 512-2048" - "--block-size 128 mandatory everywhere (MSA sparse/index cache alignment); --language-model-only for text-only benchmarks" - pr-link: TBD + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734 From ed63c1e042078379d6f555d573528c82e7559623 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 12 Jun 2026 20:52:36 -0700 Subject: [PATCH 3/8] feat: switch GB200 M3 to ai-dynamo vllm-runtime 1.3.0 image MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adopt the NVIDIA Dynamo vLLM runtime image (nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1), the canonical M3 runtime from ai-dynamo/dynamo release/1.3.0-minimax-m3-dev.1. 
Changes mirrored from that release's recipes/minimax-m3/vllm/disagg/MXFP8/deploy.yaml: - dynamo.install: false — the runtime image bundles dynamo 1.3.0, so the prior 1.2.0 wheel install is dropped (srtctl defaults install=true) - attention-backend: FLASH_ATTN on every prefill/decode/agg engine Benchmark-specific knobs kept over the reference's serving defaults: language-model-only (text-only), no-enable-prefix-caching (random data), scenario-trimmed max-model-len. --- .github/configs/nvidia-master.yaml | 10 ++++++---- .../minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml | 6 +++--- .../minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml | 6 +++--- .../minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml | 6 +++--- .../minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml | 6 +++--- .../minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml | 6 +++--- .../1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml | 7 ++++--- .../minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml | 6 +++--- .../minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml | 6 +++--- .../minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml | 6 +++--- .../8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml | 7 ++++--- 11 files changed, 38 insertions(+), 34 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index c11f6505b..d1926f30f 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11682,12 +11682,14 @@ minimaxm2.5-fp8-gb300-dynamo-vllm: # MiniMax-M3 (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX -# tensor cores on Blackwell. M3 has not shipped in a stable vLLM release; -# vllm/vllm-openai:minimax-m3 is the dedicated multi-arch (arm64+amd64) -# image built from the m3_release branch (vllm-project/vllm#45381). +# tensor cores on Blackwell. 
Image is the NVIDIA Dynamo vLLM runtime +# (nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1) with +# dynamo 1.3.0 pre-installed, so recipes set dynamo.install=false. +# Engine args mirror the canonical recipe (ai-dynamo/dynamo +# recipes/minimax-m3/vllm/disagg/MXFP8): block-size 128 + FLASH_ATTN. # GB200 full sweep: TP/TP8 (low conc), TEP (mid), disagg (mid), DEP (high). minimaxm3-fp8-gb200-dynamo-vllm: - image: vllm/vllm-openai:minimax-m3 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: gb200 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml index a95d2df41..3b328ea28 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml @@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-dep4-1n-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: - install: true - wheel: "1.2.0.dev20260526" + install: false slurm: time_limit: "8:00:00" @@ -60,6 +59,7 @@ backend: max-num-batched-tokens: 2048 max-cudagraph-capture-size: 512 block-size: 128 + attention-backend: FLASH_ATTN language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml index ab231e733..81b000039 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml +++ 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml @@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-dep8-2n-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: - install: true - wheel: "1.2.0.dev20260526" + install: false slurm: time_limit: "8:00:00" @@ -60,6 +59,7 @@ backend: max-num-batched-tokens: 2048 max-cudagraph-capture-size: 512 block-size: 128 + attention-backend: FLASH_ATTN language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml index ce431c3c0..f7684fe8d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml @@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-tp4-1n-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: - install: true - wheel: "1.2.0.dev20260526" + install: false slurm: time_limit: "8:00:00" @@ -57,6 +56,7 @@ backend: max-num-batched-tokens: 2048 max-cudagraph-capture-size: 64 block-size: 128 + attention-backend: FLASH_ATTN language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml index 29efa7ecc..1fc4a3d98 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml +++ 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml @@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-tp4ep4-1n-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: - install: true - wheel: "1.2.0.dev20260526" + install: false slurm: time_limit: "8:00:00" @@ -58,6 +57,7 @@ backend: max-num-batched-tokens: 2048 max-cudagraph-capture-size: 512 block-size: 128 + attention-backend: FLASH_ATTN language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml index 29a5934bd..65e85f441 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml @@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-tp8-2n-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: - install: true - wheel: "1.2.0.dev20260526" + install: false slurm: time_limit: "8:00:00" @@ -57,6 +56,7 @@ backend: max-num-batched-tokens: 2048 max-cudagraph-capture-size: 64 block-size: 128 + attention-backend: FLASH_ATTN language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml index 17769abf3..90ec1d007 100644 --- 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml @@ -6,12 +6,11 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: - install: true - wheel: "1.2.0.dev20260526" + install: false slurm: time_limit: "8:00:00" @@ -58,6 +57,7 @@ backend: max-num-seqs: 16 max-num-batched-tokens: 16384 block-size: 128 + attention-backend: FLASH_ATTN language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" @@ -75,6 +75,7 @@ backend: max-num-batched-tokens: 256 max-cudagraph-capture-size: 512 block-size: 128 + attention-backend: FLASH_ATTN language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml index db729764a..c3f50da69 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml @@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-dep8-2n-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: - install: true - wheel: "1.2.0.dev20260526" + install: false slurm: time_limit: "8:00:00" @@ -60,6 +59,7 @@ backend: max-num-batched-tokens: 16384 max-cudagraph-capture-size: 128 block-size: 128 + attention-backend: FLASH_ATTN language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" 
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml index 8c7ecbe17..444f1e1df 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml @@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-tp4-1n-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: - install: true - wheel: "1.2.0.dev20260526" + install: false slurm: time_limit: "8:00:00" @@ -57,6 +56,7 @@ backend: max-num-batched-tokens: 16384 max-cudagraph-capture-size: 64 block-size: 128 + attention-backend: FLASH_ATTN language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml index 3e146af8b..ca8ea7e48 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml @@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-tp4ep4-1n-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: - install: true - wheel: "1.2.0.dev20260526" + install: false slurm: time_limit: "8:00:00" @@ -58,6 +57,7 @@ backend: max-num-batched-tokens: 16384 max-cudagraph-capture-size: 256 block-size: 128 + attention-backend: FLASH_ATTN language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: 
"prefetch" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml index 54980f7d3..6a13b50d1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml @@ -6,12 +6,11 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: - install: true - wheel: "1.2.0.dev20260526" + install: false slurm: time_limit: "8:00:00" @@ -58,6 +57,7 @@ backend: max-num-seqs: 16 max-num-batched-tokens: 16384 block-size: 128 + attention-backend: FLASH_ATTN language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" @@ -75,6 +75,7 @@ backend: max-num-batched-tokens: 256 max-cudagraph-capture-size: 512 block-size: 128 + attention-backend: FLASH_ATTN language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" From 8738f42aabdbffb4235c5b5bc89c359c59ff26c4 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 12 Jun 2026 21:11:07 -0700 Subject: [PATCH 4/8] fix: use enroot registry syntax (nvcr.io#) for GB200 M3 image enroot's docker:// URI needs `#` to separate the registry host from the image path; `nvcr.io/...` was parsed as a Docker Hub repo and 401'd against registry-1.docker.io. Matches the existing nvcr.io# convention in nvidia-master.yaml. Recipe container fields kept byte-identical to the master image: field (srtslurm.yaml maps "${IMAGE}" -> squashfile). 
--- .github/configs/nvidia-master.yaml | 4 ++-- .../vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml | 2 +- .../vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml | 2 +- .../vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml | 2 +- .../vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml | 2 +- .../vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml | 2 +- .../1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml | 2 +- .../vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml | 2 +- .../vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml | 2 +- .../vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml | 2 +- .../8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml | 2 +- 11 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 1d24e2857..9e3977232 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11777,13 +11777,13 @@ minimaxm2.5-fp8-gb300-dynamo-vllm: # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX # tensor cores on Blackwell. Image is the NVIDIA Dynamo vLLM runtime -# (nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1) with +# (nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1) with # dynamo 1.3.0 pre-installed, so recipes set dynamo.install=false. # Engine args mirror the canonical recipe (ai-dynamo/dynamo # recipes/minimax-m3/vllm/disagg/MXFP8): block-size 128 + FLASH_ATTN. # GB200 full sweep: TP/TP8 (low conc), TEP (mid), disagg (mid), DEP (high). 
minimaxm3-fp8-gb200-dynamo-vllm: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1 + image: nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: gb200 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml index 3b328ea28..921f99b8e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml @@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-dep4-1n-1k1k" model: path: "minimax-m3-mxfp8" - container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml index 81b000039..50eb3ff64 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml @@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-dep8-2n-1k1k" model: path: "minimax-m3-mxfp8" - container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml index f7684fe8d..6115d210c 100644 --- 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml @@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-tp4-1n-1k1k" model: path: "minimax-m3-mxfp8" - container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml index 1fc4a3d98..94df4c8ec 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml @@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-tp4ep4-1n-1k1k" model: path: "minimax-m3-mxfp8" - container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml index 65e85f441..1ac2612bd 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml @@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-tp8-2n-1k1k" model: path: "minimax-m3-mxfp8" - container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml index 90ec1d007..4f9c01c6b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml @@ -6,7 +6,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-1k1k" model: path: "minimax-m3-mxfp8" - container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml index c3f50da69..adb36f646 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml @@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-dep8-2n-8k1k" model: path: "minimax-m3-mxfp8" - container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml index 444f1e1df..8cfbcb616 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml @@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-tp4-1n-8k1k" model: path: "minimax-m3-mxfp8" - container: 
"nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml index ca8ea7e48..1567ca57c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml @@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-tp4ep4-1n-8k1k" model: path: "minimax-m3-mxfp8" - container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml index 6a13b50d1..86d48468a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml @@ -6,7 +6,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-8k1k" model: path: "minimax-m3-mxfp8" - container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: From 3415fb4e6a815393fd6c8ba12210bc9cd2f5074d Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 13 Jun 2026 13:44:48 -0700 Subject: [PATCH 5/8] feat: convert MiniMax-M3 GB200 sweep to fully disaggregated inference Replace the mostly-aggregated GB200 sweep (5 agg + 1 disagg) with a fully disaggregated 
sweep that splits prefill/decode over NixlConnector, mirroring the minimaxm2.5-fp8-gb200 reference. Every worker = one 4-GPU node since the 444 GB MXFP8 checkpoint can't fit in fewer. Topologies (1k1k): 1P1D TP4 (low-lat), 1P1D TP4+EP4 (mid), 1P2D TP4+EP4 (decode-scaled), 2P1D TP4+EP4 (prefill-scaled), 1P1D DEP4 (max-tput), spanning conc 4-2048. - add 4 disagg recipes; remove 8 orphaned agg recipes (1k1k + 8k1k) - rewire nvidia-master.yaml search-space to the 5 disagg entries - perf-changelog: describe disagg sweep; fix stale Image line (vllm/vllm-openai:minimax-m3 -> nvcr.io#.../vllm-runtime:1.3.0-minimax-m3-dev.1) Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/configs/nvidia-master.yaml | 66 ++++++------- .../1k1k/agg-gb200-dep4-1n.yaml | 74 -------------- .../1k1k/agg-gb200-dep8-2n.yaml | 74 -------------- .../1k1k/agg-gb200-tp4-1n.yaml | 71 -------------- .../1k1k/agg-gb200-tp4ep4-1n.yaml | 72 -------------- .../1k1k/agg-gb200-tp8-2n.yaml | 71 -------------- .../1k1k/disagg-gb200-1p1d-dep4-2n.yaml | 96 +++++++++++++++++++ .../1k1k/disagg-gb200-1p1d-tp4-2n.yaml | 89 +++++++++++++++++ .../1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml | 92 ++++++++++++++++++ .../1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml | 92 ++++++++++++++++++ .../8k1k/agg-gb200-dep8-2n.yaml | 74 -------------- .../8k1k/agg-gb200-tp4-1n.yaml | 71 -------------- .../8k1k/agg-gb200-tp4ep4-1n.yaml | 72 -------------- perf-changelog.yaml | 10 +- 14 files changed, 401 insertions(+), 623 deletions(-) delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml delete mode 100644 
benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 9e3977232..15aee30c5 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11781,7 +11781,10 @@ minimaxm2.5-fp8-gb300-dynamo-vllm: # dynamo 1.3.0 pre-installed, so recipes set dynamo.install=false. # Engine args mirror the canonical recipe (ai-dynamo/dynamo # recipes/minimax-m3/vllm/disagg/MXFP8): block-size 128 + FLASH_ATTN. -# GB200 full sweep: TP/TP8 (low conc), TEP (mid), disagg (mid), DEP (high). +# Fully disaggregated GB200 sweep (NixlConnector prefill/decode split, +# every worker = one 4-GPU node since the 444 GB checkpoint can't fit in +# fewer): 1P1D TP4 (low conc), 1P1D TP4+EP4 (mid), 1P2D / 2P1D TP4+EP4 +# (decode- and prefill-scaled), 1P1D DEP4 (max throughput). 
minimaxm3-fp8-gb200-dynamo-vllm: image: nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1 model: MiniMaxAI/MiniMax-M3-MXFP8 @@ -11796,7 +11799,7 @@ minimaxm3-fp8-gb200-dynamo-vllm: - isl: 1024 osl: 1024 search-space: - # Low latency: TP=4 aggregated, 1 node (4 GPU). + # Low latency: 1P+1D disagg TP4 (pure TP, no EP), 2 nodes (4 GPU each). - conc-list: [4, 8, 16, 32, 64] prefill: num-worker: 1 @@ -11804,86 +11807,71 @@ minimaxm3-fp8-gb200-dynamo-vllm: ep: 1 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml" decode: - num-worker: 0 + num-worker: 1 tp: 4 ep: 1 dp-attn: false - # Low latency: TP=8 aggregated, 2 nodes (8 GPU). - - conc-list: [4, 8, 16, 32, 64] + # Mid curve: 1P+1D disagg TP4+EP4, 2 nodes (4 GPU each). + - conc-list: [64, 128, 256, 512] prefill: num-worker: 1 - tp: 8 - ep: 1 + tp: 4 + ep: 4 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml" decode: - num-worker: 0 - tp: 8 - ep: 1 + num-worker: 1 + tp: 4 + ep: 4 dp-attn: false - # Mid curve: TP4+EP4 aggregated, 1 node (4 GPU). - - conc-list: [128, 256, 512] + # Decode-scaled: 1P+2D disagg TP4+EP4, 3 nodes (4 GPU each). + - conc-list: [256, 512, 1024] prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml" decode: - num-worker: 0 + num-worker: 2 tp: 4 ep: 4 dp-attn: false - # Mid curve: 1P+1D disagg TP4+EP4, 2 nodes (4 GPU each). - - conc-list: [64, 128, 256, 512] + # Prefill-scaled: 2P+1D disagg TP4+EP4, 3 nodes (4 GPU each). 
+ - conc-list: [256, 512, 1024] prefill: - num-worker: 1 + num-worker: 2 tp: 4 ep: 4 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml" decode: num-worker: 1 tp: 4 ep: 4 dp-attn: false - # High throughput: DEP=4 aggregated, 1 node (4 GPU). - - conc-list: [256, 512, 1024] + # Max throughput: 1P+1D disagg DEP4 (DP4+EP), 2 nodes (4 GPU each). + - conc-list: [512, 1024, 2048] prefill: num-worker: 1 tp: 1 ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml" decode: - num-worker: 0 - tp: 1 - ep: 4 - dp-attn: true - - # Max throughput: DEP=8 aggregated, 2 nodes (8 GPU). - - conc-list: [512, 1024, 2048] - prefill: num-worker: 1 tp: 1 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml" - decode: - num-worker: 0 - tp: 1 - ep: 8 + ep: 4 dp-attn: true # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml deleted file mode 100644 index 921f99b8e..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml +++ /dev/null @@ -1,74 +0,0 @@ -name: "minimax-m3-vllm-agg-gb200-dep4-1n-1k1k" - -# MiniMax-M3 day-zero aggregated recipe -# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on -# day zero: NixlConnector KV transfer has never been exercised against -# M3's MSA index cache, so disagg shapes are deferred until agg -# baselines exist. 
model.path uses a staged-model alias — srtctl resolves it via -# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. -# --block-size 128 is mandatory (MSA sparse/index cache alignment); the -# benchmark is text-only, so language-model-only skips the vision -# encoder. max-model-len / cudagraph capture / batched tokens are -# trimmed to the fixed-seq-len scenario instead of the 1M default. - -model: - path: "minimax-m3-mxfp8" - container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" - precision: "fp8" - -dynamo: - install: false - -slurm: - time_limit: "8:00:00" - -# 444 GB of weights off shared FS (cold HF cache on the first run): -# allow up to 2 h for engine readiness. -health_check: - max_attempts: 720 - interval_seconds: 10 - - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - agg_nodes: 1 - agg_workers: 1 - gpus_per_agg: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - aggregated_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - aggregated: - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 4 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: 2304 - max-num-batched-tokens: 2048 - max-cudagraph-capture-size: 512 - block-size: 128 - attention-backend: FLASH_ATTN - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "256x512x1024" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml deleted file mode 100644 index 50eb3ff64..000000000 --- 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml +++ /dev/null @@ -1,74 +0,0 @@ -name: "minimax-m3-vllm-agg-gb200-dep8-2n-1k1k" - -# MiniMax-M3 day-zero aggregated recipe -# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on -# day zero: NixlConnector KV transfer has never been exercised against -# M3's MSA index cache, so disagg shapes are deferred until agg -# baselines exist. model.path uses a staged-model alias — srtctl resolves it via -# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. -# --block-size 128 is mandatory (MSA sparse/index cache alignment); the -# benchmark is text-only, so language-model-only skips the vision -# encoder. max-model-len / cudagraph capture / batched tokens are -# trimmed to the fixed-seq-len scenario instead of the 1M default. - -model: - path: "minimax-m3-mxfp8" - container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" - precision: "fp8" - -dynamo: - install: false - -slurm: - time_limit: "8:00:00" - -# 444 GB of weights off shared FS (cold HF cache on the first run): -# allow up to 2 h for engine readiness. 
-health_check: - max_attempts: 720 - interval_seconds: 10 - - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - agg_nodes: 2 - agg_workers: 1 - gpus_per_agg: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - aggregated_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - aggregated: - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: 2304 - max-num-batched-tokens: 2048 - max-cudagraph-capture-size: 512 - block-size: 128 - attention-backend: FLASH_ATTN - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "512x1024x2048" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml deleted file mode 100644 index 6115d210c..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml +++ /dev/null @@ -1,71 +0,0 @@ -name: "minimax-m3-vllm-agg-gb200-tp4-1n-1k1k" - -# MiniMax-M3 day-zero aggregated recipe -# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on -# day zero: NixlConnector KV transfer has never been exercised against -# M3's MSA index cache, so disagg shapes are deferred until agg -# baselines exist. model.path uses a staged-model alias — srtctl resolves it via -# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. -# --block-size 128 is mandatory (MSA sparse/index cache alignment); the -# benchmark is text-only, so language-model-only skips the vision -# encoder. 
max-model-len / cudagraph capture / batched tokens are -# trimmed to the fixed-seq-len scenario instead of the 1M default. - -model: - path: "minimax-m3-mxfp8" - container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" - precision: "fp8" - -dynamo: - install: false - -slurm: - time_limit: "8:00:00" - -# 444 GB of weights off shared FS (cold HF cache on the first run): -# allow up to 2 h for engine readiness. -health_check: - max_attempts: 720 - interval_seconds: 10 - - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - agg_nodes: 1 - agg_workers: 1 - gpus_per_agg: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - aggregated_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - aggregated: - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - max-model-len: 2304 - max-num-batched-tokens: 2048 - max-cudagraph-capture-size: 64 - block-size: 128 - attention-backend: FLASH_ATTN - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml deleted file mode 100644 index 94df4c8ec..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml +++ /dev/null @@ -1,72 +0,0 @@ -name: "minimax-m3-vllm-agg-gb200-tp4ep4-1n-1k1k" - -# MiniMax-M3 day-zero aggregated recipe -# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). 
Aggregated-only on -# day zero: NixlConnector KV transfer has never been exercised against -# M3's MSA index cache, so disagg shapes are deferred until agg -# baselines exist. model.path uses a staged-model alias — srtctl resolves it via -# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. -# --block-size 128 is mandatory (MSA sparse/index cache alignment); the -# benchmark is text-only, so language-model-only skips the vision -# encoder. max-model-len / cudagraph capture / batched tokens are -# trimmed to the fixed-seq-len scenario instead of the 1M default. - -model: - path: "minimax-m3-mxfp8" - container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" - precision: "fp8" - -dynamo: - install: false - -slurm: - time_limit: "8:00:00" - -# 444 GB of weights off shared FS (cold HF cache on the first run): -# allow up to 2 h for engine readiness. -health_check: - max_attempts: 720 - interval_seconds: 10 - - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - agg_nodes: 1 - agg_workers: 1 - gpus_per_agg: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - aggregated_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - aggregated: - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-expert-parallel: true - max-model-len: 2304 - max-num-batched-tokens: 2048 - max-cudagraph-capture-size: 512 - block-size: 128 - attention-backend: FLASH_ATTN - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "128x256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml 
deleted file mode 100644 index 1ac2612bd..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml +++ /dev/null @@ -1,71 +0,0 @@ -name: "minimax-m3-vllm-agg-gb200-tp8-2n-1k1k" - -# MiniMax-M3 day-zero aggregated recipe -# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on -# day zero: NixlConnector KV transfer has never been exercised against -# M3's MSA index cache, so disagg shapes are deferred until agg -# baselines exist. model.path uses a staged-model alias — srtctl resolves it via -# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. -# --block-size 128 is mandatory (MSA sparse/index cache alignment); the -# benchmark is text-only, so language-model-only skips the vision -# encoder. max-model-len / cudagraph capture / batched tokens are -# trimmed to the fixed-seq-len scenario instead of the 1M default. - -model: - path: "minimax-m3-mxfp8" - container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" - precision: "fp8" - -dynamo: - install: false - -slurm: - time_limit: "8:00:00" - -# 444 GB of weights off shared FS (cold HF cache on the first run): -# allow up to 2 h for engine readiness. 
-health_check: - max_attempts: 720 - interval_seconds: 10 - - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - agg_nodes: 2 - agg_workers: 1 - gpus_per_agg: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - aggregated_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - aggregated: - tensor-parallel-size: 8 - pipeline-parallel-size: 1 - max-model-len: 2304 - max-num-batched-tokens: 2048 - max-cudagraph-capture-size: 64 - block-size: 128 - attention-backend: FLASH_ATTN - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml new file mode 100644 index 000000000..0749dbc86 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml @@ -0,0 +1,96 @@ +name: "minimax-m3-vllm-disagg-gb200-1p1d-dep4-1k1k" + +# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (max-throughput curve). +# Prefill (DEP4 = DP4 + expert-parallel, 1 node) → NixlConnector → Decode +# (DEP4, 1 node) = 2 nodes. Data-parallel attention + EP maximizes decode +# token throughput at high concurrency; engine shape mirrors the proven +# agg-gb200-dep4-1n recipe. --block-size 128 is mandatory (MSA sparse/index +# cache alignment). 
+ +model: + path: "minimax-m3-mxfp8" + container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + precision: "fp8" + +dynamo: + install: false + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 2304 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASH_ATTN + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 2304 + max-num-batched-tokens: 2048 + max-cudagraph-capture-size: 512 + block-size: 128 + attention-backend: FLASH_ATTN + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "512x1024x2048" diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml new file mode 100644 index 000000000..927066e42 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml @@ -0,0 +1,89 @@ +name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-1k1k" + +# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency curve). +# Prefill (TP4, 1 node) → NixlConnector → Decode (TP4, 1 node). Pure TP, +# no expert parallel: lowest TTFT/ITL for small concurrencies. +# --block-size 128 is mandatory (MSA sparse/index cache alignment). + +model: + path: "minimax-m3-mxfp8" + container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + precision: "fp8" + +dynamo: + install: false + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enforce-eager: true + max-model-len: 2304 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASH_ATTN + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: 
'{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 2304 + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 512 + block-size: 128 + attention-backend: FLASH_ATTN + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml new file mode 100644 index 000000000..fbb99a3dd --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml @@ -0,0 +1,92 @@ +name: "minimax-m3-vllm-disagg-gb200-1p2d-tp4ep4-1k1k" + +# MiniMax-M3 disaggregated 1P+2D recipe for GB200 (decode-scaled). +# Prefill (TP4+EP4, 1 node) → NixlConnector → 2× Decode (TP4+EP4, 1 node +# each) = 3 nodes. Two decode workers absorb more in-flight sequences for +# mid/high concurrencies while a single prefill keeps TTFT low. +# --block-size 128 is mandatory (MSA sparse/index cache alignment). 
+ +model: + path: "minimax-m3-mxfp8" + container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + precision: "fp8" + +dynamo: + install: false + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 2 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 2304 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASH_ATTN + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 2304 + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 512 + block-size: 128 + attention-backend: FLASH_ATTN + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "256x512x1024" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml new file mode 100644 index 000000000..fb27934cb --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml @@ -0,0 +1,92 @@ +name: "minimax-m3-vllm-disagg-gb200-2p1d-tp4ep4-1k1k" + +# MiniMax-M3 disaggregated 2P+1D recipe for GB200 (prefill-scaled). +# 2× Prefill (TP4+EP4, 1 node each) → NixlConnector → Decode (TP4+EP4, +# 1 node) = 3 nodes. Two prefill workers sustain the prompt-ingest rate at +# mid/high concurrencies without starving a single decode worker. +# --block-size 128 is mandatory (MSA sparse/index cache alignment). + +model: + path: "minimax-m3-mxfp8" + container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + precision: "fp8" + +dynamo: + install: false + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 1 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 2304 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASH_ATTN + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + 
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 2304 + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 512 + block-size: 128 + attention-backend: FLASH_ATTN + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "256x512x1024" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml deleted file mode 100644 index adb36f646..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml +++ /dev/null @@ -1,74 +0,0 @@ -name: "minimax-m3-vllm-agg-gb200-dep8-2n-8k1k" - -# MiniMax-M3 day-zero aggregated recipe -# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on -# day zero: NixlConnector KV transfer has never been exercised against -# M3's MSA index cache, so disagg shapes are deferred until agg -# baselines exist. model.path uses a staged-model alias — srtctl resolves it via -# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. -# --block-size 128 is mandatory (MSA sparse/index cache alignment); the -# benchmark is text-only, so language-model-only skips the vision -# encoder. max-model-len / cudagraph capture / batched tokens are -# trimmed to the fixed-seq-len scenario instead of the 1M default. 
- -model: - path: "minimax-m3-mxfp8" - container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" - precision: "fp8" - -dynamo: - install: false - -slurm: - time_limit: "8:00:00" - -# 444 GB of weights off shared FS (cold HF cache on the first run): -# allow up to 2 h for engine readiness. -health_check: - max_attempts: 720 - interval_seconds: 10 - - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - agg_nodes: 2 - agg_workers: 1 - gpus_per_agg: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - aggregated_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - aggregated: - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: 9472 - max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 128 - block-size: 128 - attention-backend: FLASH_ATTN - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml deleted file mode 100644 index 8cfbcb616..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml +++ /dev/null @@ -1,71 +0,0 @@ -name: "minimax-m3-vllm-agg-gb200-tp4-1n-8k1k" - -# MiniMax-M3 day-zero aggregated recipe -# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on -# day zero: NixlConnector KV transfer has never been exercised against -# M3's MSA index cache, so disagg shapes are deferred until agg -# baselines exist. 
model.path uses a staged-model alias — srtctl resolves it via -# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. -# --block-size 128 is mandatory (MSA sparse/index cache alignment); the -# benchmark is text-only, so language-model-only skips the vision -# encoder. max-model-len / cudagraph capture / batched tokens are -# trimmed to the fixed-seq-len scenario instead of the 1M default. - -model: - path: "minimax-m3-mxfp8" - container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" - precision: "fp8" - -dynamo: - install: false - -slurm: - time_limit: "8:00:00" - -# 444 GB of weights off shared FS (cold HF cache on the first run): -# allow up to 2 h for engine readiness. -health_check: - max_attempts: 720 - interval_seconds: 10 - - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - agg_nodes: 1 - agg_workers: 1 - gpus_per_agg: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - aggregated_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - aggregated: - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - max-model-len: 9472 - max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 64 - block-size: 128 - attention-backend: FLASH_ATTN - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml deleted file mode 100644 index 1567ca57c..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml +++ /dev/null @@ -1,72 +0,0 @@ -name: 
"minimax-m3-vllm-agg-gb200-tp4ep4-1n-8k1k" - -# MiniMax-M3 day-zero aggregated recipe -# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on -# day zero: NixlConnector KV transfer has never been exercised against -# M3's MSA index cache, so disagg shapes are deferred until agg -# baselines exist. model.path uses a staged-model alias — srtctl resolves it via -# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. -# --block-size 128 is mandatory (MSA sparse/index cache alignment); the -# benchmark is text-only, so language-model-only skips the vision -# encoder. max-model-len / cudagraph capture / batched tokens are -# trimmed to the fixed-seq-len scenario instead of the 1M default. - -model: - path: "minimax-m3-mxfp8" - container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" - precision: "fp8" - -dynamo: - install: false - -slurm: - time_limit: "8:00:00" - -# 444 GB of weights off shared FS (cold HF cache on the first run): -# allow up to 2 h for engine readiness. 
-health_check: - max_attempts: 720 - interval_seconds: 10 - - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - agg_nodes: 1 - agg_workers: 1 - gpus_per_agg: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - aggregated_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - aggregated: - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-expert-parallel: true - max-model-len: 9472 - max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 256 - block-size: 128 - attention-backend: FLASH_ATTN - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "128x256" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 8ab05189e..5327dbd02 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3652,12 +3652,12 @@ - config-keys: - minimaxm3-fp8-gb200-dynamo-vllm description: - - "Initial submission: MiniMax-M3 MXFP8 day-zero vLLM sweep for GB200 via Dynamo" + - "Initial submission: MiniMax-M3 MXFP8 fully-disaggregated vLLM sweep for GB200 via Dynamo" - "Model: MiniMaxAI/MiniMax-M3-MXFP8 (427B total / 26B active MoE, MSA sparse attention, ~444 GB MXFP8 checkpoint)" - - "Image: vllm/vllm-openai:minimax-m3 (multi-arch arm64+amd64 from m3_release branch, vllm-project/vllm#45381)" - - "Dynamo orchestration with NixlConnector for disaggregated prefill/decode" - - "6 topologies: TP4 (1n), TP8 (2n), TP4+EP4 (1n), 1P+1D disagg TP4+EP4 (2n), DEP4 (1n), DEP8 (2n)" - - "Concurrency sweep: TP 4-64, TEP 128-512, disagg 64-512, DEP4 256-1024, DEP8 512-2048" + - "Image: nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1 (Dynamo 1.3.0 pre-installed; dynamo.install=false)" + - "Disaggregated prefill/decode over NixlConnector; every worker = 
one 4-GPU node (444 GB checkpoint can't fit in fewer)" + - "5 disagg topologies: 1P1D TP4 (2n), 1P1D TP4+EP4 (2n), 1P2D TP4+EP4 (3n), 2P1D TP4+EP4 (3n), 1P1D DEP4 (2n)" + - "Concurrency sweep: TP4 4-64, TP4EP4 64-512, 1P2D/2P1D 256-1024, DEP4 512-2048" - "--block-size 128 mandatory everywhere (MSA sparse/index cache alignment); --language-model-only for text-only benchmarks" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734 From 803cd20f243bb841b2013364af932e6aa9690850 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 13 Jun 2026 14:19:36 -0700 Subject: [PATCH 6/8] fix: restore NIXL-bearing image for M3 GB200 disagg + enable MNNVL KV transfer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Run 27478698552 failed: every disagg worker crashed at NixlConnector init with "NIXL is not available" (RuntimeError, vllm .../nixl/worker.py:248). The ai-dynamo vllm-runtime:1.3.0-minimax-m3-dev.1 image ships dynamo but NOT the nixl bindings (cupy missing too), so kv_connector=NixlConnector cannot initialize and the engine core never becomes healthy. 
Revert to the pre-ed63c1e0 runtime path that pulls NIXL in via the dynamo wheel (same as the working minimaxm2.5-gb200 disagg recipes): - image/container: vllm/vllm-openai:minimax-m3 (the m3_release build all other m3 entries already use) - dynamo.install=true + wheel 1.2.0.dev20260526 (nixl is a dynamo dep) - keep attention-backend FLASH_ATTN (added in the image-switch commit) Also enable NVLink (MNNVL) KV transfer so NIXL doesn't fall back to TCP, mirroring the deepseek-v4 gb200 disagg recipes — on every prefill/decode env block: UCX_TLS=cuda_copy,cuda_ipc,tcp UCX_CUDA_IPC_ENABLE_MNNVL=y UCX_MEMTYPE_CACHE=n / UCX_MEMTYPE_REG_WHOLE=n NCCL_CUMEM_ENABLE=1 (cuMem-allocate buffers so they are IPC-exportable) Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/configs/nvidia-master.yaml | 18 +++++++++------- .../1k1k/disagg-gb200-1p1d-dep4-2n.yaml | 21 +++++++++++++++++-- .../1k1k/disagg-gb200-1p1d-tp4-2n.yaml | 21 +++++++++++++++++-- .../1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml | 21 +++++++++++++++++-- .../1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml | 21 +++++++++++++++++-- .../1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml | 21 +++++++++++++++++-- .../8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml | 21 +++++++++++++++++-- perf-changelog.yaml | 4 ++-- 8 files changed, 126 insertions(+), 22 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index b0b99d53f..f246f518a 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11776,17 +11776,19 @@ minimaxm2.5-fp8-gb300-dynamo-vllm: # MiniMax-M3 (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX -# tensor cores on Blackwell. Image is the NVIDIA Dynamo vLLM runtime -# (nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1) with -# dynamo 1.3.0 pre-installed, so recipes set dynamo.install=false. 
+# tensor cores on Blackwell. Image is the multi-arch m3_release vLLM build +# (vllm/vllm-openai:minimax-m3, vllm-project/vllm#45381); recipes set +# dynamo.install=true + wheel 1.2.0.dev20260526 so the dynamo runtime AND +# NIXL are layered in at job start (the ai-dynamo vllm-runtime dev image +# shipped without NIXL, so disagg workers crashed at NixlConnector init). # Engine args mirror the canonical recipe (ai-dynamo/dynamo # recipes/minimax-m3/vllm/disagg/MXFP8): block-size 128 + FLASH_ATTN. -# Fully disaggregated GB200 sweep (NixlConnector prefill/decode split, -# every worker = one 4-GPU node since the 444 GB checkpoint can't fit in -# fewer): 1P1D TP4 (low conc), 1P1D TP4+EP4 (mid), 1P2D / 2P1D TP4+EP4 -# (decode- and prefill-scaled), 1P1D DEP4 (max throughput). +# Fully disaggregated GB200 sweep (NixlConnector prefill/decode split over +# the NVL72 NVLink fabric; every worker = one 4-GPU node since the 444 GB +# checkpoint can't fit in fewer): 1P1D TP4 (low conc), 1P1D TP4+EP4 (mid), +# 1P2D / 2P1D TP4+EP4 (decode- and prefill-scaled), 1P1D DEP4 (max tput). 
minimaxm3-fp8-gb200-dynamo-vllm: - image: nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1 + image: vllm/vllm-openai:minimax-m3 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: gb200 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml index 0749dbc86..4b56e9e6f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml @@ -9,11 +9,12 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-dep4-1k1k" model: path: "minimax-m3-mxfp8" - container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "vllm/vllm-openai:minimax-m3" precision: "fp8" dynamo: - install: false + install: true + wheel: "1.2.0.dev20260526" slurm: time_limit: "8:00:00" @@ -44,10 +45,26 @@ backend: prefill_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. 
+ UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" vllm_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml index 927066e42..558c5d894 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml @@ -7,11 +7,12 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-1k1k" model: path: "minimax-m3-mxfp8" - container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "vllm/vllm-openai:minimax-m3" precision: "fp8" dynamo: - install: false + install: true + wheel: "1.2.0.dev20260526" slurm: time_limit: "8:00:00" @@ -42,10 +43,26 @@ backend: prefill_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. 
+ UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" vllm_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml index 4f9c01c6b..eeefc68c1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml @@ -6,11 +6,12 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-1k1k" model: path: "minimax-m3-mxfp8" - container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "vllm/vllm-openai:minimax-m3" precision: "fp8" dynamo: - install: false + install: true + wheel: "1.2.0.dev20260526" slurm: time_limit: "8:00:00" @@ -41,10 +42,26 @@ backend: prefill_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. 
+ UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" vllm_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml index fbb99a3dd..02d9bd98e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml @@ -8,11 +8,12 @@ name: "minimax-m3-vllm-disagg-gb200-1p2d-tp4ep4-1k1k" model: path: "minimax-m3-mxfp8" - container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "vllm/vllm-openai:minimax-m3" precision: "fp8" dynamo: - install: false + install: true + wheel: "1.2.0.dev20260526" slurm: time_limit: "8:00:00" @@ -43,10 +44,26 @@ backend: prefill_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. 
+ UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" vllm_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml index fb27934cb..4a440766a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml @@ -8,11 +8,12 @@ name: "minimax-m3-vllm-disagg-gb200-2p1d-tp4ep4-1k1k" model: path: "minimax-m3-mxfp8" - container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "vllm/vllm-openai:minimax-m3" precision: "fp8" dynamo: - install: false + install: true + wheel: "1.2.0.dev20260526" slurm: time_limit: "8:00:00" @@ -43,10 +44,26 @@ backend: prefill_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. 
+ UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" vllm_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml index 86d48468a..c14b9fb3b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml @@ -6,11 +6,12 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-8k1k" model: path: "minimax-m3-mxfp8" - container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "vllm/vllm-openai:minimax-m3" precision: "fp8" dynamo: - install: false + install: true + wheel: "1.2.0.dev20260526" slurm: time_limit: "8:00:00" @@ -41,10 +42,26 @@ backend: prefill_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. 
+ UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" vllm_config: prefill: diff --git a/perf-changelog.yaml b/perf-changelog.yaml index be638f5f1..627ed5bb1 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3654,8 +3654,8 @@ description: - "Initial submission: MiniMax-M3 MXFP8 fully-disaggregated vLLM sweep for GB200 via Dynamo" - "Model: MiniMaxAI/MiniMax-M3-MXFP8 (427B total / 26B active MoE, MSA sparse attention, ~444 GB MXFP8 checkpoint)" - - "Image: nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1 (Dynamo 1.3.0 pre-installed; dynamo.install=false)" - - "Disaggregated prefill/decode over NixlConnector; every worker = one 4-GPU node (444 GB checkpoint can't fit in fewer)" + - "Image: vllm/vllm-openai:minimax-m3 (multi-arch m3_release build, vllm-project/vllm#45381); dynamo.install=true + wheel 1.2.0.dev20260526 layers in the dynamo runtime + NIXL" + - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y + NCCL_CUMEM_ENABLE=1, cuda_ipc UCX_TLS); every worker = one 4-GPU node (444 GB checkpoint can't fit in fewer)" - "5 disagg topologies: 1P1D TP4 (2n), 1P1D TP4+EP4 (2n), 1P2D TP4+EP4 (3n), 2P1D TP4+EP4 (3n), 1P1D DEP4 (2n)" - "Concurrency sweep: TP4 4-64, TP4EP4 64-512, 1P2D/2P1D 256-1024, DEP4 512-2048" - "--block-size 128 mandatory everywhere (MSA sparse/index cache alignment); --language-model-only for text-only benchmarks" From 1320056380a6f095211fbbb016a9fcc57fdbfbb6 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 13 Jun 2026 20:04:13 -0700 Subject: [PATCH 7/8] feat: rack-scale wide-EP (DeepSeek megamoe) M3 GB200 disagg + FLASHINFER The narrow DEP8-max sweep showed no GB200 advantage over B200 because both cap at an 8-GPU NVLink island. 
Exploit NVL72's rack-scale NVLink with wide expert parallelism spanning multiple nodes, mirroring the deepseek-v4 "megamoe" ladder (DEP = data-parallel attention + expert-parallel): - 1P1D TP4 (2n) low-latency, conc 4-64 - 1P1D DEP8 (4n) mid, EP8/16-experts-per-rank, conc 128-512 - 1P1D DEP8->DEP16 (6n) wide decode (EP16), conc 512-2048 - 2P1D DEP8->DEP16 (8n) prefill-scaled, conc 2048-4096 - 4P1D DEP8->DEP16 (12n) max throughput, conc 4096-8192 M3 has 128 routed experts (top-4), so EP8/EP16 shard cleanly. EP16 across 16 GPU / 4 nodes is the regime B200 physically can't reach. Attention: FLASH_ATTN -> FLASHINFER (trtllm-gen) on all GB200 recipes to exploit Blackwell. Requires the :minimax-m3 image rebuilt from m3_release HEAD 022448dd (vllm-project/vllm#45381), which gates trtllm-gen page>=128. Also add GB200 perf/NVLink-KV knobs from the deepseek-v4 reference: numa-bind (Grace) and enable-sleep-mode (cuMem allocator so the KV cache is IPC-exportable over the MNNVL fabric), alongside the existing UCX MNNVL env. Replaces the four narrow EP4 recipes; keeps 1P1D TP4 for low latency. Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/configs/nvidia-master.yaml | 95 +++++++++------- ...3n.yaml => disagg-gb200-1p1d-dep8-4n.yaml} | 45 +++++--- ...l => disagg-gb200-1p1d-dep8-dep16-6n.yaml} | 36 +++--- .../1k1k/disagg-gb200-1p1d-tp4-2n.yaml | 17 ++- ...l => disagg-gb200-2p1d-dep8-dep16-8n.yaml} | 42 ++++--- ... 
=> disagg-gb200-4p1d-dep8-dep16-12n.yaml} | 44 ++++--- .../8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml | 107 ------------------ perf-changelog.yaml | 15 +-- 8 files changed, 170 insertions(+), 231 deletions(-) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-1p2d-tp4ep4-3n.yaml => disagg-gb200-1p1d-dep8-4n.yaml} (69%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-1p1d-dep4-2n.yaml => disagg-gb200-1p1d-dep8-dep16-6n.yaml} (77%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-2p1d-tp4ep4-3n.yaml => disagg-gb200-2p1d-dep8-dep16-8n.yaml} (74%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-1p1d-tp4ep4-2n.yaml => disagg-gb200-4p1d-dep8-dep16-12n.yaml} (71%) delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index f246f518a..70ec293af 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11781,12 +11781,16 @@ minimaxm2.5-fp8-gb300-dynamo-vllm: # dynamo.install=true + wheel 1.2.0.dev20260526 so the dynamo runtime AND # NIXL are layered in at job start (the ai-dynamo vllm-runtime dev image # shipped without NIXL, so disagg workers crashed at NixlConnector init). -# Engine args mirror the canonical recipe (ai-dynamo/dynamo -# recipes/minimax-m3/vllm/disagg/MXFP8): block-size 128 + FLASH_ATTN. -# Fully disaggregated GB200 sweep (NixlConnector prefill/decode split over -# the NVL72 NVLink fabric; every worker = one 4-GPU node since the 444 GB -# checkpoint can't fit in fewer): 1P1D TP4 (low conc), 1P1D TP4+EP4 (mid), -# 1P2D / 2P1D TP4+EP4 (decode- and prefill-scaled), 1P1D DEP4 (max tput). 
+# block-size 128 mandatory (MSA index-cache alignment); FLASHINFER +# (trtllm-gen) attention to exploit Blackwell — needs vllm#45381 @ 022448dd +# (m3_release HEAD: gates page>=128 on trtllm-gen GQA), so rebuild the image +# from m3_release before running. Fully disaggregated, rack-scale wide-EP +# GB200 sweep (NixlConnector P/D split over the NVL72 NVLink fabric). Mirrors +# the deepseek-v4 "megamoe" ladder: DEP unit = DP-attn + expert-parallel +# (DEP8 = 8 GPU / 2 nodes, DEP16 = 16 GPU / 4 nodes), with prefill workers +# scaled 1P->4P. EP8/EP16 vs B200's 8-GPU NVLink island is the GB200 edge. +# 1P1D TP4 (low conc), 1P1D DEP8 (mid), 1P1D DEP8->DEP16 (wide decode), +# 2P1D / 4P1D DEP8->DEP16 (prefill-scaled max throughput). M3 = 128 experts. minimaxm3-fp8-gb200-dynamo-vllm: image: vllm/vllm-openai:minimax-m3 model: MiniMaxAI/MiniMax-M3-MXFP8 @@ -11801,7 +11805,8 @@ minimaxm3-fp8-gb200-dynamo-vllm: - isl: 1024 osl: 1024 search-space: - # Low latency: 1P+1D disagg TP4 (pure TP, no EP), 2 nodes (4 GPU each). + # Low latency: 1P+1D TP4 (pure TP, no EP), 2 nodes (4 GPU each). Wide EP + # would idle DP ranks at small concurrencies, so stay narrow here. - conc-list: [4, 8, 16, 32, 64] prefill: num-worker: 1 @@ -11816,64 +11821,68 @@ minimaxm3-fp8-gb200-dynamo-vllm: ep: 1 dp-attn: false - # Mid curve: 1P+1D disagg TP4+EP4, 2 nodes (4 GPU each). - - conc-list: [64, 128, 256, 512] + # Mid curve: 1P+1D DEP8 (DP-attn + EP8), 2 nodes prefill + 2 nodes + # decode = 4 nodes. First rung of rack-scale EP (16 experts/rank). 
+ - conc-list: [128, 256, 512] prefill: num-worker: 1 - tp: 4 - ep: 4 - dp-attn: false + tp: 8 + ep: 8 + dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml" decode: num-worker: 1 - tp: 4 - ep: 4 - dp-attn: false + tp: 8 + ep: 8 + dp-attn: true - # Decode-scaled: 1P+2D disagg TP4+EP4, 3 nodes (4 GPU each). - - conc-list: [256, 512, 1024] + # Wide decode: 1P+1D DEP8 prefill -> DEP16 decode (EP16 across 16 GPU / + # 4 nodes) = 6 nodes. Rack-scale decode throughput over NVL72 NVLink. + - conc-list: [512, 1024, 2048] prefill: num-worker: 1 - tp: 4 - ep: 4 - dp-attn: false + tp: 8 + ep: 8 + dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml" decode: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: false + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true - # Prefill-scaled: 2P+1D disagg TP4+EP4, 3 nodes (4 GPU each). - - conc-list: [256, 512, 1024] + # Prefill-scaled: 2P+1D, 2x DEP8 prefill (4 nodes) -> DEP16 decode + # (4 nodes) = 8 nodes. + - conc-list: [2048, 4096] prefill: num-worker: 2 - tp: 4 - ep: 4 - dp-attn: false + tp: 8 + ep: 8 + dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml" decode: num-worker: 1 - tp: 4 - ep: 4 - dp-attn: false + tp: 16 + ep: 16 + dp-attn: true - # Max throughput: 1P+1D disagg DEP4 (DP4+EP), 2 nodes (4 GPU each). - - conc-list: [512, 1024, 2048] + # Max throughput: 4P+1D, 4x DEP8 prefill (8 nodes) -> DEP16 decode + # (4 nodes) = 12 nodes within one NVL72 rack. 
+ - conc-list: [4096, 8192] prefill: - num-worker: 1 - tp: 1 - ep: 4 + num-worker: 4 + tp: 8 + ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml" decode: num-worker: 1 - tp: 1 - ep: 4 + tp: 16 + ep: 16 dp-attn: true # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml similarity index 69% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml index 02d9bd98e..efc5d5740 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml @@ -1,10 +1,11 @@ -name: "minimax-m3-vllm-disagg-gb200-1p2d-tp4ep4-1k1k" +name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-1k1k" -# MiniMax-M3 disaggregated 1P+2D recipe for GB200 (decode-scaled). -# Prefill (TP4+EP4, 1 node) → NixlConnector → 2× Decode (TP4+EP4, 1 node -# each) = 3 nodes. Two decode workers absorb more in-flight sequences for -# mid/high concurrencies while a single prefill keeps TTFT low. -# --block-size 128 is mandatory (MSA sparse/index cache alignment). +# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (mid curve, wide EP). +# Prefill DEP8 (DP-attn + EP across 8 GPU / 2 nodes) -> NixlConnector -> +# Decode DEP8 (8 GPU / 2 nodes) = 4 nodes. Rack-scale expert parallel +# over the NVL72 NVLink fabric -- the regime where GB200 pulls ahead of +# B200 (capped at an 8-GPU NVLink island). 
M3 has 128 routed experts so +# EP8 shards 16 experts/rank. FLASHINFER attention, block-size 128. model: path: "minimax-m3-mxfp8" @@ -26,12 +27,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 1 + prefill_nodes: 2 decode_nodes: 2 prefill_workers: 1 - decode_workers: 2 - gpus_per_prefill: 4 - gpus_per_decode: 4 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 frontend: type: dynamo @@ -68,42 +69,50 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 + tensor-parallel-size: 1 pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13346 enable-expert-parallel: true enforce-eager: true max-model-len: 2304 max-num-seqs: 16 max-num-batched-tokens: 16384 block-size: 128 - attention-backend: FLASH_ATTN + attention-backend: FLASHINFER language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true stream-interval: 32 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 + tensor-parallel-size: 1 pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 enable-expert-parallel: true max-model-len: 2304 - max-num-seqs: 256 - max-num-batched-tokens: 256 + max-num-seqs: 512 + max-num-batched-tokens: 512 max-cudagraph-capture-size: 512 block-size: 128 - attention-backend: FLASH_ATTN + attention-backend: FLASHINFER language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true - stream-interval: 32 + numa-bind: true + enable-sleep-mode: true + stream-interval: 128 benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "256x512x1024" + concurrencies: "128x256x512" diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml similarity index 77% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml index 4b56e9e6f..5ca08a06d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml @@ -1,11 +1,10 @@ -name: "minimax-m3-vllm-disagg-gb200-1p1d-dep4-1k1k" +name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-dep16-1k1k" -# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (max-throughput curve). -# Prefill (DEP4 = DP4 + expert-parallel, 1 node) → NixlConnector → Decode -# (DEP4, 1 node) = 2 nodes. Data-parallel attention + EP maximizes decode -# token throughput at high concurrency; engine shape mirrors the proven -# agg-gb200-dep4-1n recipe. --block-size 128 is mandatory (MSA sparse/index -# cache alignment). +# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (wide-decode curve). +# Prefill DEP8 (8 GPU / 2 nodes) -> NixlConnector -> Decode DEP16 (DP-attn +# + EP across 16 GPU / 4 nodes) = 6 nodes. EP16 (8 experts/rank of 128) +# spans the NVL72 fabric to maximize decode token throughput. FLASHINFER +# attention, block-size 128. 
model: path: "minimax-m3-mxfp8" @@ -27,12 +26,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 + prefill_nodes: 2 + decode_nodes: 4 prefill_workers: 1 decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 4 + gpus_per_prefill: 8 + gpus_per_decode: 16 frontend: type: dynamo @@ -71,7 +70,7 @@ backend: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 1 pipeline-parallel-size: 1 - data-parallel-size: 4 + data-parallel-size: 8 data-parallel-rpc-port: 13346 enable-expert-parallel: true enforce-eager: true @@ -79,31 +78,36 @@ backend: max-num-seqs: 16 max-num-batched-tokens: 16384 block-size: 128 - attention-backend: FLASH_ATTN + attention-backend: FLASHINFER language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true stream-interval: 32 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 1 pipeline-parallel-size: 1 - data-parallel-size: 4 + data-parallel-size: 16 data-parallel-rpc-port: 13345 enable-expert-parallel: true max-model-len: 2304 - max-num-batched-tokens: 2048 + max-num-seqs: 512 + max-num-batched-tokens: 512 max-cudagraph-capture-size: 512 block-size: 128 - attention-backend: FLASH_ATTN + attention-backend: FLASHINFER language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true stream-interval: 128 benchmark: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml index 558c5d894..b60b17515 100644 --- 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml @@ -1,9 +1,10 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-1k1k" # MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency curve). -# Prefill (TP4, 1 node) → NixlConnector → Decode (TP4, 1 node). Pure TP, -# no expert parallel: lowest TTFT/ITL for small concurrencies. -# --block-size 128 is mandatory (MSA sparse/index cache alignment). +# Prefill (TP4, 1 node) -> NixlConnector -> Decode (TP4, 1 node). Pure +# TP, no expert parallel: lowest TTFT/ITL at small concurrencies where +# wide EP would leave DP ranks idle. FLASHINFER (trtllm-gen) attention, +# block-size 128 (MSA index-cache alignment, vllm#45381 @ 022448dd). model: path: "minimax-m3-mxfp8" @@ -74,12 +75,14 @@ backend: max-num-seqs: 16 max-num-batched-tokens: 16384 block-size: 128 - attention-backend: FLASH_ATTN + attention-backend: FLASHINFER language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true stream-interval: 32 decode: @@ -91,13 +94,15 @@ backend: max-num-batched-tokens: 256 max-cudagraph-capture-size: 512 block-size: 128 - attention-backend: FLASH_ATTN + attention-backend: FLASHINFER language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true - stream-interval: 32 + numa-bind: true + enable-sleep-mode: true + stream-interval: 128 benchmark: type: "sa-bench" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml similarity index 74% rename from 
benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml index 4a440766a..853095727 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml @@ -1,10 +1,10 @@ -name: "minimax-m3-vllm-disagg-gb200-2p1d-tp4ep4-1k1k" +name: "minimax-m3-vllm-disagg-gb200-2p1d-dep8-dep16-1k1k" # MiniMax-M3 disaggregated 2P+1D recipe for GB200 (prefill-scaled). -# 2× Prefill (TP4+EP4, 1 node each) → NixlConnector → Decode (TP4+EP4, -# 1 node) = 3 nodes. Two prefill workers sustain the prompt-ingest rate at -# mid/high concurrencies without starving a single decode worker. -# --block-size 128 is mandatory (MSA sparse/index cache alignment). +# 2x Prefill DEP8 (8 GPU / 2 nodes each) -> NixlConnector -> Decode DEP16 +# (16 GPU / 4 nodes) = 8 nodes. Two wide prefill workers sustain prompt +# ingest into a single wide decode at high concurrency. FLASHINFER +# attention, block-size 128. 
model: path: "minimax-m3-mxfp8" @@ -26,12 +26,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 1 + prefill_nodes: 4 + decode_nodes: 4 prefill_workers: 2 decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 4 + gpus_per_prefill: 8 + gpus_per_decode: 16 frontend: type: dynamo @@ -68,42 +68,50 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 + tensor-parallel-size: 1 pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13346 enable-expert-parallel: true enforce-eager: true max-model-len: 2304 max-num-seqs: 16 max-num-batched-tokens: 16384 block-size: 128 - attention-backend: FLASH_ATTN + attention-backend: FLASHINFER language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true stream-interval: 32 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 + tensor-parallel-size: 1 pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 enable-expert-parallel: true max-model-len: 2304 - max-num-seqs: 256 - max-num-batched-tokens: 256 + max-num-seqs: 512 + max-num-batched-tokens: 512 max-cudagraph-capture-size: 512 block-size: 128 - attention-backend: FLASH_ATTN + attention-backend: FLASHINFER language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true - stream-interval: 32 + numa-bind: true + enable-sleep-mode: true + stream-interval: 128 benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "256x512x1024" + concurrencies: "2048x4096" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml similarity index 71% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml index eeefc68c1..4a6aa5d0f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml @@ -1,8 +1,10 @@ -name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-1k1k" +name: "minimax-m3-vllm-disagg-gb200-4p1d-dep8-dep16-1k1k" -# MiniMax-M3 disaggregated 1P+1D recipe for GB200. -# Prefill (TP4+EP4, 1 node) → NixlConnector → Decode (TP4+EP4, 1 node). -# --block-size 128 is mandatory (MSA sparse/index cache alignment). +# MiniMax-M3 disaggregated 4P+1D recipe for GB200 (max throughput). +# 4x Prefill DEP8 (8 GPU / 2 nodes each = 8 nodes) -> NixlConnector -> +# Decode DEP16 (16 GPU / 4 nodes) = 12 nodes within one NVL72 rack. Max +# prefill fan-in for the highest-concurrency points. FLASHINFER attention, +# block-size 128. 
model: path: "minimax-m3-mxfp8" @@ -24,12 +26,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 + prefill_nodes: 8 + decode_nodes: 4 + prefill_workers: 4 decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 4 + gpus_per_prefill: 8 + gpus_per_decode: 16 frontend: type: dynamo @@ -66,42 +68,50 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 + tensor-parallel-size: 1 pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13346 enable-expert-parallel: true enforce-eager: true max-model-len: 2304 max-num-seqs: 16 max-num-batched-tokens: 16384 block-size: 128 - attention-backend: FLASH_ATTN + attention-backend: FLASHINFER language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true stream-interval: 32 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 + tensor-parallel-size: 1 pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 enable-expert-parallel: true max-model-len: 2304 - max-num-seqs: 256 - max-num-batched-tokens: 256 + max-num-seqs: 512 + max-num-batched-tokens: 512 max-cudagraph-capture-size: 512 block-size: 128 - attention-backend: FLASH_ATTN + attention-backend: FLASHINFER language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true - stream-interval: 32 + numa-bind: true + enable-sleep-mode: true + stream-interval: 128 benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "64x128x256x512" + concurrencies: "4096x8192" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml deleted file mode 100644 index c14b9fb3b..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml +++ /dev/null @@ -1,107 +0,0 @@ -name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-8k1k" - -# MiniMax-M3 disaggregated 1P+1D recipe for GB200, 8k1k. -# Prefill (TP4+EP4, 1 node) → NixlConnector → Decode (TP4+EP4, 1 node). -# --block-size 128 is mandatory (MSA sparse/index cache alignment). - -model: - path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead - # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and - # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead - # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and - # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. 
- UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 9472 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - block-size: 128 - attention-backend: FLASH_ATTN - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-expert-parallel: true - max-model-len: 9472 - max-num-seqs: 256 - max-num-batched-tokens: 256 - max-cudagraph-capture-size: 512 - block-size: 128 - attention-backend: FLASH_ATTN - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "64x128x256" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 627ed5bb1..295a8e694 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3652,13 +3652,14 @@ - config-keys: - minimaxm3-fp8-gb200-dynamo-vllm description: - - "Initial submission: MiniMax-M3 MXFP8 fully-disaggregated vLLM sweep for GB200 via Dynamo" - - "Model: MiniMaxAI/MiniMax-M3-MXFP8 (427B total / 26B active MoE, MSA sparse attention, ~444 GB MXFP8 checkpoint)" - - "Image: vllm/vllm-openai:minimax-m3 (multi-arch m3_release build, vllm-project/vllm#45381); dynamo.install=true + wheel 1.2.0.dev20260526 layers in the dynamo runtime + NIXL" - - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y + 
NCCL_CUMEM_ENABLE=1, cuda_ipc UCX_TLS); every worker = one 4-GPU node (444 GB checkpoint can't fit in fewer)" - - "5 disagg topologies: 1P1D TP4 (2n), 1P1D TP4+EP4 (2n), 1P2D TP4+EP4 (3n), 2P1D TP4+EP4 (3n), 1P1D DEP4 (2n)" - - "Concurrency sweep: TP4 4-64, TP4EP4 64-512, 1P2D/2P1D 256-1024, DEP4 512-2048" - - "--block-size 128 mandatory everywhere (MSA sparse/index cache alignment); --language-model-only for text-only benchmarks" + - "Initial submission: MiniMax-M3 MXFP8 disaggregated rack-scale wide-EP vLLM sweep for GB200 via Dynamo" + - "Model: MiniMaxAI/MiniMax-M3-MXFP8 (427B total / 26B active MoE, 128 routed experts top-4, MSA sparse attention, ~444 GB MXFP8 checkpoint)" + - "Image: vllm/vllm-openai:minimax-m3, rebuilt from m3_release HEAD 022448dd (vllm-project/vllm#45381, gates trtllm-gen page>=128); dynamo.install=true + wheel 1.2.0.dev20260526 layers in the dynamo runtime + NIXL" + - "FLASHINFER (trtllm-gen) attention on Blackwell + block-size 128 (MSA index-cache alignment); --language-model-only for text-only benchmarks" + - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y, cuda_ipc UCX_TLS, NCCL_CUMEM_ENABLE=1 + enable-sleep-mode cuMem allocator, numa-bind)" + - "Rack-scale wide expert-parallel (deepseek-v4 megamoe style; DEP = DP-attn + EP): EP8/EP16 spans multiple GB200 nodes vs B200's 8-GPU NVLink island" + - "5 topologies: 1P1D TP4 (2n, low-lat), 1P1D DEP8 (4n), 1P1D DEP8->DEP16 (6n, wide decode), 2P1D DEP8->DEP16 (8n), 4P1D DEP8->DEP16 (12n, max tput)" + - "Concurrency sweep: TP4 4-64, DEP8 128-512, DEP8->DEP16 512-2048, 2P1D 2048-4096, 4P1D 4096-8192" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734 - config-keys: From c8cd5670cc1878c9d9109c8b212c2e02adb7eb98 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 13 Jun 2026 23:00:11 -0700 Subject: [PATCH 8/8] feat: tune 1k1k low-conc latency + add 8k1k sweep for M3 GB200 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1k1k TP4 low-conc tuning: stream-interval 1 (was 128 decode / 32 prefill), cudagraph cap 128 (was 512), conc range extended to 1-64 (was 4-64) to match B200 coverage. 8k1k sweep: 5 disagg recipes mirroring the 1k1k megamoe ladder (TP4, DEP8, DEP8→DEP16, 2P1D, 4P1D) with max-model-len 9472 (74×128 blocks = ISL+OSL+256 headroom). Concurrencies shifted ~4x lower for 8x heavier prefill: TP4 1-16, DEP8 32-128, DEP8→DEP16 128-512, 2P1D 512-1024, 4P1D 1024-2048. Co-Authored-By: Claude Opus 4.6 --- .github/configs/nvidia-master.yaml | 81 ++++++++++++- .../1k1k/disagg-gb200-1p1d-tp4-2n.yaml | 8 +- .../8k1k/disagg-gb200-1p1d-dep8-4n.yaml | 111 ++++++++++++++++++ .../8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml | 111 ++++++++++++++++++ .../8k1k/disagg-gb200-1p1d-tp4-2n.yaml | 106 +++++++++++++++++ .../8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml | 110 +++++++++++++++++ .../disagg-gb200-4p1d-dep8-dep16-12n.yaml | 110 +++++++++++++++++ perf-changelog.yaml | 3 +- 8 files changed, 634 insertions(+), 6 deletions(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 70ec293af..32957e282 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11807,7 +11807,7 @@ minimaxm3-fp8-gb200-dynamo-vllm: search-space: # Low 
latency: 1P+1D TP4 (pure TP, no EP), 2 nodes (4 GPU each). Wide EP # would idle DP ranks at small concurrencies, so stay narrow here. - - conc-list: [4, 8, 16, 32, 64] + - conc-list: [1, 2, 4, 8, 16, 32, 64] prefill: num-worker: 1 tp: 4 @@ -11885,6 +11885,85 @@ minimaxm3-fp8-gb200-dynamo-vllm: ep: 16 dp-attn: true + - isl: 8192 + osl: 1024 + search-space: + # Low latency 8k1k: 1P+1D TP4, 2 nodes. stream-interval 1 + cudagraph + # cap 128 for best interactivity at small concurrencies. + - conc-list: [1, 2, 4, 8, 16] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + + # Mid curve 8k1k: 1P+1D DEP8, 4 nodes. + - conc-list: [32, 64, 128] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + + # Wide decode 8k1k: DEP8 prefill -> DEP16 decode, 6 nodes. + - conc-list: [128, 256, 512] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + + # Prefill-scaled 8k1k: 2P+1D, 8 nodes. Double prefill absorbs 8k ISL. + - conc-list: [512, 1024] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + + # Max throughput 8k1k: 4P+1D, 12 nodes. 
+ - conc-list: [1024, 2048] + prefill: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml index b60b17515..f3e79340a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml @@ -83,7 +83,7 @@ backend: no-enable-prefix-caching: true numa-bind: true enable-sleep-mode: true - stream-interval: 32 + stream-interval: 1 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -92,7 +92,7 @@ backend: max-model-len: 2304 max-num-seqs: 256 max-num-batched-tokens: 256 - max-cudagraph-capture-size: 512 + max-cudagraph-capture-size: 128 block-size: 128 attention-backend: FLASHINFER language-model-only: true @@ -102,10 +102,10 @@ backend: no-enable-prefix-caching: true numa-bind: true enable-sleep-mode: true - stream-interval: 128 + stream-interval: 1 benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "4x8x16x32x64" + concurrencies: "1x2x4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml new file mode 100644 index 000000000..f6f2c7874 --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml @@ -0,0 +1,111 @@ +name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-8k1k" + +# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (mid curve, 8k1k). +# Prefill DEP8 (DP-attn + EP across 8 GPU / 2 nodes) -> NixlConnector -> +# Decode DEP8 (8 GPU / 2 nodes) = 4 nodes. Rack-scale expert parallel +# over the NVL72 NVLink fabric. M3 has 128 routed experts so EP8 shards +# 16 experts/rank. FLASHINFER attention, block-size 128. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 9472 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + 
gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 9472 + max-num-seqs: 512 + max-num-batched-tokens: 512 + max-cudagraph-capture-size: 512 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "32x64x128" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml new file mode 100644 index 000000000..0d7d44843 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml @@ -0,0 +1,111 @@ +name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-dep16-8k1k" + +# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (wide decode, 8k1k). +# Prefill DEP8 (8 GPU / 2 nodes) -> NixlConnector -> Decode DEP16 +# (16 GPU / 4 nodes) = 6 nodes. Rack-scale decode throughput over NVL72 +# NVLink -- EP16 across 4 nodes is the regime B200 can't reach. M3 has +# 128 routed experts: EP16 = 8 experts/rank. FLASHINFER, block-size 128. 
+ +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 9472 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 9472 + max-num-seqs: 512 + max-num-batched-tokens: 512 + max-cudagraph-capture-size: 512 
+ block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "128x256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml new file mode 100644 index 000000000..b0602354c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml @@ -0,0 +1,106 @@ +name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-8k1k" + +# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency, 8k1k). +# Prefill (TP4, 1 node) -> NixlConnector -> Decode (TP4, 1 node). Pure +# TP, no expert parallel: lowest TTFT/ITL at small concurrencies where +# wide EP would leave DP ranks idle. FLASHINFER (trtllm-gen) attention, +# block-size 128 (MSA index-cache alignment, vllm#45381 @ 022448dd). +# Low-conc tuned: stream-interval 1, cudagraph cap 128. 
+ +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enforce-eager: true + max-model-len: 9472 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 1 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 9472 + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 128 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + 
no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 1 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x2x4x8x16" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml new file mode 100644 index 000000000..6a0765c60 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml @@ -0,0 +1,110 @@ +name: "minimax-m3-vllm-disagg-gb200-2p1d-dep8-dep16-8k1k" + +# MiniMax-M3 disaggregated 2P+1D recipe for GB200 (prefill-scaled, 8k1k). +# 2x Prefill DEP8 (4 nodes) -> NixlConnector -> 1x Decode DEP16 +# (4 nodes) = 8 nodes. Double prefill workers absorb 8k ISL compute; +# rack-scale DEP16 decode across NVL72. FLASHINFER, block-size 128. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 4 + decode_nodes: 4 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + 
vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 9472 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 9472 + max-num-seqs: 512 + max-num-batched-tokens: 512 + max-cudagraph-capture-size: 512 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512x1024" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml new file mode 100644 index 000000000..9e4ff3c2b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml @@ -0,0 +1,110 @@ +name: "minimax-m3-vllm-disagg-gb200-4p1d-dep8-dep16-8k1k" + +# MiniMax-M3 disaggregated 4P+1D recipe for GB200 (max throughput, 8k1k). +# 4x Prefill DEP8 (8 nodes) -> NixlConnector -> 1x Decode DEP16 +# (4 nodes) = 12 nodes within one NVL72 rack. 
Maximises prefill +# bandwidth for 8k ISL; rack-scale DEP16 decode. FLASHINFER, block-size 128. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 8 + decode_nodes: 4 + prefill_workers: 4 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 9472 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + 
max-model-len: 9472 + max-num-seqs: 512 + max-num-batched-tokens: 512 + max-cudagraph-capture-size: 512 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1024x2048" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 8329ac1da..46ac06a08 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3659,7 +3659,8 @@ - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y, cuda_ipc UCX_TLS, NCCL_CUMEM_ENABLE=1 + enable-sleep-mode cuMem allocator, numa-bind)" - "Rack-scale wide expert-parallel (deepseek-v4 megamoe style; DEP = DP-attn + EP): EP8/EP16 spans multiple GB200 nodes vs B200's 8-GPU NVLink island" - "5 topologies: 1P1D TP4 (2n, low-lat), 1P1D DEP8 (4n), 1P1D DEP8->DEP16 (6n, wide decode), 2P1D DEP8->DEP16 (8n), 4P1D DEP8->DEP16 (12n, max tput)" - - "Concurrency sweep: TP4 4-64, DEP8 128-512, DEP8->DEP16 512-2048, 2P1D 2048-4096, 4P1D 4096-8192" + - "1k1k concurrency sweep: TP4 1-64 (low-conc latency tuned: stream-interval 1, cudagraph cap 128), DEP8 128-512, DEP8->DEP16 512-2048, 2P1D 2048-4096, 4P1D 4096-8192" + - "8k1k concurrency sweep (same 5 topologies, shifted ~4x lower for 8x heavier prefill): TP4 1-16, DEP8 32-128, DEP8->DEP16 128-512, 2P1D 512-1024, 4P1D 1024-2048; max-model-len 9472 (74*128)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734 - config-keys: