From 965b046386dc8e24e25cefc75a2d3c996b981f17 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 12 Jun 2026 18:05:57 -0700 Subject: [PATCH 01/14] feat: MiniMax-M3 MXFP8 full sweep config for GB300 Add minimaxm3-fp8-gb300-dynamo-vllm to nvidia-master.yaml with 7 topologies covering the full concurrency range: - TP4/TP8 (low latency, conc 4-64) - TP4+EP4 agg + 1P+1D disagg 2-node + 1P+1D collocated (mid, conc 64-512) - DEP4/DEP8 (high throughput, conc 256-2048) All recipe YAMLs included under minimax-m3-gb300-fp8/{1k1k,8k1k}/. GB300 recipes include srun_options mem=0 (CW DefMemPerCPU cgroup fix) and omit safetensors-load-strategy prefetch (host-memory limit). --- .github/configs/nvidia-master.yaml | 122 ++++++++++++++++++ .../workflows/benchmark-multinode-tmpl.yml | 5 + .../1k1k/agg-gb300-dep4-1n.yaml | 82 ++++++++++++ .../1k1k/agg-gb300-dep8-2n.yaml | 82 ++++++++++++ .../1k1k/agg-gb300-tp4-1n.yaml | 79 ++++++++++++ .../1k1k/agg-gb300-tp4ep4-1n.yaml | 80 ++++++++++++ .../1k1k/agg-gb300-tp8-2n.yaml | 79 ++++++++++++ .../1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml | 97 ++++++++++++++ .../1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml | 95 ++++++++++++++ .../8k1k/agg-gb300-dep8-2n.yaml | 82 ++++++++++++ .../8k1k/agg-gb300-tp4-1n.yaml | 79 ++++++++++++ .../8k1k/agg-gb300-tp4ep4-1n.yaml | 80 ++++++++++++ .../8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml | 97 ++++++++++++++ perf-changelog.yaml | 13 ++ runners/launch_gb300-cw.sh | 51 +++++++- 15 files changed, 1122 insertions(+), 1 deletion(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml create 
mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 187824347..4f66dd392 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11679,6 +11679,128 @@ minimaxm2.5-fp8-gb300-dynamo-vllm: ep: 4 dp-attn: true +# MiniMax-M3 GB300 full sweep — safetensors-load-strategy removed from all +# GB300 recipes (host-memory OOM with prefetch on CW Grace Blackwell nodes). +# srun_options mem=0 required (DefMemPerCPU=4096 cgroup limit). +minimaxm3-fp8-gb300-dynamo-vllm: + image: vllm/vllm-openai:minimax-m3 + model: MiniMaxAI/MiniMax-M3-MXFP8 + model-prefix: minimaxm3 + runner: gb300 + precision: fp8 + framework: dynamo-vllm + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # Low latency: TP=4 aggregated, 1 node (4 GPU). + - conc-list: [4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml" + decode: + num-worker: 0 + tp: 4 + ep: 1 + dp-attn: false + + # Low latency: TP=8 aggregated, 2 nodes (8 GPU). 
+ - conc-list: [4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml" + decode: + num-worker: 0 + tp: 8 + ep: 1 + dp-attn: false + + # Mid curve: TP4+EP4 aggregated, 1 node (4 GPU). + - conc-list: [128, 256, 512] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml" + decode: + num-worker: 0 + tp: 4 + ep: 4 + dp-attn: false + + # Mid curve: 1P+1D disagg TP4+EP4, 2 nodes (4 GPU each). + - conc-list: [64, 128, 256, 512] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + + # Mid curve: 1P+1D disagg TP4+EP4, collocated 1 node (8 GPU). + - conc-list: [64, 128, 256, 512] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + + # High throughput: DEP=4 aggregated, 1 node (4 GPU). + - conc-list: [256, 512, 1024] + prefill: + num-worker: 1 + tp: 1 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml" + decode: + num-worker: 0 + tp: 1 + ep: 4 + dp-attn: true + + # Max throughput: DEP=8 aggregated, 2 nodes (8 GPU). + - conc-list: [512, 1024, 2048] + prefill: + num-worker: 1 + tp: 1 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml" + decode: + num-worker: 0 + tp: 1 + ep: 8 + dp-attn: true + # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). 
# 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 81727ef39..85b399e6c 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -123,6 +123,11 @@ on: env: RANDOM_RANGE_RATIO: 0.8 + # Day-zero models resolved via hf: ids download from the Hub inside the + # slurm job (srtctl pre-download + dynamo hub fetch). Anonymous requests + # get 429-rate-limited when several workers pull a 444 GB snapshot at + # once; sbatch/srun inherit this env so the token reaches the workers. + HF_TOKEN: ${{ secrets.INFERENCEX_OFFICIAL_RO_HF_TOKEN }} EXP_NAME: ${{ inputs.exp-name }} IMAGE: ${{ inputs.image }} MODEL_PREFIX: ${{ inputs.model-prefix }} diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml new file mode 100644 index 000000000..58467b48d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml @@ -0,0 +1,82 @@ +name: "minimax-m3-vllm-agg-gb300-dep4-1n-1k1k" + +# MiniMax-M3 day-zero aggregated recipe +# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on +# day zero: NixlConnector KV transfer has never been exercised against +# M3's MSA index cache, so disagg shapes are deferred until agg +# baselines exist. model.path uses the hf: prefix — srtctl pre-downloads +# the 444 GB MXFP8 snapshot once, on a compute node, into the shared +# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable +# shared-FS path and bind-mounts it via extra_mount). 
+# --block-size 128 is mandatory (MSA sparse/index cache alignment); the +# benchmark is text-only, so language-model-only skips the vision +# encoder. max-model-len / cudagraph capture / batched tokens are +# trimmed to the fixed-seq-len scenario instead of the 1M default. + +model: + path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +srun_options: + mem: "0" + + +slurm: + time_limit: "8:00:00" + +# 444 GB of weights off shared FS (cold HF cache on the first run): +# allow up to 2 h for engine readiness. +health_check: + max_attempts: 720 + interval_seconds: 10 + +extra_mount: + - "__M3_HF_HOME__:__M3_HF_HOME__" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + gpus_per_agg: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + aggregated_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + HF_HOME: "__M3_HF_HOME__" + + vllm_config: + aggregated: + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 2304 + max-num-batched-tokens: 2048 + max-cudagraph-capture-size: 512 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "256x512x1024" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml new file mode 100644 index 000000000..5842a2aec --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml @@ -0,0 +1,82 @@ +name: 
"minimax-m3-vllm-agg-gb300-dep8-2n-1k1k" + +# MiniMax-M3 day-zero aggregated recipe +# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on +# day zero: NixlConnector KV transfer has never been exercised against +# M3's MSA index cache, so disagg shapes are deferred until agg +# baselines exist. model.path uses the hf: prefix — srtctl pre-downloads +# the 444 GB MXFP8 snapshot once, on a compute node, into the shared +# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable +# shared-FS path and bind-mounts it via extra_mount). +# --block-size 128 is mandatory (MSA sparse/index cache alignment); the +# benchmark is text-only, so language-model-only skips the vision +# encoder. max-model-len / cudagraph capture / batched tokens are +# trimmed to the fixed-seq-len scenario instead of the 1M default. + +model: + path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +srun_options: + mem: "0" + + +slurm: + time_limit: "8:00:00" + +# 444 GB of weights off shared FS (cold HF cache on the first run): +# allow up to 2 h for engine readiness. 
+health_check: + max_attempts: 720 + interval_seconds: 10 + +extra_mount: + - "__M3_HF_HOME__:__M3_HF_HOME__" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + agg_nodes: 2 + agg_workers: 1 + gpus_per_agg: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + aggregated_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + HF_HOME: "__M3_HF_HOME__" + + vllm_config: + aggregated: + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 2304 + max-num-batched-tokens: 2048 + max-cudagraph-capture-size: 512 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "512x1024x2048" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml new file mode 100644 index 000000000..d6f981ab6 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml @@ -0,0 +1,79 @@ +name: "minimax-m3-vllm-agg-gb300-tp4-1n-1k1k" + +# MiniMax-M3 day-zero aggregated recipe +# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on +# day zero: NixlConnector KV transfer has never been exercised against +# M3's MSA index cache, so disagg shapes are deferred until agg +# baselines exist. model.path uses the hf: prefix — srtctl pre-downloads +# the 444 GB MXFP8 snapshot once, on a compute node, into the shared +# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable +# shared-FS path and bind-mounts it via extra_mount). 
+# --block-size 128 is mandatory (MSA sparse/index cache alignment); the +# benchmark is text-only, so language-model-only skips the vision +# encoder. max-model-len / cudagraph capture / batched tokens are +# trimmed to the fixed-seq-len scenario instead of the 1M default. + +model: + path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +srun_options: + mem: "0" + + +slurm: + time_limit: "8:00:00" + +# 444 GB of weights off shared FS (cold HF cache on the first run): +# allow up to 2 h for engine readiness. +health_check: + max_attempts: 720 + interval_seconds: 10 + +extra_mount: + - "__M3_HF_HOME__:__M3_HF_HOME__" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + gpus_per_agg: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + aggregated_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + HF_HOME: "__M3_HF_HOME__" + + vllm_config: + aggregated: + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 2304 + max-num-batched-tokens: 2048 + max-cudagraph-capture-size: 64 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml new file mode 100644 index 000000000..8f5bc8675 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml @@ -0,0 +1,80 @@ +name: "minimax-m3-vllm-agg-gb300-tp4ep4-1n-1k1k" + +# MiniMax-M3 day-zero aggregated recipe +# 
(https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated shape. +# At day zero NixlConnector KV transfer had not been exercised against +# M3's MSA index cache, so judge the sibling disagg-gb300-1p1d-* +# recipes against these agg baselines. model.path uses the hf: prefix — srtctl pre-downloads +# the 444 GB MXFP8 snapshot once, on a compute node, into the shared +# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable +# shared-FS path and bind-mounts it via extra_mount). +# --block-size 128 is mandatory (MSA sparse/index cache alignment); the +# benchmark is text-only, so language-model-only skips the vision +# encoder. max-model-len / cudagraph capture / batched tokens are +# trimmed to the fixed-seq-len scenario instead of the 1M default. + +model: + path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +srun_options: + mem: "0" + + +slurm: + time_limit: "8:00:00" + +# 444 GB of weights off shared FS (cold HF cache on the first run): +# allow up to 2 h for engine readiness. 
+health_check: + max_attempts: 720 + interval_seconds: 10 + +extra_mount: + - "__M3_HF_HOME__:__M3_HF_HOME__" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + gpus_per_agg: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + aggregated_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + HF_HOME: "__M3_HF_HOME__" + + vllm_config: + aggregated: + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 2304 + max-num-batched-tokens: 2048 + max-cudagraph-capture-size: 512 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "128x256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml new file mode 100644 index 000000000..6ab7b8a61 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml @@ -0,0 +1,79 @@ +name: "minimax-m3-vllm-agg-gb300-tp8-2n-1k1k" + +# MiniMax-M3 day-zero aggregated recipe +# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on +# day zero: NixlConnector KV transfer has never been exercised against +# M3's MSA index cache, so disagg shapes are deferred until agg +# baselines exist. model.path uses the hf: prefix — srtctl pre-downloads +# the 444 GB MXFP8 snapshot once, on a compute node, into the shared +# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable +# shared-FS path and bind-mounts it via extra_mount). 
+# --block-size 128 is mandatory (MSA sparse/index cache alignment); the +# benchmark is text-only, so language-model-only skips the vision +# encoder. max-model-len / cudagraph capture / batched tokens are +# trimmed to the fixed-seq-len scenario instead of the 1M default. + +model: + path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +srun_options: + mem: "0" + + +slurm: + time_limit: "8:00:00" + +# 444 GB of weights off shared FS (cold HF cache on the first run): +# allow up to 2 h for engine readiness. +health_check: + max_attempts: 720 + interval_seconds: 10 + +extra_mount: + - "__M3_HF_HOME__:__M3_HF_HOME__" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + agg_nodes: 2 + agg_workers: 1 + gpus_per_agg: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + aggregated_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + HF_HOME: "__M3_HF_HOME__" + + vllm_config: + aggregated: + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + max-model-len: 2304 + max-num-batched-tokens: 2048 + max-cudagraph-capture-size: 64 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml new file mode 100644 index 000000000..ac80ba8e9 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml @@ -0,0 +1,97 @@ +name: "minimax-m3-vllm-disagg-gb300-1p1d-tp4ep4-1k1k" + +# MiniMax-M3 disaggregated 1P+1D 
recipe for GB300, collocated on 1 node. +# Assumes an 8-GPU node (gpus_per_node: 8): 4 GPUs for prefill, 4 for decode. NOTE(review): the agg and 2n recipes in this directory set gpus_per_node: 4 — confirm the target partition really exposes 8 GPUs per node. +# Prefill (TP4+EP4) → NixlConnector → Decode (TP4+EP4). +# --block-size 128 is mandatory (MSA sparse/index cache alignment). + +model: + path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +srun_options: + mem: "0" + + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + +extra_mount: + - "__M3_HF_HOME__:__M3_HF_HOME__" + +resources: + gpu_type: "gb300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + allow_prefill_decode_colocation: true + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + HF_HOME: "__M3_HF_HOME__" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + HF_HOME: "__M3_HF_HOME__" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 2304 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 2304 + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 512 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + trust-remote-code: true + 
no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "64x128x256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml new file mode 100644 index 000000000..1bcbc0ac2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml @@ -0,0 +1,95 @@ +name: "minimax-m3-vllm-disagg-gb300-1p1d-tp4ep4-2n-1k1k" + +# MiniMax-M3 disaggregated 1P+1D recipe for GB300, 2 nodes. +# 1 node prefill (4 GPUs, TP4+EP4) → NixlConnector → 1 node decode (4 GPUs, TP4+EP4). +# --block-size 128 is mandatory (MSA sparse/index cache alignment). +# safetensors-load-strategy omitted — prefetch OOMs on CW GB300 host memory. + +model: + path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +srun_options: + mem: "0" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + +extra_mount: + - "__M3_HF_HOME__:__M3_HF_HOME__" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + HF_HOME: "__M3_HF_HOME__" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + HF_HOME: "__M3_HF_HOME__" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + 
enforce-eager: true + max-model-len: 2304 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 2304 + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 512 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "64x128x256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml new file mode 100644 index 000000000..d4d4af392 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml @@ -0,0 +1,82 @@ +name: "minimax-m3-vllm-agg-gb300-dep8-2n-8k1k" + +# MiniMax-M3 day-zero aggregated recipe +# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on +# day zero: NixlConnector KV transfer has never been exercised against +# M3's MSA index cache, so disagg shapes are deferred until agg +# baselines exist. model.path uses the hf: prefix — srtctl pre-downloads +# the 444 GB MXFP8 snapshot once, on a compute node, into the shared +# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable +# shared-FS path and bind-mounts it via extra_mount). +# --block-size 128 is mandatory (MSA sparse/index cache alignment); the +# benchmark is text-only, so language-model-only skips the vision +# encoder. 
max-model-len / cudagraph capture / batched tokens are +# trimmed to the fixed-seq-len scenario instead of the 1M default. + +model: + path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +srun_options: + mem: "0" + + +slurm: + time_limit: "8:00:00" + +# 444 GB of weights off shared FS (cold HF cache on the first run): +# allow up to 2 h for engine readiness. +health_check: + max_attempts: 720 + interval_seconds: 10 + +extra_mount: + - "__M3_HF_HOME__:__M3_HF_HOME__" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + agg_nodes: 2 + agg_workers: 1 + gpus_per_agg: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + aggregated_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + HF_HOME: "__M3_HF_HOME__" + + vllm_config: + aggregated: + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 9472 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 256 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512x1024" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml new file mode 100644 index 000000000..7d268187d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml @@ -0,0 +1,79 @@ +name: "minimax-m3-vllm-agg-gb300-tp4-1n-8k1k" + +# MiniMax-M3 day-zero aggregated recipe +# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). 
Aggregated shape. +# At day zero NixlConnector KV transfer had not been exercised against +# M3's MSA index cache, so judge the sibling disagg-gb300-1p1d-* +# recipe against these agg baselines. model.path uses the hf: prefix — srtctl pre-downloads +# the 444 GB MXFP8 snapshot once, on a compute node, into the shared +# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable +# shared-FS path and bind-mounts it via extra_mount). +# --block-size 128 is mandatory (MSA sparse/index cache alignment); the +# benchmark is text-only, so language-model-only skips the vision +# encoder. max-model-len / cudagraph capture / batched tokens are +# trimmed to the fixed-seq-len scenario instead of the 1M default. + +model: + path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +srun_options: + mem: "0" + + +slurm: + time_limit: "8:00:00" + +# 444 GB of weights off shared FS (cold HF cache on the first run): +# allow up to 2 h for engine readiness. 
+health_check: + max_attempts: 720 + interval_seconds: 10 + +extra_mount: + - "__M3_HF_HOME__:__M3_HF_HOME__" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + gpus_per_agg: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + aggregated_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + HF_HOME: "__M3_HF_HOME__" + + vllm_config: + aggregated: + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 9472 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 64 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml new file mode 100644 index 000000000..196981472 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml @@ -0,0 +1,80 @@ +name: "minimax-m3-vllm-agg-gb300-tp4ep4-1n-8k1k" + +# MiniMax-M3 day-zero aggregated recipe +# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on +# day zero: NixlConnector KV transfer has never been exercised against +# M3's MSA index cache, so disagg shapes are deferred until agg +# baselines exist. model.path uses the hf: prefix — srtctl pre-downloads +# the 444 GB MXFP8 snapshot once, on a compute node, into the shared +# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable +# shared-FS path and bind-mounts it via extra_mount). +# --block-size 128 is mandatory (MSA sparse/index cache alignment); the +# benchmark is text-only, so language-model-only skips the vision +# encoder. 
max-model-len / cudagraph capture / batched tokens are +# trimmed to the fixed-seq-len scenario instead of the 1M default. + +model: + path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +srun_options: + mem: "0" + + +slurm: + time_limit: "8:00:00" + +# 444 GB of weights off shared FS (cold HF cache on the first run): +# allow up to 2 h for engine readiness. +health_check: + max_attempts: 720 + interval_seconds: 10 + +extra_mount: + - "__M3_HF_HOME__:__M3_HF_HOME__" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + gpus_per_agg: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + aggregated_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + HF_HOME: "__M3_HF_HOME__" + + vllm_config: + aggregated: + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 9472 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 512 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "128x256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml new file mode 100644 index 000000000..e9c60933c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml @@ -0,0 +1,97 @@ +name: "minimax-m3-vllm-disagg-gb300-1p1d-tp4ep4-8k1k" + +# MiniMax-M3 disaggregated 1P+1D recipe for GB300, collocated on 1 node. +# 8 GPUs per GB300 node: 4 for prefill, 4 for decode. 
+# Prefill (TP4+EP4) → NixlConnector → Decode (TP4+EP4). +# --block-size 128 is mandatory (MSA sparse/index cache alignment). + +model: + path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +srun_options: + mem: "0" + + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + +extra_mount: + - "__M3_HF_HOME__:__M3_HF_HOME__" + +resources: + gpu_type: "gb300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + allow_prefill_decode_colocation: true + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + HF_HOME: "__M3_HF_HOME__" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + HF_HOME: "__M3_HF_HOME__" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 9472 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 9472 + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 512 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 
1024 + concurrencies: "64x128x256" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index d29c9a5d3..fe804d0cc 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3646,3 +3646,16 @@ - "Layouts: TP8 and TP4 (latency), TP4+EP4 / TP8+EP8 (TEP throughput), tp2-ep2, TP8+EP8 dp-attn (DEP) across 1k1k and 8k1k" - "Serves from the launch_b300-nv.sh MODEL/MODEL_PATH split (model not in the SRE-staged /scratch/models list -> writable /data/models)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1724 + +- config-keys: + - minimaxm3-fp8-gb300-dynamo-vllm + description: + - "Initial submission: MiniMax-M3 MXFP8 day-zero vLLM sweep for GB300 via Dynamo" + - "Model: MiniMaxAI/MiniMax-M3-MXFP8 (427B total / 26B active MoE, MSA sparse attention, ~444 GB MXFP8 checkpoint)" + - "Image: vllm/vllm-openai:minimax-m3 (multi-arch arm64+amd64 from m3_release branch, vllm-project/vllm#45381)" + - "Dynamo orchestration with NixlConnector for disaggregated prefill/decode" + - "7 topologies: TP4 (1n), TP8 (2n), TP4+EP4 (1n), 1P+1D disagg TP4+EP4 (2n), 1P+1D disagg collocated (1n), DEP4 (1n), DEP8 (2n)" + - "Concurrency sweep: TP 4-64, TEP 128-512, disagg 64-512, DEP4 256-1024, DEP8 512-2048" + - "GB300 recipes require srun_options mem=0 (CW DefMemPerCPU=4096 causes host-RAM OOM) and omit safetensors-load-strategy prefetch (~600 GB host-memory limit)" + - "--block-size 128 mandatory everywhere (MSA sparse/index cache alignment); --language-model-only for text-only benchmarks" + pr-link: TBD diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index 6a5c50e38..50dae4465 100644 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -59,8 +59,24 @@ elif [[ $MODEL_PREFIX == "glm5" && $PRECISION == "fp8" ]]; then echo "Unsupported framework on gb300-cw for glm5/fp8: $FRAMEWORK. 
Currently supported: dynamo-sglang" exit 1 fi +elif [[ $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp8" ]]; then + # Day-zero: MiniMax-M3-MXFP8 is not staged on this cluster. The recipes + # carry an hf: model id directly, so srtctl pre-downloads the snapshot + # into the shared HF_HOME sed-injected below; MODEL_PATH only feeds the + # (unreferenced) model_paths aliases in srtslurm.yaml. + export MODEL_PATH="hf:MiniMaxAI/MiniMax-M3-MXFP8" + + if [[ $FRAMEWORK == "dynamo-vllm" ]]; then + SRT_SLURM_RECIPES_REPO="https://github.com/NVIDIA/srt-slurm.git" + SRT_SLURM_RECIPES_REF="main" + SRT_RECIPE_SRC="$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8" + SRT_RECIPE_DST="recipes/vllm/minimax-m3-gb300-fp8" + else + echo "Unsupported framework on gb300-cw for minimaxm3/fp8: $FRAMEWORK. Currently supported: dynamo-vllm" + exit 1 + fi else - echo "Unsupported model prefix/precision combination on gb300-cw: $MODEL_PREFIX/$PRECISION. Currently supported: dsv4/fp4, glm5/fp8" + echo "Unsupported model prefix/precision combination on gb300-cw: $MODEL_PREFIX/$PRECISION. Currently supported: dsv4/fp4, glm5/fp8, minimaxm3/fp8" exit 1 fi @@ -276,6 +292,39 @@ else mv "$TMP_CONFIG_FILE" "$CONFIG_FILE" fi +# MiniMax-M3 day-zero: the recipes use an hf: model id and need a shared-FS +# HF_HOME visible (and writable) on compute nodes for srtctl's one-time +# pre-download of the 444 GB snapshot. /mnt/vast is the shared NFS that +# already hosts models and squash files on this cluster. +if [[ $MODEL_PREFIX == "minimaxm3" ]]; then + M3_HF_HOME="/mnt/vast/hf-home" + mkdir -p "$M3_HF_HOME" + sed -i "s|__M3_HF_HOME__|${M3_HF_HOME}|g" "$CONFIG_FILE" + # Dynamo's rust fetch_model dies instantly on ANY held .lock file + # ("Lock acquisition failed") — it doesn't retry like Python's + # huggingface_hub. Concurrent GHA jobs race on the 444 GB download + # and create fresh locks that survive the old "mmin +10" cleanup. 
+ # Fix: nuke ALL locks, then force-download with the Python client (which + # DOES wait for locks) so srtctl's pre-download is a no-op and dynamo sees + # a fully cached snapshot with zero lock files. Finished blobs are safe (HF + # renames atomically from .incomplete); NOTE(review): this also drops locks held by concurrent jobs still downloading — confirm. + find "$M3_HF_HOME" -name '*.lock' -delete 2>/dev/null || true + export HF_HOME="$M3_HF_HOME" + DL_CMD="huggingface-cli download" + command -v huggingface-cli >/dev/null 2>&1 || DL_CMD="hf download" + for _attempt in 1 2 3; do + if HF_HUB_OFFLINE=1 $DL_CMD MiniMaxAI/MiniMax-M3-MXFP8 --quiet 2>/dev/null; then + echo "MiniMax-M3-MXFP8 fully cached (verified offline)" + break + fi + echo "MiniMax-M3 cache incomplete, downloading (attempt $_attempt)..." + find "$M3_HF_HOME" -name '*.lock' -delete 2>/dev/null || true + $DL_CMD MiniMaxAI/MiniMax-M3-MXFP8 --quiet 2>&1 | tail -5 || true + sleep 5 + done + find "$M3_HF_HOME" -name '*.lock' -delete 2>/dev/null || true +fi + SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "gb300,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) echo "$SRTCTL_OUTPUT" From e3fa89f3996ec115bae5a53cba57c1635c8b6cf0 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 12 Jun 2026 18:06:25 -0700 Subject: [PATCH 02/14] chore: update perf-changelog pr-link to #1735 --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index fe804d0cc..2c1292116 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3658,4 +3658,4 @@ - "Concurrency sweep: TP 4-64, TEP 128-512, disagg 64-512, DEP4 256-1024, DEP8 512-2048" - "GB300 recipes require srun_options mem=0 (CW DefMemPerCPU=4096 causes host-RAM OOM) and omit safetensors-load-strategy prefetch (~600 GB host-memory limit)" - "--block-size 128 mandatory everywhere (MSA sparse/index cache alignment); --language-model-only for text-only benchmarks" - pr-link: TBD + pr-link:
https://github.com/SemiAnalysisAI/InferenceX/pull/1735 From afc3f921abfb7ce333dcd3d8fe67f9dbbd299963 Mon Sep 17 00:00:00 2001 From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 12 Jun 2026 18:20:14 -0700 Subject: [PATCH 03/14] Update runner name in nvidia-master.yaml --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 7dbaa8f2e..5e85e9f3c 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11686,7 +11686,7 @@ minimaxm3-fp8-gb300-dynamo-vllm: image: vllm/vllm-openai:minimax-m3 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 - runner: gb300 + runner: gb300-cw precision: fp8 framework: dynamo-vllm multinode: true From 99a075bfeeb7962d92b8b35fc02aa0cadbba6575 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 12 Jun 2026 18:48:09 -0700 Subject: [PATCH 04/14] fix: add sbatch_directives mem=0 + cpus-per-task=72 to M3 GB300 recipes srun_options.mem=0 only grants a step the job's existing allocation; on gb300-cw (DefMemPerCPU=4096, no DefCpuPerGPU) the job itself was only allocated 4 GB/node and workers were cgroup-OOM-killed during engine init (run 27452273567: oom_kill in StepId=7409.7 on slurm-gb300-133-193, worker RLIMIT showed 4194304 KB). The canary passed because it landed on gb300-nv, which doesn't enforce the cap. Mirrors the sbatch_directives block of the DSV4 agentic recipes. 
--- .../minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml | 9 +++++++++ .../minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml | 9 +++++++++ .../vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml | 9 +++++++++ .../minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml | 9 +++++++++ .../vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml | 9 +++++++++ .../1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml | 9 +++++++++ .../1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml | 9 +++++++++ .../minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml | 9 +++++++++ .../vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml | 9 +++++++++ .../minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml | 9 +++++++++ .../8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml | 9 +++++++++ 11 files changed, 99 insertions(+) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml index 58467b48d..0ce1485b2 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml @@ -22,6 +22,15 @@ dynamo: install: true wheel: "1.2.0.dev20260526" +# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the +# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups +# OOM-kill during engine init; srun_options.mem=0 alone only grants a +# step what the job already holds. cpus-per-task=72 (one NUMA socket) +# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default. +# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml. 
+sbatch_directives: + mem: "0" + cpus-per-task: "72" srun_options: mem: "0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml index 5842a2aec..1afd90c56 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml @@ -22,6 +22,15 @@ dynamo: install: true wheel: "1.2.0.dev20260526" +# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the +# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups +# OOM-kill during engine init; srun_options.mem=0 alone only grants a +# step what the job already holds. cpus-per-task=72 (one NUMA socket) +# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default. +# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml. +sbatch_directives: + mem: "0" + cpus-per-task: "72" srun_options: mem: "0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml index d6f981ab6..14596993f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml @@ -22,6 +22,15 @@ dynamo: install: true wheel: "1.2.0.dev20260526" +# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the +# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups +# OOM-kill during engine init; srun_options.mem=0 alone only grants a +# step what the job already holds. cpus-per-task=72 (one NUMA socket) +# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default. 
+# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml. +sbatch_directives: + mem: "0" + cpus-per-task: "72" srun_options: mem: "0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml index 8f5bc8675..d0e543c28 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml @@ -22,6 +22,15 @@ dynamo: install: true wheel: "1.2.0.dev20260526" +# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the +# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups +# OOM-kill during engine init; srun_options.mem=0 alone only grants a +# step what the job already holds. cpus-per-task=72 (one NUMA socket) +# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default. +# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml. +sbatch_directives: + mem: "0" + cpus-per-task: "72" srun_options: mem: "0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml index 6ab7b8a61..31e706fb7 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml @@ -22,6 +22,15 @@ dynamo: install: true wheel: "1.2.0.dev20260526" +# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the +# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups +# OOM-kill during engine init; srun_options.mem=0 alone only grants a +# step what the job already holds. 
cpus-per-task=72 (one NUMA socket) +# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default. +# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml. +sbatch_directives: + mem: "0" + cpus-per-task: "72" srun_options: mem: "0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml index ac80ba8e9..417c0958c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml @@ -14,6 +14,15 @@ dynamo: install: true wheel: "1.2.0.dev20260526" +# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the +# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups +# OOM-kill during engine init; srun_options.mem=0 alone only grants a +# step what the job already holds. cpus-per-task=72 (one NUMA socket) +# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default. +# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml. 
+sbatch_directives: + mem: "0" + cpus-per-task: "72" srun_options: mem: "0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml index 1bcbc0ac2..7ce075f5c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml @@ -14,6 +14,15 @@ dynamo: install: true wheel: "1.2.0.dev20260526" +# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the +# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups +# OOM-kill during engine init; srun_options.mem=0 alone only grants a +# step what the job already holds. cpus-per-task=72 (one NUMA socket) +# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default. +# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml. +sbatch_directives: + mem: "0" + cpus-per-task: "72" srun_options: mem: "0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml index d4d4af392..8451e0ad1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml @@ -22,6 +22,15 @@ dynamo: install: true wheel: "1.2.0.dev20260526" +# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the +# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups +# OOM-kill during engine init; srun_options.mem=0 alone only grants a +# step what the job already holds. 
cpus-per-task=72 (one NUMA socket) +# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default. +# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml. +sbatch_directives: + mem: "0" + cpus-per-task: "72" srun_options: mem: "0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml index 7d268187d..6bf18b5a7 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml @@ -22,6 +22,15 @@ dynamo: install: true wheel: "1.2.0.dev20260526" +# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the +# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups +# OOM-kill during engine init; srun_options.mem=0 alone only grants a +# step what the job already holds. cpus-per-task=72 (one NUMA socket) +# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default. +# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml. 
+sbatch_directives: + mem: "0" + cpus-per-task: "72" srun_options: mem: "0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml index 196981472..a11b5c405 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml @@ -22,6 +22,15 @@ dynamo: install: true wheel: "1.2.0.dev20260526" +# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the +# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups +# OOM-kill during engine init; srun_options.mem=0 alone only grants a +# step what the job already holds. cpus-per-task=72 (one NUMA socket) +# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default. +# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml. +sbatch_directives: + mem: "0" + cpus-per-task: "72" srun_options: mem: "0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml index e9c60933c..53d2b049b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml @@ -14,6 +14,15 @@ dynamo: install: true wheel: "1.2.0.dev20260526" +# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the +# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups +# OOM-kill during engine init; srun_options.mem=0 alone only grants a +# step what the job already holds. 
cpus-per-task=72 (one NUMA socket) +# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default. +# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml. +sbatch_directives: + mem: "0" + cpus-per-task: "72" srun_options: mem: "0" From 26e2005bc27b25415735b1a51f2deaf203ff4177 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 12 Jun 2026 19:07:17 -0700 Subject: [PATCH 05/14] fix: run M3 GB300 workers cache-only (HF_HUB_OFFLINE=1) to avoid fetch_model lock race With the mem fix in place, run 27452976271 cleared the OOM but hit a new failure: both nodes of the TP8-2n job called dynamo fetch_model within 200ms (191 @ :23.637, 193 @ :23.833), 191 took the per-blob .lock on the shared /mnt/vast/hf-home cache and held it verifying the 444 GB snapshot, 193 retried ~6.4s and died 'Lock acquisition failed' (dynamo's rust hub doesn't wait like Python hf_hub). The launcher already pre-stages and verifies the snapshot offline before submit, so the workers never need to fetch. Setting HF_HUB_OFFLINE=1 in every worker env block makes dynamo serve cache-only and skip the download lock entirely, so co-fetching workers no longer collide. Applied to all agg + disagg (prefill/decode) env blocks across the 11 recipes. 
--- .../minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml | 6 ++++++ .../minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml | 6 ++++++ .../minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml | 6 ++++++ .../1k1k/agg-gb300-tp4ep4-1n.yaml | 6 ++++++ .../minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml | 6 ++++++ .../1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml | 12 ++++++++++++ .../1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml | 12 ++++++++++++ .../minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml | 6 ++++++ .../minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml | 6 ++++++ .../8k1k/agg-gb300-tp4ep4-1n.yaml | 6 ++++++ .../8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml | 12 ++++++++++++ 11 files changed, 84 insertions(+) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml index 0ce1485b2..f8cb4c161 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml @@ -66,6 +66,12 @@ backend: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" HF_HOME: "__M3_HF_HOME__" + # Cache-only at runtime: the launcher pre-stages the full snapshot and + # verifies it offline before submit, so workers must NOT re-fetch. Without + # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache; + # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the + # loser dies with "Lock acquisition failed" (no retry like Python hf_hub). 
+ HF_HUB_OFFLINE: "1" vllm_config: aggregated: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml index 1afd90c56..3b380e36d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml @@ -66,6 +66,12 @@ backend: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" HF_HOME: "__M3_HF_HOME__" + # Cache-only at runtime: the launcher pre-stages the full snapshot and + # verifies it offline before submit, so workers must NOT re-fetch. Without + # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache; + # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the + # loser dies with "Lock acquisition failed" (no retry like Python hf_hub). + HF_HUB_OFFLINE: "1" vllm_config: aggregated: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml index 14596993f..c156fb59d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml @@ -66,6 +66,12 @@ backend: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" HF_HOME: "__M3_HF_HOME__" + # Cache-only at runtime: the launcher pre-stages the full snapshot and + # verifies it offline before submit, so workers must NOT re-fetch. Without + # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache; + # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the + # loser dies with "Lock acquisition failed" (no retry like Python hf_hub). 
+ HF_HUB_OFFLINE: "1" vllm_config: aggregated: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml index d0e543c28..0e0b2280f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml @@ -66,6 +66,12 @@ backend: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" HF_HOME: "__M3_HF_HOME__" + # Cache-only at runtime: the launcher pre-stages the full snapshot and + # verifies it offline before submit, so workers must NOT re-fetch. Without + # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache; + # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the + # loser dies with "Lock acquisition failed" (no retry like Python hf_hub). + HF_HUB_OFFLINE: "1" vllm_config: aggregated: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml index 31e706fb7..5884b7f6b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml @@ -66,6 +66,12 @@ backend: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" HF_HOME: "__M3_HF_HOME__" + # Cache-only at runtime: the launcher pre-stages the full snapshot and + # verifies it offline before submit, so workers must NOT re-fetch. Without + # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache; + # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the + # loser dies with "Lock acquisition failed" (no retry like Python hf_hub). 
+ HF_HUB_OFFLINE: "1" vllm_config: aggregated: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml index 417c0958c..133b54846 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml @@ -60,11 +60,23 @@ backend: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" HF_HOME: "__M3_HF_HOME__" + # Cache-only at runtime: the launcher pre-stages the full snapshot and + # verifies it offline before submit, so workers must NOT re-fetch. Without + # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache; + # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the + # loser dies with "Lock acquisition failed" (no retry like Python hf_hub). + HF_HUB_OFFLINE: "1" decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" HF_HOME: "__M3_HF_HOME__" + # Cache-only at runtime: the launcher pre-stages the full snapshot and + # verifies it offline before submit, so workers must NOT re-fetch. Without + # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache; + # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the + # loser dies with "Lock acquisition failed" (no retry like Python hf_hub). 
+ HF_HUB_OFFLINE: "1" vllm_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml index 7ce075f5c..d2a98b286 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml @@ -58,11 +58,23 @@ backend: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" HF_HOME: "__M3_HF_HOME__" + # Cache-only at runtime: the launcher pre-stages the full snapshot and + # verifies it offline before submit, so workers must NOT re-fetch. Without + # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache; + # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the + # loser dies with "Lock acquisition failed" (no retry like Python hf_hub). + HF_HUB_OFFLINE: "1" decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" HF_HOME: "__M3_HF_HOME__" + # Cache-only at runtime: the launcher pre-stages the full snapshot and + # verifies it offline before submit, so workers must NOT re-fetch. Without + # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache; + # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the + # loser dies with "Lock acquisition failed" (no retry like Python hf_hub). 
+ HF_HUB_OFFLINE: "1" vllm_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml index 8451e0ad1..5d2330626 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml @@ -66,6 +66,12 @@ backend: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" HF_HOME: "__M3_HF_HOME__" + # Cache-only at runtime: the launcher pre-stages the full snapshot and + # verifies it offline before submit, so workers must NOT re-fetch. Without + # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache; + # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the + # loser dies with "Lock acquisition failed" (no retry like Python hf_hub). + HF_HUB_OFFLINE: "1" vllm_config: aggregated: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml index 6bf18b5a7..d32879cbd 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml @@ -66,6 +66,12 @@ backend: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" HF_HOME: "__M3_HF_HOME__" + # Cache-only at runtime: the launcher pre-stages the full snapshot and + # verifies it offline before submit, so workers must NOT re-fetch. Without + # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache; + # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the + # loser dies with "Lock acquisition failed" (no retry like Python hf_hub). 
+ HF_HUB_OFFLINE: "1" vllm_config: aggregated: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml index a11b5c405..06c01619f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml @@ -66,6 +66,12 @@ backend: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" HF_HOME: "__M3_HF_HOME__" + # Cache-only at runtime: the launcher pre-stages the full snapshot and + # verifies it offline before submit, so workers must NOT re-fetch. Without + # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache; + # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the + # loser dies with "Lock acquisition failed" (no retry like Python hf_hub). + HF_HUB_OFFLINE: "1" vllm_config: aggregated: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml index 53d2b049b..0c671cd62 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml @@ -60,11 +60,23 @@ backend: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" HF_HOME: "__M3_HF_HOME__" + # Cache-only at runtime: the launcher pre-stages the full snapshot and + # verifies it offline before submit, so workers must NOT re-fetch. Without + # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache; + # co-fetching workers (e.g. 
both nodes of a TP8-2n job) collide and the + # loser dies with "Lock acquisition failed" (no retry like Python hf_hub). + HF_HUB_OFFLINE: "1" decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" HF_HOME: "__M3_HF_HOME__" + # Cache-only at runtime: the launcher pre-stages the full snapshot and + # verifies it offline before submit, so workers must NOT re-fetch. Without + # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache; + # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the + # loser dies with "Lock acquisition failed" (no retry like Python hf_hub). + HF_HUB_OFFLINE: "1" vllm_config: prefill: From b660ddde1f46aa0ea68022b4f93a1758f943ee10 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 12 Jun 2026 21:54:51 -0700 Subject: [PATCH 06/14] fix: re-pin utils/aiperf to live cjq/agentx-v0.3 tip (ff2b646c) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous pin 062a5de9 (set by #1571 "chore: agentx v0.3") was the cjq/agentx-v0.3 tip on 2026-06-02, but that branch was later rebased/ force-pushed (now at ff2b646c) which orphaned 062a5de9; GitHub has since garbage-collected it. It is now unfetchable ("upload-pack: not our ref") and absent from every CI runner cache, so actions/checkout fails on any cold runner with "Unable to find current revision in submodule path utils/aiperf" (e.g. the newly-added gb300-cw runner-4, run 27453693856). Re-pin to the current cjq/agentx-v0.3 tip — the branch .gitmodules already declares, which is live/fetchable and contains the prior aiperf history as an ancestor. This makes the pin and the declared branch consistent again. 
--- utils/aiperf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/aiperf b/utils/aiperf index 062a5de92..ff2b646c0 160000 --- a/utils/aiperf +++ b/utils/aiperf @@ -1 +1 @@ -Subproject commit 062a5de92c8ac8a0a6dd5d2a7fb9a539a147f3d9 +Subproject commit ff2b646c0425aff9307a0e73161b23d77003a357 From ef7c65080b99fa7596363cb3c769b88673196b9a Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 13 Jun 2026 14:14:26 -0700 Subject: [PATCH 07/14] MiniMax-M3 GB300: disagg-only sweep + multi-node-NVLink KV transfer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the aggregated M3 GB300 topologies with disaggregated-only, and enable NixlConnector KV transfer over multi-node NVLink on every disagg recipe. On gb300-cw the cross-node prefill->decode KV handoff was silently falling back to RDMA/TCP (~268 MB/s, ~1400 tiny descriptors for M3 MSA cache) — which capped disagg throughput. Setting UCX_CUDA_IPC_ENABLE_MNNVL=y plus --enable-cumem-allocator (VMM-registers KV so NIXL uses cuda_ipc across the NVL fabric) lifts it to ~1.4-1.7 GB/s and gives +17% / +23% / +49% out tok/s/gpu at conc 64 / 128 / 256 (jobs 7490 base vs 7493 MNNVL, 1P1D TP4EP4). This is a GB300-only win: B300's 8-GPU NVLink islands are connected to each other over IB, so they cannot move KV over multi-node NVLink. Sweep (1k1k), all MNNVL: - 1P1D TP4+EP4 collocated 1n (8 GPU), conc 8-256 - low/mid latency - 1P1D TP4+EP4 split 2n (8 GPU), conc 64-512 - mid throughput - 1P + DP16+EP wide decode 5n (20 GPU), conc 512-2048 - max throughput (decode keeps scaling on NVL where 1P1D saturates: ~1213 vs ~810 out tok/s/gpu @ conc 1024) Removes all agg-gb300 recipes (1k1k + 8k1k); applies MNNVL to the 8k1k disagg recipe too for consistency. 
--- .github/configs/nvidia-master.yaml | 86 +++----------- .../1k1k/agg-gb300-dep4-1n.yaml | 97 --------------- .../1k1k/agg-gb300-dep8-2n.yaml | 97 --------------- .../1k1k/agg-gb300-tp4-1n.yaml | 94 --------------- .../1k1k/agg-gb300-tp4ep4-1n.yaml | 95 --------------- .../1k1k/agg-gb300-tp8-2n.yaml | 94 --------------- .../1k1k/disagg-gb300-1p-dep16dec-5n.yaml | 112 ++++++++++++++++++ .../1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml | 10 ++ .../1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml | 10 ++ .../8k1k/agg-gb300-dep8-2n.yaml | 97 --------------- .../8k1k/agg-gb300-tp4-1n.yaml | 94 --------------- .../8k1k/agg-gb300-tp4ep4-1n.yaml | 95 --------------- .../8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml | 10 ++ 13 files changed, 159 insertions(+), 832 deletions(-) delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p-dep16dec-5n.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index f3f720180..871518e5c 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11790,52 
+11790,28 @@ minimaxm3-fp8-gb300-dynamo-vllm: - isl: 1024 osl: 1024 search-space: - # Low latency: TP=4 aggregated, 1 node (4 GPU). - - conc-list: [4, 8, 16, 32, 64] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml" - decode: - num-worker: 0 - tp: 4 - ep: 1 - dp-attn: false - - # Low latency: TP=8 aggregated, 2 nodes (8 GPU). - - conc-list: [4, 8, 16, 32, 64] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml" - decode: - num-worker: 0 - tp: 8 - ep: 1 - dp-attn: false - - # Mid curve: TP4+EP4 aggregated, 1 node (4 GPU). - - conc-list: [128, 256, 512] + # Disagg-only sweep. Every recipe enables NixlConnector KV transfer over + # multi-node NVLink (UCX_CUDA_IPC_ENABLE_MNNVL=y + --enable-cumem-allocator), + # which moved the cross-node prefill->decode KV handoff off the RDMA/TCP + # fallback (~268 MB/s) onto the NVL fabric (~1.4-1.7 GB/s) for +17-49% + # tok/s/gpu on M3 — a GB300-only win (B300's 8-GPU IB islands cannot move + # KV over multi-node NVLink). + # Low / mid latency: 1P+1D TP4+EP4 collocated, 1 node (8 GPU). + - conc-list: [8, 16, 32, 64, 128, 256] prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml" decode: - num-worker: 0 + num-worker: 1 tp: 4 ep: 4 dp-attn: false - # Mid curve: 1P+1D disagg TP4+EP4, 2 nodes (4 GPU each). + # Mid throughput: 1P+1D TP4+EP4 split across 2 nodes (8 GPU). - conc-list: [64, 128, 256, 512] prefill: num-worker: 1 @@ -11850,49 +11826,21 @@ minimaxm3-fp8-gb300-dynamo-vllm: ep: 4 dp-attn: false - # Mid curve: 1P+1D disagg TP4+EP4, collocated 1 node (8 GPU). 
- - conc-list: [64, 128, 256, 512] + # Max throughput: 1 prefill (TP4+EP4) + wide DP16+EP decode, 5 nodes (20 GPU). + # Decode keeps scaling on the NVL fabric where 1P1D saturates: + # ~1213 out tok/s/gpu @ conc 1024 vs the 1P1D ~810 plateau. + - conc-list: [512, 1024, 2048] prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml" - decode: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: false - - # High throughput: DEP=4 aggregated, 1 node (4 GPU). - - conc-list: [256, 512, 1024] - prefill: - num-worker: 1 - tp: 1 - ep: 4 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p-dep16dec-5n.yaml" decode: - num-worker: 0 - tp: 1 - ep: 4 - dp-attn: true - - # Max throughput: DEP=8 aggregated, 2 nodes (8 GPU). - - conc-list: [512, 1024, 2048] - prefill: num-worker: 1 tp: 1 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml" - decode: - num-worker: 0 - tp: 1 - ep: 8 + ep: 16 dp-attn: true # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml deleted file mode 100644 index f8cb4c161..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml +++ /dev/null @@ -1,97 +0,0 @@ -name: "minimax-m3-vllm-agg-gb300-dep4-1n-1k1k" - -# MiniMax-M3 day-zero aggregated recipe -# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on -# day zero: NixlConnector KV transfer has never been exercised against -# M3's MSA index cache, so disagg shapes are deferred until agg -# baselines exist. 
model.path uses the hf: prefix — srtctl pre-downloads -# the 444 GB MXFP8 snapshot once, on a compute node, into the shared -# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable -# shared-FS path and bind-mounts it via extra_mount). -# --block-size 128 is mandatory (MSA sparse/index cache alignment); the -# benchmark is text-only, so language-model-only skips the vision -# encoder. max-model-len / cudagraph capture / batched tokens are -# trimmed to the fixed-seq-len scenario instead of the 1M default. - -model: - path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the -# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups -# OOM-kill during engine init; srun_options.mem=0 alone only grants a -# step what the job already holds. cpus-per-task=72 (one NUMA socket) -# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default. -# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml. -sbatch_directives: - mem: "0" - cpus-per-task: "72" -srun_options: - mem: "0" - - -slurm: - time_limit: "8:00:00" - -# 444 GB of weights off shared FS (cold HF cache on the first run): -# allow up to 2 h for engine readiness. -health_check: - max_attempts: 720 - interval_seconds: 10 - -extra_mount: - - "__M3_HF_HOME__:__M3_HF_HOME__" - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - agg_nodes: 1 - agg_workers: 1 - gpus_per_agg: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - aggregated_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - HF_HOME: "__M3_HF_HOME__" - # Cache-only at runtime: the launcher pre-stages the full snapshot and - # verifies it offline before submit, so workers must NOT re-fetch. 
Without - # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache; - # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the - # loser dies with "Lock acquisition failed" (no retry like Python hf_hub). - HF_HUB_OFFLINE: "1" - - vllm_config: - aggregated: - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 4 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: 2304 - max-num-batched-tokens: 2048 - max-cudagraph-capture-size: 512 - block-size: 128 - language-model-only: true - gpu-memory-utilization: 0.9 - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "256x512x1024" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml deleted file mode 100644 index 3b380e36d..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml +++ /dev/null @@ -1,97 +0,0 @@ -name: "minimax-m3-vllm-agg-gb300-dep8-2n-1k1k" - -# MiniMax-M3 day-zero aggregated recipe -# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on -# day zero: NixlConnector KV transfer has never been exercised against -# M3's MSA index cache, so disagg shapes are deferred until agg -# baselines exist. model.path uses the hf: prefix — srtctl pre-downloads -# the 444 GB MXFP8 snapshot once, on a compute node, into the shared -# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable -# shared-FS path and bind-mounts it via extra_mount). -# --block-size 128 is mandatory (MSA sparse/index cache alignment); the -# benchmark is text-only, so language-model-only skips the vision -# encoder. max-model-len / cudagraph capture / batched tokens are -# trimmed to the fixed-seq-len scenario instead of the 1M default. 
- -model: - path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the -# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups -# OOM-kill during engine init; srun_options.mem=0 alone only grants a -# step what the job already holds. cpus-per-task=72 (one NUMA socket) -# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default. -# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml. -sbatch_directives: - mem: "0" - cpus-per-task: "72" -srun_options: - mem: "0" - - -slurm: - time_limit: "8:00:00" - -# 444 GB of weights off shared FS (cold HF cache on the first run): -# allow up to 2 h for engine readiness. -health_check: - max_attempts: 720 - interval_seconds: 10 - -extra_mount: - - "__M3_HF_HOME__:__M3_HF_HOME__" - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - agg_nodes: 2 - agg_workers: 1 - gpus_per_agg: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - aggregated_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - HF_HOME: "__M3_HF_HOME__" - # Cache-only at runtime: the launcher pre-stages the full snapshot and - # verifies it offline before submit, so workers must NOT re-fetch. Without - # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache; - # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the - # loser dies with "Lock acquisition failed" (no retry like Python hf_hub). 
- HF_HUB_OFFLINE: "1" - - vllm_config: - aggregated: - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: 2304 - max-num-batched-tokens: 2048 - max-cudagraph-capture-size: 512 - block-size: 128 - language-model-only: true - gpu-memory-utilization: 0.9 - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "512x1024x2048" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml deleted file mode 100644 index c156fb59d..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml +++ /dev/null @@ -1,94 +0,0 @@ -name: "minimax-m3-vllm-agg-gb300-tp4-1n-1k1k" - -# MiniMax-M3 day-zero aggregated recipe -# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on -# day zero: NixlConnector KV transfer has never been exercised against -# M3's MSA index cache, so disagg shapes are deferred until agg -# baselines exist. model.path uses the hf: prefix — srtctl pre-downloads -# the 444 GB MXFP8 snapshot once, on a compute node, into the shared -# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable -# shared-FS path and bind-mounts it via extra_mount). -# --block-size 128 is mandatory (MSA sparse/index cache alignment); the -# benchmark is text-only, so language-model-only skips the vision -# encoder. max-model-len / cudagraph capture / batched tokens are -# trimmed to the fixed-seq-len scenario instead of the 1M default. 
- -model: - path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the -# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups -# OOM-kill during engine init; srun_options.mem=0 alone only grants a -# step what the job already holds. cpus-per-task=72 (one NUMA socket) -# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default. -# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml. -sbatch_directives: - mem: "0" - cpus-per-task: "72" -srun_options: - mem: "0" - - -slurm: - time_limit: "8:00:00" - -# 444 GB of weights off shared FS (cold HF cache on the first run): -# allow up to 2 h for engine readiness. -health_check: - max_attempts: 720 - interval_seconds: 10 - -extra_mount: - - "__M3_HF_HOME__:__M3_HF_HOME__" - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - agg_nodes: 1 - agg_workers: 1 - gpus_per_agg: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - aggregated_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - HF_HOME: "__M3_HF_HOME__" - # Cache-only at runtime: the launcher pre-stages the full snapshot and - # verifies it offline before submit, so workers must NOT re-fetch. Without - # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache; - # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the - # loser dies with "Lock acquisition failed" (no retry like Python hf_hub). 
- HF_HUB_OFFLINE: "1" - - vllm_config: - aggregated: - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - max-model-len: 2304 - max-num-batched-tokens: 2048 - max-cudagraph-capture-size: 64 - block-size: 128 - language-model-only: true - gpu-memory-utilization: 0.9 - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml deleted file mode 100644 index 0e0b2280f..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml +++ /dev/null @@ -1,95 +0,0 @@ -name: "minimax-m3-vllm-agg-gb300-tp4ep4-1n-1k1k" - -# MiniMax-M3 day-zero aggregated recipe -# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on -# day zero: NixlConnector KV transfer has never been exercised against -# M3's MSA index cache, so disagg shapes are deferred until agg -# baselines exist. model.path uses the hf: prefix — srtctl pre-downloads -# the 444 GB MXFP8 snapshot once, on a compute node, into the shared -# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable -# shared-FS path and bind-mounts it via extra_mount). -# --block-size 128 is mandatory (MSA sparse/index cache alignment); the -# benchmark is text-only, so language-model-only skips the vision -# encoder. max-model-len / cudagraph capture / batched tokens are -# trimmed to the fixed-seq-len scenario instead of the 1M default. 
- -model: - path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the -# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups -# OOM-kill during engine init; srun_options.mem=0 alone only grants a -# step what the job already holds. cpus-per-task=72 (one NUMA socket) -# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default. -# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml. -sbatch_directives: - mem: "0" - cpus-per-task: "72" -srun_options: - mem: "0" - - -slurm: - time_limit: "8:00:00" - -# 444 GB of weights off shared FS (cold HF cache on the first run): -# allow up to 2 h for engine readiness. -health_check: - max_attempts: 720 - interval_seconds: 10 - -extra_mount: - - "__M3_HF_HOME__:__M3_HF_HOME__" - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - agg_nodes: 1 - agg_workers: 1 - gpus_per_agg: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - aggregated_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - HF_HOME: "__M3_HF_HOME__" - # Cache-only at runtime: the launcher pre-stages the full snapshot and - # verifies it offline before submit, so workers must NOT re-fetch. Without - # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache; - # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the - # loser dies with "Lock acquisition failed" (no retry like Python hf_hub). 
- HF_HUB_OFFLINE: "1" - - vllm_config: - aggregated: - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-expert-parallel: true - max-model-len: 2304 - max-num-batched-tokens: 2048 - max-cudagraph-capture-size: 512 - block-size: 128 - language-model-only: true - gpu-memory-utilization: 0.9 - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "128x256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml deleted file mode 100644 index 5884b7f6b..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml +++ /dev/null @@ -1,94 +0,0 @@ -name: "minimax-m3-vllm-agg-gb300-tp8-2n-1k1k" - -# MiniMax-M3 day-zero aggregated recipe -# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on -# day zero: NixlConnector KV transfer has never been exercised against -# M3's MSA index cache, so disagg shapes are deferred until agg -# baselines exist. model.path uses the hf: prefix — srtctl pre-downloads -# the 444 GB MXFP8 snapshot once, on a compute node, into the shared -# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable -# shared-FS path and bind-mounts it via extra_mount). -# --block-size 128 is mandatory (MSA sparse/index cache alignment); the -# benchmark is text-only, so language-model-only skips the vision -# encoder. max-model-len / cudagraph capture / batched tokens are -# trimmed to the fixed-seq-len scenario instead of the 1M default. 
- -model: - path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the -# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups -# OOM-kill during engine init; srun_options.mem=0 alone only grants a -# step what the job already holds. cpus-per-task=72 (one NUMA socket) -# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default. -# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml. -sbatch_directives: - mem: "0" - cpus-per-task: "72" -srun_options: - mem: "0" - - -slurm: - time_limit: "8:00:00" - -# 444 GB of weights off shared FS (cold HF cache on the first run): -# allow up to 2 h for engine readiness. -health_check: - max_attempts: 720 - interval_seconds: 10 - -extra_mount: - - "__M3_HF_HOME__:__M3_HF_HOME__" - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - agg_nodes: 2 - agg_workers: 1 - gpus_per_agg: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - aggregated_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - HF_HOME: "__M3_HF_HOME__" - # Cache-only at runtime: the launcher pre-stages the full snapshot and - # verifies it offline before submit, so workers must NOT re-fetch. Without - # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache; - # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the - # loser dies with "Lock acquisition failed" (no retry like Python hf_hub). 
- HF_HUB_OFFLINE: "1" - - vllm_config: - aggregated: - tensor-parallel-size: 8 - pipeline-parallel-size: 1 - max-model-len: 2304 - max-num-batched-tokens: 2048 - max-cudagraph-capture-size: 64 - block-size: 128 - language-model-only: true - gpu-memory-utilization: 0.9 - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p-dep16dec-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p-dep16dec-5n.yaml new file mode 100644 index 000000000..434007451 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p-dep16dec-5n.yaml @@ -0,0 +1,112 @@ +name: "minimax-m3-vllm-disagg-gb300-1p-dep16dec-5n-1k1k" + +# Combined fix + scale-up: MNNVL KV transfer ON + wide-EP decode. +# Iter 2 showed DP16 decode starved because cross-node KV ran on RDMA/TCP +# (~40-120 MB/s, decode idle). Iter 3 showed UCX_CUDA_IPC_ENABLE_MNNVL=y + +# --enable-cumem-allocator lifts 1P1D KV transfer 5-6x (+17-23% tok/s/gpu). +# This recipe puts both together: 1 compact TP4 prefill -> NixlConnector +# over multi-node NVLink -> wide DP16+EP decode (16 GPUs / 4 nodes). Tests +# whether fast MNNVL KV transfer makes the wide decode pool (which B300 +# cannot build on one NVLink island) actually pay off. +# --block-size 128 mandatory (MSA cache); text-only -> language-model-only. 
+ +model: + path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +sbatch_directives: + mem: "0" + cpus-per-task: "72" +srun_options: + mem: "0" + + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + +extra_mount: + - "__M3_HF_HOME__:__M3_HF_HOME__" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + HF_HOME: "__M3_HF_HOME__" + HF_HUB_OFFLINE: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + HF_HOME: "__M3_HF_HOME__" + HF_HUB_OFFLINE: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + enable-cumem-allocator: true + enforce-eager: true + max-model-len: 2304 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-cumem-allocator: true + max-model-len: 2304 + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 512 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + 
trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "512x1024x2048" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml index 133b54846..f4bd604f6 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml @@ -66,6 +66,10 @@ backend: # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the # loser dies with "Lock acquisition failed" (no retry like Python hf_hub). HF_HUB_OFFLINE: "1" + # Multi-node-NVLink KV transfer: keep cross-node prefill->decode KV on the + # NVL fabric (5-6x vs RDMA/TCP fallback, +17-49% tok/s/gpu). Needs VMM- + # registered KV (enable-cumem-allocator below). + UCX_CUDA_IPC_ENABLE_MNNVL: "y" decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" @@ -77,6 +81,10 @@ backend: # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the # loser dies with "Lock acquisition failed" (no retry like Python hf_hub). HF_HUB_OFFLINE: "1" + # Multi-node-NVLink KV transfer: keep cross-node prefill->decode KV on the + # NVL fabric (5-6x vs RDMA/TCP fallback, +17-49% tok/s/gpu). Needs VMM- + # registered KV (enable-cumem-allocator below). 
+ UCX_CUDA_IPC_ENABLE_MNNVL: "y" vllm_config: prefill: @@ -84,6 +92,7 @@ backend: tensor-parallel-size: 4 pipeline-parallel-size: 1 enable-expert-parallel: true + enable-cumem-allocator: true enforce-eager: true max-model-len: 2304 max-num-seqs: 16 @@ -100,6 +109,7 @@ backend: tensor-parallel-size: 4 pipeline-parallel-size: 1 enable-expert-parallel: true + enable-cumem-allocator: true max-model-len: 2304 max-num-seqs: 256 max-num-batched-tokens: 256 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml index d2a98b286..8e13b522f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml @@ -64,6 +64,10 @@ backend: # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the # loser dies with "Lock acquisition failed" (no retry like Python hf_hub). HF_HUB_OFFLINE: "1" + # Multi-node-NVLink KV transfer: keep cross-node prefill->decode KV on the + # NVL fabric (5-6x vs RDMA/TCP fallback, +17-49% tok/s/gpu). Needs VMM- + # registered KV (enable-cumem-allocator below). + UCX_CUDA_IPC_ENABLE_MNNVL: "y" decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" @@ -75,6 +79,10 @@ backend: # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the # loser dies with "Lock acquisition failed" (no retry like Python hf_hub). HF_HUB_OFFLINE: "1" + # Multi-node-NVLink KV transfer: keep cross-node prefill->decode KV on the + # NVL fabric (5-6x vs RDMA/TCP fallback, +17-49% tok/s/gpu). Needs VMM- + # registered KV (enable-cumem-allocator below). 
+ UCX_CUDA_IPC_ENABLE_MNNVL: "y" vllm_config: prefill: @@ -82,6 +90,7 @@ backend: tensor-parallel-size: 4 pipeline-parallel-size: 1 enable-expert-parallel: true + enable-cumem-allocator: true enforce-eager: true max-model-len: 2304 max-num-seqs: 16 @@ -98,6 +107,7 @@ backend: tensor-parallel-size: 4 pipeline-parallel-size: 1 enable-expert-parallel: true + enable-cumem-allocator: true max-model-len: 2304 max-num-seqs: 256 max-num-batched-tokens: 256 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml deleted file mode 100644 index 5d2330626..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml +++ /dev/null @@ -1,97 +0,0 @@ -name: "minimax-m3-vllm-agg-gb300-dep8-2n-8k1k" - -# MiniMax-M3 day-zero aggregated recipe -# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on -# day zero: NixlConnector KV transfer has never been exercised against -# M3's MSA index cache, so disagg shapes are deferred until agg -# baselines exist. model.path uses the hf: prefix — srtctl pre-downloads -# the 444 GB MXFP8 snapshot once, on a compute node, into the shared -# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable -# shared-FS path and bind-mounts it via extra_mount). -# --block-size 128 is mandatory (MSA sparse/index cache alignment); the -# benchmark is text-only, so language-model-only skips the vision -# encoder. max-model-len / cudagraph capture / batched tokens are -# trimmed to the fixed-seq-len scenario instead of the 1M default. 
- -model: - path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the -# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups -# OOM-kill during engine init; srun_options.mem=0 alone only grants a -# step what the job already holds. cpus-per-task=72 (one NUMA socket) -# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default. -# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml. -sbatch_directives: - mem: "0" - cpus-per-task: "72" -srun_options: - mem: "0" - - -slurm: - time_limit: "8:00:00" - -# 444 GB of weights off shared FS (cold HF cache on the first run): -# allow up to 2 h for engine readiness. -health_check: - max_attempts: 720 - interval_seconds: 10 - -extra_mount: - - "__M3_HF_HOME__:__M3_HF_HOME__" - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - agg_nodes: 2 - agg_workers: 1 - gpus_per_agg: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - aggregated_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - HF_HOME: "__M3_HF_HOME__" - # Cache-only at runtime: the launcher pre-stages the full snapshot and - # verifies it offline before submit, so workers must NOT re-fetch. Without - # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache; - # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the - # loser dies with "Lock acquisition failed" (no retry like Python hf_hub). 
- HF_HUB_OFFLINE: "1" - - vllm_config: - aggregated: - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: 9472 - max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 256 - block-size: 128 - language-model-only: true - gpu-memory-utilization: 0.9 - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "256x512x1024" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml deleted file mode 100644 index d32879cbd..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml +++ /dev/null @@ -1,94 +0,0 @@ -name: "minimax-m3-vllm-agg-gb300-tp4-1n-8k1k" - -# MiniMax-M3 day-zero aggregated recipe -# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on -# day zero: NixlConnector KV transfer has never been exercised against -# M3's MSA index cache, so disagg shapes are deferred until agg -# baselines exist. model.path uses the hf: prefix — srtctl pre-downloads -# the 444 GB MXFP8 snapshot once, on a compute node, into the shared -# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable -# shared-FS path and bind-mounts it via extra_mount). -# --block-size 128 is mandatory (MSA sparse/index cache alignment); the -# benchmark is text-only, so language-model-only skips the vision -# encoder. max-model-len / cudagraph capture / batched tokens are -# trimmed to the fixed-seq-len scenario instead of the 1M default. 
- -model: - path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the -# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups -# OOM-kill during engine init; srun_options.mem=0 alone only grants a -# step what the job already holds. cpus-per-task=72 (one NUMA socket) -# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default. -# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml. -sbatch_directives: - mem: "0" - cpus-per-task: "72" -srun_options: - mem: "0" - - -slurm: - time_limit: "8:00:00" - -# 444 GB of weights off shared FS (cold HF cache on the first run): -# allow up to 2 h for engine readiness. -health_check: - max_attempts: 720 - interval_seconds: 10 - -extra_mount: - - "__M3_HF_HOME__:__M3_HF_HOME__" - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - agg_nodes: 1 - agg_workers: 1 - gpus_per_agg: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - aggregated_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - HF_HOME: "__M3_HF_HOME__" - # Cache-only at runtime: the launcher pre-stages the full snapshot and - # verifies it offline before submit, so workers must NOT re-fetch. Without - # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache; - # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the - # loser dies with "Lock acquisition failed" (no retry like Python hf_hub). 
- HF_HUB_OFFLINE: "1" - - vllm_config: - aggregated: - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - max-model-len: 9472 - max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 64 - block-size: 128 - language-model-only: true - gpu-memory-utilization: 0.9 - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml deleted file mode 100644 index 06c01619f..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml +++ /dev/null @@ -1,95 +0,0 @@ -name: "minimax-m3-vllm-agg-gb300-tp4ep4-1n-8k1k" - -# MiniMax-M3 day-zero aggregated recipe -# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on -# day zero: NixlConnector KV transfer has never been exercised against -# M3's MSA index cache, so disagg shapes are deferred until agg -# baselines exist. model.path uses the hf: prefix — srtctl pre-downloads -# the 444 GB MXFP8 snapshot once, on a compute node, into the shared -# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable -# shared-FS path and bind-mounts it via extra_mount). -# --block-size 128 is mandatory (MSA sparse/index cache alignment); the -# benchmark is text-only, so language-model-only skips the vision -# encoder. max-model-len / cudagraph capture / batched tokens are -# trimmed to the fixed-seq-len scenario instead of the 1M default. 
- -model: - path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the -# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups -# OOM-kill during engine init; srun_options.mem=0 alone only grants a -# step what the job already holds. cpus-per-task=72 (one NUMA socket) -# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default. -# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml. -sbatch_directives: - mem: "0" - cpus-per-task: "72" -srun_options: - mem: "0" - - -slurm: - time_limit: "8:00:00" - -# 444 GB of weights off shared FS (cold HF cache on the first run): -# allow up to 2 h for engine readiness. -health_check: - max_attempts: 720 - interval_seconds: 10 - -extra_mount: - - "__M3_HF_HOME__:__M3_HF_HOME__" - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - agg_nodes: 1 - agg_workers: 1 - gpus_per_agg: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - aggregated_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - HF_HOME: "__M3_HF_HOME__" - # Cache-only at runtime: the launcher pre-stages the full snapshot and - # verifies it offline before submit, so workers must NOT re-fetch. Without - # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache; - # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the - # loser dies with "Lock acquisition failed" (no retry like Python hf_hub). 
- HF_HUB_OFFLINE: "1" - - vllm_config: - aggregated: - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-expert-parallel: true - max-model-len: 9472 - max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 512 - block-size: 128 - language-model-only: true - gpu-memory-utilization: 0.9 - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "128x256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml index 0c671cd62..843f47951 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml @@ -66,6 +66,10 @@ backend: # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the # loser dies with "Lock acquisition failed" (no retry like Python hf_hub). HF_HUB_OFFLINE: "1" + # Multi-node-NVLink KV transfer: keep cross-node prefill->decode KV on the + # NVL fabric (5-6x vs RDMA/TCP fallback, +17-49% tok/s/gpu). Needs VMM- + # registered KV (enable-cumem-allocator below). + UCX_CUDA_IPC_ENABLE_MNNVL: "y" decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" @@ -77,6 +81,10 @@ backend: # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the # loser dies with "Lock acquisition failed" (no retry like Python hf_hub). HF_HUB_OFFLINE: "1" + # Multi-node-NVLink KV transfer: keep cross-node prefill->decode KV on the + # NVL fabric (5-6x vs RDMA/TCP fallback, +17-49% tok/s/gpu). Needs VMM- + # registered KV (enable-cumem-allocator below). 
+ UCX_CUDA_IPC_ENABLE_MNNVL: "y" vllm_config: prefill: @@ -84,6 +92,7 @@ backend: tensor-parallel-size: 4 pipeline-parallel-size: 1 enable-expert-parallel: true + enable-cumem-allocator: true enforce-eager: true max-model-len: 9472 max-num-seqs: 16 @@ -100,6 +109,7 @@ backend: tensor-parallel-size: 4 pipeline-parallel-size: 1 enable-expert-parallel: true + enable-cumem-allocator: true max-model-len: 9472 max-num-seqs: 256 max-num-batched-tokens: 256 From 7fd890415772d9cbb4b4ecd37465b67c29368fc3 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 13 Jun 2026 14:47:31 -0700 Subject: [PATCH 08/14] M3 GB300: add 8k1k disagg sweep; drop unschedulable collocated-1n MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The collocated-1n topology (disagg-gb300-1p1d-tp4ep4-1n) declared gpus_per_node: 8, but gb300-cw nodes have 4 GPUs — sbatch rejects it with "Requested node configuration is not available" even on a fully idle cluster (confirmed: fails standalone with 28 nodes free; the split-2n and wide-decode at gpus_per_node 4 schedule fine). It was an 8-GPU-node template artifact that never reached sbatch before. Remove it (1k1k + 8k1k) and let the split-2n cover the low-latency end (conc extended down to 8). Add the 8k1k (isl 8192) scenario mirroring 1k1k with the two valid disagg shapes (split-2n + wide DP16 decode), MNNVL KV transfer on both, seq params retuned for long context (max-model-len 9472) and lower concurrency. 
--- .github/configs/nvidia-master.yaml | 47 +++++-- .../1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml | 128 ------------------ .../8k1k/disagg-gb300-1p-dep16dec-5n.yaml | 112 +++++++++++++++ ....yaml => disagg-gb300-1p1d-tp4ep4-2n.yaml} | 14 +- 4 files changed, 151 insertions(+), 150 deletions(-) delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p-dep16dec-5n.yaml rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/{disagg-gb300-1p1d-tp4ep4-1n.yaml => disagg-gb300-1p1d-tp4ep4-2n.yaml} (91%) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 7abaf7160..d349b9ed5 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11790,36 +11790,57 @@ minimaxm3-fp8-gb300-dynamo-vllm: - isl: 1024 osl: 1024 search-space: - # Disagg-only sweep. Every recipe enables NixlConnector KV transfer over + # Disagg-only. Every recipe enables NixlConnector KV transfer over # multi-node NVLink (UCX_CUDA_IPC_ENABLE_MNNVL=y + --enable-cumem-allocator), # which moved the cross-node prefill->decode KV handoff off the RDMA/TCP - # fallback (~268 MB/s) onto the NVL fabric (~1.4-1.7 GB/s) for +17-49% - # tok/s/gpu on M3 — a GB300-only win (B300's 8-GPU IB islands cannot move - # KV over multi-node NVLink). - # Low / mid latency: 1P+1D TP4+EP4 collocated, 1 node (8 GPU). - - conc-list: [8, 16, 32, 64, 128, 256] + # fallback (~268 MB/s) onto the NVL fabric (~1.4-1.7 GB/s): +17/+23/+49% + # out tok/s/gpu @ conc 64/128/256 (1P1D). GB300-only win — B300's 8-GPU + # IB islands cannot move KV over multi-node NVLink. GB300-cw is 4 GPU/node, + # so prefill and decode each take whole 4-GPU nodes (no 8-GPU collocation). + # 1P+1D TP4+EP4 split, 2 nodes (8 GPU) - low -> mid. 
+ - conc-list: [8, 16, 32, 64, 128, 256, 512] prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml" decode: num-worker: 1 tp: 4 ep: 4 dp-attn: false - # Mid throughput: 1P+1D TP4+EP4 split across 2 nodes (8 GPU). - - conc-list: [64, 128, 256, 512] + # Max throughput: 1 prefill (TP4+EP4) + wide DP16+EP decode, 5 nodes (20 GPU). Decode keeps scaling on the NVL fabric where 1P1D saturates: ~1213 vs ~810 out tok/s/gpu @ conc 1024. + - conc-list: [512, 1024, 2048] prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p-dep16dec-5n.yaml" + decode: + num-worker: 1 + tp: 1 + ep: 16 + dp-attn: true + + - isl: 8192 + osl: 1024 + search-space: + # 8k1k long context: prefill is heavier and KV is larger, so concurrency + # is lower than 1k1k. Same disagg shapes + multi-node-NVLink KV transfer. + # 1P+1D TP4+EP4 split, 2 nodes (8 GPU). + - conc-list: [16, 32, 64, 128, 256] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml" decode: num-worker: 1 tp: 4 @@ -11827,16 +11848,14 @@ minimaxm3-fp8-gb300-dynamo-vllm: dp-attn: false # Max throughput: 1 prefill (TP4+EP4) + wide DP16+EP decode, 5 nodes (20 GPU). - # Decode keeps scaling on the NVL fabric where 1P1D saturates: - # ~1213 out tok/s/gpu @ conc 1024 vs the 1P1D ~810 plateau. 
- - conc-list: [512, 1024, 2048] + - conc-list: [256, 512, 1024] prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p-dep16dec-5n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p-dep16dec-5n.yaml" decode: num-worker: 1 tp: 1 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml deleted file mode 100644 index f4bd604f6..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml +++ /dev/null @@ -1,128 +0,0 @@ -name: "minimax-m3-vllm-disagg-gb300-1p1d-tp4ep4-1k1k" - -# MiniMax-M3 disaggregated 1P+1D recipe for GB300, collocated on 1 node. -# 8 GPUs per GB300 node: 4 for prefill, 4 for decode. -# Prefill (TP4+EP4) → NixlConnector → Decode (TP4+EP4). -# --block-size 128 is mandatory (MSA sparse/index cache alignment). - -model: - path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the -# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups -# OOM-kill during engine init; srun_options.mem=0 alone only grants a -# step what the job already holds. cpus-per-task=72 (one NUMA socket) -# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default. -# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml. 
-sbatch_directives: - mem: "0" - cpus-per-task: "72" -srun_options: - mem: "0" - - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - -extra_mount: - - "__M3_HF_HOME__:__M3_HF_HOME__" - -resources: - gpu_type: "gb300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - allow_prefill_decode_colocation: true - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - HF_HOME: "__M3_HF_HOME__" - # Cache-only at runtime: the launcher pre-stages the full snapshot and - # verifies it offline before submit, so workers must NOT re-fetch. Without - # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache; - # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the - # loser dies with "Lock acquisition failed" (no retry like Python hf_hub). - HF_HUB_OFFLINE: "1" - # Multi-node-NVLink KV transfer: keep cross-node prefill->decode KV on the - # NVL fabric (5-6x vs RDMA/TCP fallback, +17-49% tok/s/gpu). Needs VMM- - # registered KV (enable-cumem-allocator below). - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - HF_HOME: "__M3_HF_HOME__" - # Cache-only at runtime: the launcher pre-stages the full snapshot and - # verifies it offline before submit, so workers must NOT re-fetch. Without - # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache; - # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the - # loser dies with "Lock acquisition failed" (no retry like Python hf_hub). - HF_HUB_OFFLINE: "1" - # Multi-node-NVLink KV transfer: keep cross-node prefill->decode KV on the - # NVL fabric (5-6x vs RDMA/TCP fallback, +17-49% tok/s/gpu). 
Needs VMM- - # registered KV (enable-cumem-allocator below). - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-expert-parallel: true - enable-cumem-allocator: true - enforce-eager: true - max-model-len: 2304 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - block-size: 128 - language-model-only: true - gpu-memory-utilization: 0.9 - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-expert-parallel: true - enable-cumem-allocator: true - max-model-len: 2304 - max-num-seqs: 256 - max-num-batched-tokens: 256 - max-cudagraph-capture-size: 512 - block-size: 128 - language-model-only: true - gpu-memory-utilization: 0.9 - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "64x128x256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p-dep16dec-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p-dep16dec-5n.yaml new file mode 100644 index 000000000..2771039e8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p-dep16dec-5n.yaml @@ -0,0 +1,112 @@ +name: "minimax-m3-vllm-disagg-gb300-1p-dep16dec-5n-8k1k" + +# Combined fix + scale-up: MNNVL KV transfer ON + wide-EP decode. +# Iter 2 showed DP16 decode starved because cross-node KV ran on RDMA/TCP +# (~40-120 MB/s, decode idle). Iter 3 showed UCX_CUDA_IPC_ENABLE_MNNVL=y + +# --enable-cumem-allocator lifts 1P1D KV transfer 5-6x (+17-23% tok/s/gpu). 
+# This recipe puts both together: 1 compact TP4 prefill -> NixlConnector +# over multi-node NVLink -> wide DP16+EP decode (16 GPUs / 4 nodes). Tests +# whether fast MNNVL KV transfer makes the wide decode pool (which B300 +# cannot build on one NVLink island) actually pay off. +# --block-size 128 mandatory (MSA cache); text-only -> language-model-only. + +model: + path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +sbatch_directives: + mem: "0" + cpus-per-task: "72" +srun_options: + mem: "0" + + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + +extra_mount: + - "__M3_HF_HOME__:__M3_HF_HOME__" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + HF_HOME: "__M3_HF_HOME__" + HF_HUB_OFFLINE: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + HF_HOME: "__M3_HF_HOME__" + HF_HUB_OFFLINE: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + enable-cumem-allocator: true + enforce-eager: true + max-model-len: 9472 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + 
tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-cumem-allocator: true + max-model-len: 9472 + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 512 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512x1024" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml similarity index 91% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml index 843f47951..1e0d67e3c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml @@ -1,9 +1,9 @@ -name: "minimax-m3-vllm-disagg-gb300-1p1d-tp4ep4-8k1k" +name: "minimax-m3-vllm-disagg-gb300-1p1d-tp4ep4-2n-8k1k" -# MiniMax-M3 disaggregated 1P+1D recipe for GB300, collocated on 1 node. -# 8 GPUs per GB300 node: 4 for prefill, 4 for decode. -# Prefill (TP4+EP4) → NixlConnector → Decode (TP4+EP4). +# MiniMax-M3 disaggregated 1P+1D recipe for GB300, 2 nodes. +# 1 node prefill (4 GPUs, TP4+EP4) → NixlConnector → 1 node decode (4 GPUs, TP4+EP4). # --block-size 128 is mandatory (MSA sparse/index cache alignment). +# safetensors-load-strategy omitted — prefetch OOMs on CW GB300 host memory. 
model: path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" @@ -26,7 +26,6 @@ sbatch_directives: srun_options: mem: "0" - slurm: time_limit: "8:00:00" @@ -39,7 +38,7 @@ extra_mount: resources: gpu_type: "gb300" - gpus_per_node: 8 + gpus_per_node: 4 prefill_nodes: 1 decode_nodes: 1 prefill_workers: 1 @@ -54,7 +53,6 @@ frontend: backend: type: vllm connector: null - allow_prefill_decode_colocation: true prefill_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" @@ -125,4 +123,4 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "64x128x256" + concurrencies: "16x32x64x128x256" From 5df0669a3e04ea4427cb92c66691c576940f706a Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 13 Jun 2026 18:53:25 -0700 Subject: [PATCH 09/14] M3 GB300: add rack-saturating balanced-ratio TP-ep1 max-throughput disagg config Adds a 17-node (full-rack) disagg topology to the M3 GB300 sweep (1k1k + 8k1k) from on-cluster tuning (gb300-cw): - PREFILL is the binding bottleneck, not decode width or KV transfer: a single prefill worker left ~3967 reqs queued and starved 64 decode GPUs. Balancing to 5 prefill : 12 decode (TP4) cleared the backlog and lifted throughput +57% (535 -> 843 out tok/s/gpu @ conc 2048). - TP-only decode (ep1, no expert parallelism) per the Qwen3.5-397B-A17B recipes (closest M3 analog); M3 wide-EP/DP-attention all-to-all was slower and DP32 < DP16 per-GPU. - Kept the existing 1p1d (low/mid latency) and dep16dec (wide-decode) topologies so CI measures the full Pareto frontier rather than replacing them. NixlConnector KV transfer stays on multi-node NVLink (MNNVL + cumem); note KV transfer was verified NOT to bottleneck throughput (doubling its bandwidth via num_threads changed end-to-end tok/s/gpu by ~0). Recipe YAMLs line up 1:1 with the nvidia-master.yaml CONFIG_FILE references.
--- .github/configs/nvidia-master.yaml | 34 ++++++ .../1k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml | 102 ++++++++++++++++++ .../8k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml | 102 ++++++++++++++++++ 3 files changed, 238 insertions(+) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index d349b9ed5..fd235392b 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11827,6 +11827,23 @@ minimaxm3-fp8-gb300-dynamo-vllm: ep: 16 dp-attn: true + # Rack-saturating max throughput: 5 prefill + 12 decode, TP4 ep1 (17 nodes). + # Balanced prefill:decode ratio (single prefill starved the decode pool) + # + TP-only decode (Qwen3.5-397B-A17B pattern; M3 wide-EP all-to-all was slower). + - conc-list: [2048, 4096, 8192] + prefill: + num-worker: 5 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml" + decode: + num-worker: 12 + tp: 4 + ep: 1 + dp-attn: false + - isl: 8192 osl: 1024 search-space: @@ -11862,6 +11879,23 @@ minimaxm3-fp8-gb300-dynamo-vllm: ep: 16 dp-attn: true + # Rack-saturating max throughput: 5 prefill + 12 decode, TP4 ep1 (17 nodes). + # Balanced prefill:decode ratio (single prefill starved the decode pool) + # + TP-only decode (Qwen3.5-397B-A17B pattern; M3 wide-EP all-to-all was slower). + - conc-list: [1024, 2048, 4096] + prefill: + num-worker: 5 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml" + decode: + num-worker: 12 + tp: 4 + ep: 1 + dp-attn: false + # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
# 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml new file mode 100644 index 000000000..eb39f71f9 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml @@ -0,0 +1,102 @@ +name: "minimax-m3-vllm-disagg-gb300-5p12d-tp4ep1-1k1k" + +# MiniMax-M3 GB300 max-throughput disagg, rack-saturating (17 nodes / 68 GPU). +# Lessons baked in (validated on gb300-cw): (1) TP-only decode (ep1, NO expert +# parallelism) per Qwen3.5-397B-A17B recipes — M3's MoE all-to-all overhead +# made wide-EP/DP-attention slower; (2) PREFILL is the bottleneck, not decode +# or KV transfer — a single prefill starved 64 decode GPUs (3967 reqs queued), +# so this balances 5 prefill : 12 decode TP4 workers (prefill backlog -> 0, +# +57% tok/s/gpu vs the 1-prefill ratio at conc 2048). NixlConnector KV +# transfer stays on multi-node NVLink (MNNVL + cumem). --block-size 128 +# mandatory (MSA cache); text-only -> language-model-only. 
+ +model: + path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" +dynamo: + install: true + wheel: "1.2.0.dev20260526" +sbatch_directives: + mem: "0" + cpus-per-task: "72" +srun_options: + mem: "0" +slurm: + time_limit: "8:00:00" +health_check: + max_attempts: 720 + interval_seconds: 10 +extra_mount: + - "__M3_HF_HOME__:__M3_HF_HOME__" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 5 + decode_nodes: 12 + prefill_workers: 5 + decode_workers: 12 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + HF_HOME: "__M3_HF_HOME__" + HF_HUB_OFFLINE: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + HF_HOME: "__M3_HF_HOME__" + HF_HUB_OFFLINE: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-cumem-allocator: true + enforce-eager: true + max-num-seqs: 16 + max-num-batched-tokens: 16384 + max-model-len: 2304 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-cumem-allocator: true + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 512 + max-model-len: 2304 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + 
concurrencies: "2048x4096x8192" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml new file mode 100644 index 000000000..7cd84fbf1 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml @@ -0,0 +1,102 @@ +name: "minimax-m3-vllm-disagg-gb300-5p12d-tp4ep1-8k1k" + +# MiniMax-M3 GB300 max-throughput disagg, rack-saturating (17 nodes / 68 GPU). +# Lessons baked in (validated on gb300-cw): (1) TP-only decode (ep1, NO expert +# parallelism) per Qwen3.5-397B-A17B recipes — M3's MoE all-to-all overhead +# made wide-EP/DP-attention slower; (2) PREFILL is the bottleneck, not decode +# or KV transfer — a single prefill starved 64 decode GPUs (3967 reqs queued), +# so this balances 5 prefill : 12 decode TP4 workers (prefill backlog -> 0, +# +57% tok/s/gpu vs the 1-prefill ratio at conc 2048). NixlConnector KV +# transfer stays on multi-node NVLink (MNNVL + cumem). --block-size 128 +# mandatory (MSA cache); text-only -> language-model-only. 
+ +model: + path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" +dynamo: + install: true + wheel: "1.2.0.dev20260526" +sbatch_directives: + mem: "0" + cpus-per-task: "72" +srun_options: + mem: "0" +slurm: + time_limit: "8:00:00" +health_check: + max_attempts: 720 + interval_seconds: 10 +extra_mount: + - "__M3_HF_HOME__:__M3_HF_HOME__" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 5 + decode_nodes: 12 + prefill_workers: 5 + decode_workers: 12 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + HF_HOME: "__M3_HF_HOME__" + HF_HUB_OFFLINE: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + HF_HOME: "__M3_HF_HOME__" + HF_HUB_OFFLINE: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-cumem-allocator: true + enforce-eager: true + max-num-seqs: 16 + max-num-batched-tokens: 16384 + max-model-len: 9472 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-cumem-allocator: true + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 512 + max-model-len: 9472 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + 
concurrencies: "1024x2048x4096" From 62fe18d3784f89f4ff2271a6da34cdd7dddbbe14 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 13 Jun 2026 22:38:01 -0700 Subject: [PATCH 10/14] M3 GB300: replace dep16dec with 1P4D TP4-ep1; add prefill-heavy 10P7D for 8k1k MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DSR1 GB300 patterns show wide-EP decode hurts M3's MoE all-to-all; independent TP4 decode workers are strictly better. Also, 8k1k is prefill-bound (616-req backlog at 5P:12D) — rebalance to 10P:7D per DSR1/DSV4's prefill-heavy long-context ratios. Changes: - Replace dep16dec (EP16 single decode) with 1P+4D (4x TP4 ep1 decode) for both 1k1k and 8k1k, same 5 nodes - Add 10P+7D TP4 ep1 (17 nodes) for 8k1k max throughput - Tighten concurrency ranges: 1P1D [4-32], 1P4D [64-512], 5P12D/10P7D [1024+] --- .github/configs/nvidia-master.yaml | 64 +++++++----- ....yaml => disagg-gb300-1p4d-tp4ep1-5n.yaml} | 38 +++---- .../8k1k/disagg-gb300-10p7d-tp4ep1-17n.yaml | 99 +++++++++++++++++++ ....yaml => disagg-gb300-1p4d-tp4ep1-5n.yaml} | 38 +++---- 4 files changed, 164 insertions(+), 75 deletions(-) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/{disagg-gb300-1p-dep16dec-5n.yaml => disagg-gb300-1p4d-tp4ep1-5n.yaml} (68%) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-10p7d-tp4ep1-17n.yaml rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/{disagg-gb300-1p-dep16dec-5n.yaml => disagg-gb300-1p4d-tp4ep1-5n.yaml} (68%) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index fd235392b..0fa0079b3 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11797,8 +11797,8 @@ minimaxm3-fp8-gb300-dynamo-vllm: # out tok/s/gpu @ conc 64/128/256 (1P1D). 
GB300-only win — B300's 8-GPU # IB islands cannot move KV over multi-node NVLink. GB300-cw is 4 GPU/node, # so prefill and decode each take whole 4-GPU nodes (no 8-GPU collocation). - # 1P+1D TP4+EP4 split, 2 nodes (8 GPU) - low -> mid. - - conc-list: [8, 16, 32, 64, 128, 256, 512] + # Low latency: 1P+1D TP4+EP4 split, 2 nodes (8 GPU). + - conc-list: [4, 8, 16, 32] prefill: num-worker: 1 tp: 4 @@ -11812,20 +11812,22 @@ minimaxm3-fp8-gb300-dynamo-vllm: ep: 4 dp-attn: false - # Max throughput: 1 prefill (TP4+EP4) + wide DP16+EP decode, 5 nodes (20 GPU). Decode keeps scaling on the NVL fabric where 1P1D saturates: ~1213 vs ~810 out tok/s/gpu @ conc 1024. - - conc-list: [512, 1024, 2048] + # Mid curve: 1P + 4D, TP4 ep1, 5 nodes (20 GPU). DSR1 pattern: multiple + # independent TP4 decode workers instead of wide-EP decode (M3's MoE + # all-to-all makes wide EP slower). + - conc-list: [64, 128, 256, 512] prefill: num-worker: 1 tp: 4 - ep: 4 + ep: 1 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p-dep16dec-5n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml" decode: - num-worker: 1 - tp: 1 - ep: 16 - dp-attn: true + num-worker: 4 + tp: 4 + ep: 1 + dp-attn: false # Rack-saturating max throughput: 5 prefill + 12 decode, TP4 ep1 (17 nodes). # Balanced prefill:decode ratio (single prefill starved the decode pool) @@ -11849,8 +11851,8 @@ minimaxm3-fp8-gb300-dynamo-vllm: search-space: # 8k1k long context: prefill is heavier and KV is larger, so concurrency # is lower than 1k1k. Same disagg shapes + multi-node-NVLink KV transfer. - # 1P+1D TP4+EP4 split, 2 nodes (8 GPU). - - conc-list: [16, 32, 64, 128, 256] + # Low latency: 1P+1D TP4+EP4 split, 2 nodes (8 GPU). + - conc-list: [4, 8, 16] prefill: num-worker: 1 tp: 4 @@ -11864,24 +11866,23 @@ minimaxm3-fp8-gb300-dynamo-vllm: ep: 4 dp-attn: false - # Max throughput: 1 prefill (TP4+EP4) + wide DP16+EP decode, 5 nodes (20 GPU). 
- - conc-list: [256, 512, 1024] + # Mid curve: 1P + 4D, TP4 ep1, 5 nodes (20 GPU). Multiple independent + # TP4 decode workers instead of wide-EP decode. + - conc-list: [32, 64, 128, 256] prefill: num-worker: 1 tp: 4 - ep: 4 + ep: 1 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p-dep16dec-5n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml" decode: - num-worker: 1 - tp: 1 - ep: 16 - dp-attn: true + num-worker: 4 + tp: 4 + ep: 1 + dp-attn: false - # Rack-saturating max throughput: 5 prefill + 12 decode, TP4 ep1 (17 nodes). - # Balanced prefill:decode ratio (single prefill starved the decode pool) - # + TP-only decode (Qwen3.5-A17B pattern; M3 wide-EP all-to-all was slower). + # Rack-saturating: 5 prefill + 12 decode, TP4 ep1 (17 nodes). - conc-list: [1024, 2048, 4096] prefill: num-worker: 5 @@ -11896,6 +11897,23 @@ minimaxm3-fp8-gb300-dynamo-vllm: ep: 1 dp-attn: false + # Prefill-heavy rack-saturating: 10P + 7D, TP4 ep1 (17 nodes). + # At 8k context, prefill is 8x heavier — 5P:12D showed 616-req prefill + # backlog. DSR1/DSV4 GB300 patterns use 6-10 prefill workers for 8k1k. + - conc-list: [1024, 2048, 4096] + prefill: + num-worker: 10 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-10p7d-tp4ep1-17n.yaml" + decode: + num-worker: 7 + tp: 4 + ep: 1 + dp-attn: false + # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). 
# 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p-dep16dec-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml similarity index 68% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p-dep16dec-5n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml index 434007451..c9bb30163 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p-dep16dec-5n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml @@ -1,38 +1,28 @@ -name: "minimax-m3-vllm-disagg-gb300-1p-dep16dec-5n-1k1k" +name: "minimax-m3-vllm-disagg-gb300-1p4d-tp4ep1-1k1k" -# Combined fix + scale-up: MNNVL KV transfer ON + wide-EP decode. -# Iter 2 showed DP16 decode starved because cross-node KV ran on RDMA/TCP -# (~40-120 MB/s, decode idle). Iter 3 showed UCX_CUDA_IPC_ENABLE_MNNVL=y + -# --enable-cumem-allocator lifts 1P1D KV transfer 5-6x (+17-23% tok/s/gpu). -# This recipe puts both together: 1 compact TP4 prefill -> NixlConnector -# over multi-node NVLink -> wide DP16+EP decode (16 GPUs / 4 nodes). Tests -# whether fast MNNVL KV transfer makes the wide decode pool (which B300 -# cannot build on one NVLink island) actually pay off. -# --block-size 128 mandatory (MSA cache); text-only -> language-model-only. +# MiniMax-M3 GB300 mid-curve disagg: 1 prefill + 4 decode, TP4 ep1, 5 nodes. +# DSR1 low-latency pattern adapted for M3: multiple independent TP4 decode +# workers instead of wide-EP decode (dep16dec). M3's MoE all-to-all overhead +# makes wide EP slower than independent TP4 workers. +# MNNVL KV transfer + cumem. 
--block-size 128 mandatory (MSA cache). model: path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" container: "vllm/vllm-openai:minimax-m3" precision: "fp8" - dynamo: install: true wheel: "1.2.0.dev20260526" - sbatch_directives: mem: "0" cpus-per-task: "72" srun_options: mem: "0" - - slurm: time_limit: "8:00:00" - health_check: max_attempts: 720 interval_seconds: 10 - extra_mount: - "__M3_HF_HOME__:__M3_HF_HOME__" @@ -42,9 +32,9 @@ resources: prefill_nodes: 1 decode_nodes: 4 prefill_workers: 1 - decode_workers: 1 + decode_workers: 4 gpus_per_prefill: 4 - gpus_per_decode: 16 + gpus_per_decode: 4 frontend: type: dynamo @@ -73,12 +63,11 @@ backend: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 4 pipeline-parallel-size: 1 - enable-expert-parallel: true enable-cumem-allocator: true enforce-eager: true - max-model-len: 2304 max-num-seqs: 16 max-num-batched-tokens: 16384 + max-model-len: 2304 block-size: 128 language-model-only: true gpu-memory-utilization: 0.9 @@ -88,16 +77,13 @@ backend: decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 1 + tensor-parallel-size: 4 pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true enable-cumem-allocator: true - max-model-len: 2304 max-num-seqs: 256 max-num-batched-tokens: 256 max-cudagraph-capture-size: 512 + max-model-len: 2304 block-size: 128 language-model-only: true gpu-memory-utilization: 0.9 @@ -109,4 +95,4 @@ benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "512x1024x2048" + concurrencies: "64x128x256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-10p7d-tp4ep1-17n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-10p7d-tp4ep1-17n.yaml new file mode 100644 index 000000000..f005aa6dc --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-10p7d-tp4ep1-17n.yaml @@ -0,0 +1,99 @@ +name: "minimax-m3-vllm-disagg-gb300-10p7d-tp4ep1-8k1k" + +# MiniMax-M3 GB300 prefill-heavy max-throughput disagg, 17 nodes / 68 GPU. +# At 8k context, prefill is 8x heavier per request: the 5P:12D ratio (tuned +# for light 1k1k prefill) is prefill-bound — 616 reqs queued behind 5 workers. +# This config rebalances to 10P:7D per DSR1/DSV4 GB300 patterns (DSR1 8k1k +# uses 6P:1D; DSV4 uses 10P:8D at 18 nodes). TP4 ep1 decode (no wide EP — +# M3's MoE all-to-all overhead makes it slower). MNNVL KV + cumem. + +model: + path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" +dynamo: + install: true + wheel: "1.2.0.dev20260526" +sbatch_directives: + mem: "0" + cpus-per-task: "72" +srun_options: + mem: "0" +slurm: + time_limit: "8:00:00" +health_check: + max_attempts: 720 + interval_seconds: 10 +extra_mount: + - "__M3_HF_HOME__:__M3_HF_HOME__" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 10 + decode_nodes: 7 + prefill_workers: 10 + decode_workers: 7 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + HF_HOME: "__M3_HF_HOME__" + HF_HUB_OFFLINE: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + HF_HOME: "__M3_HF_HOME__" + HF_HUB_OFFLINE: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-cumem-allocator: true + enforce-eager: true + max-num-seqs: 16 + max-num-batched-tokens: 16384 + max-model-len: 9472 + block-size: 128 + 
language-model-only: true + gpu-memory-utilization: 0.9 + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-cumem-allocator: true + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 512 + max-model-len: 9472 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1024x2048x4096" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p-dep16dec-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml similarity index 68% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p-dep16dec-5n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml index 2771039e8..86dbfca17 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p-dep16dec-5n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml @@ -1,38 +1,28 @@ -name: "minimax-m3-vllm-disagg-gb300-1p-dep16dec-5n-8k1k" +name: "minimax-m3-vllm-disagg-gb300-1p4d-tp4ep1-8k1k" -# Combined fix + scale-up: MNNVL KV transfer ON + wide-EP decode. -# Iter 2 showed DP16 decode starved because cross-node KV ran on RDMA/TCP -# (~40-120 MB/s, decode idle). Iter 3 showed UCX_CUDA_IPC_ENABLE_MNNVL=y + -# --enable-cumem-allocator lifts 1P1D KV transfer 5-6x (+17-23% tok/s/gpu). -# This recipe puts both together: 1 compact TP4 prefill -> NixlConnector -# over multi-node NVLink -> wide DP16+EP decode (16 GPUs / 4 nodes). 
Tests -# whether fast MNNVL KV transfer makes the wide decode pool (which B300 -# cannot build on one NVLink island) actually pay off. -# --block-size 128 mandatory (MSA cache); text-only -> language-model-only. +# MiniMax-M3 GB300 mid-curve disagg: 1 prefill + 4 decode, TP4 ep1, 5 nodes. +# DSR1 low-latency pattern adapted for M3: multiple independent TP4 decode +# workers instead of wide-EP decode (dep16dec). M3's MoE all-to-all overhead +# makes wide EP slower than independent TP4 workers. +# MNNVL KV transfer + cumem. --block-size 128 mandatory (MSA cache). model: path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" container: "vllm/vllm-openai:minimax-m3" precision: "fp8" - dynamo: install: true wheel: "1.2.0.dev20260526" - sbatch_directives: mem: "0" cpus-per-task: "72" srun_options: mem: "0" - - slurm: time_limit: "8:00:00" - health_check: max_attempts: 720 interval_seconds: 10 - extra_mount: - "__M3_HF_HOME__:__M3_HF_HOME__" @@ -42,9 +32,9 @@ resources: prefill_nodes: 1 decode_nodes: 4 prefill_workers: 1 - decode_workers: 1 + decode_workers: 4 gpus_per_prefill: 4 - gpus_per_decode: 16 + gpus_per_decode: 4 frontend: type: dynamo @@ -73,12 +63,11 @@ backend: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 4 pipeline-parallel-size: 1 - enable-expert-parallel: true enable-cumem-allocator: true enforce-eager: true - max-model-len: 9472 max-num-seqs: 16 max-num-batched-tokens: 16384 + max-model-len: 9472 block-size: 128 language-model-only: true gpu-memory-utilization: 0.9 @@ -88,16 +77,13 @@ backend: decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 1 + tensor-parallel-size: 4 pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true enable-cumem-allocator: true - max-model-len: 9472 max-num-seqs: 256 max-num-batched-tokens: 256 max-cudagraph-capture-size: 512 + max-model-len: 9472 block-size: 128 
language-model-only: true gpu-memory-utilization: 0.9 @@ -109,4 +95,4 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "256x512x1024" + concurrencies: "32x64x128x256" From 1d71f496cfbe400481bd56c91be63f72d9ace933 Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Sun, 14 Jun 2026 01:49:20 -0400 Subject: [PATCH 11/14] [Klaud Cold]minimaxm3-fp8-mi300x-vllm-mtp: day-zero MiniMax-M3 EAGLE3 (MTP) MI300X recipe (#1749) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * minimaxm3-fp8-mi300x-vllm-mtp: day-zero MiniMax-M3 EAGLE3 MI300X recipe Adds the spec-decoding=mtp sibling of minimaxm3-fp8-mi300x-vllm, based on the MI300X non-MTP recipe + the MI355X MTP recipe. Keeps the MI300X serve shape (BF16 KV cache — gfx942 lacks calibrated ROCm FP8 attention scales — plus --no-enable-prefix-caching, TRITON_ATTN, --enforce-eager, minimax_m3 parsers) and adds the Inferact/MiniMax-M3-EAGLE3 draft via --speculative-config (method eagle3, 3 spec tokens) + chat-template prompts. Carries the same in-place EAGLE3 patch as the MI355X MTP recipe: the shipped ROCm image's AMD MiniMax-M3 model lacks SupportsEagle3, so the recipe patches the installed amd/model.py before serving (functionstackx/vllm#1, upstream vllm-project/vllm#45546; validated green on MI355X). Idempotent; hard-fails on base drift. TP8-only search space (gfx942 192 GB is memory-tight, like H100), TP8 latency rows started at conc 1, matching the H100/MI355X MTP recipes. Also adds SPEC_SUFFIX to launch_mi300x-amds.sh so spec-decoding=mtp routes to the _mtp script (the launcher hardcoded _mi300x.sh). 
Co-Authored-By: Claude Fable 5 * perf-changelog: fill in PR link for minimaxm3-fp8-mi300x-vllm-mtp (#1749) Co-Authored-By: Claude Fable 5 --------- Co-authored-by: Claude Fable 5 --- .github/configs/amd-master.yaml | 30 +++ .../fixed_seq_len/minimaxm3_fp8_mi300x_mtp.sh | 212 ++++++++++++++++++ perf-changelog.yaml | 10 + runners/launch_mi300x-amds.sh | 6 +- 4 files changed, 257 insertions(+), 1 deletion(-) create mode 100644 benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x_mtp.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 2528798b1..f18b3f94e 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2871,3 +2871,33 @@ minimaxm3-fp8-mi300x-vllm: search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - { tp: 8, ep: 8, conc-start: 128, conc-end: 256 } + +# EAGLE3 speculative-decoding (spec-decoding: mtp) variant of +# minimaxm3-fp8-mi300x-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the +# Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens). Same TP8-only +# search space as the non-MTP MI300X entry (gfx942 192 GB is memory-tight, like +# H100), with the TP8 latency rows started at conc 1 to capture single-request +# latency — matching the H100/MI355X MTP recipes. The shipped ROCm image lacks +# SupportsEagle3 on the AMD MiniMax-M3 model, so the recipe applies that fix +# in-place at runtime (functionstackx/vllm#1, upstream vllm-project/vllm#45546; +# validated green on MI355X) before serving. 
+minimaxm3-fp8-mi300x-vllm-mtp: + image: vllm/vllm-openai-rocm:minimax-m3 + model: MiniMaxAI/MiniMax-M3-MXFP8 + model-prefix: minimaxm3 + runner: mi300x + precision: fp8 + framework: vllm + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 1, conc-end: 128, spec-decoding: mtp } + - { tp: 8, ep: 8, conc-start: 256, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp } + - { tp: 8, ep: 8, conc-start: 128, conc-end: 256, spec-decoding: mtp } diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x_mtp.sh new file mode 100644 index 000000000..9dd10b30a --- /dev/null +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x_mtp.sh @@ -0,0 +1,212 @@ +#!/usr/bin/env bash + +# MiniMax-M3 MXFP8 MI300X (gfx942) single-node vLLM recipe with EAGLE3 +# speculative decoding — the spec-decoding=mtp variant of +# minimaxm3_fp8_mi300x.sh. Adds the Inferact/MiniMax-M3-EAGLE3 draft head via +# --speculative-config with 3 speculative tokens. Everything else mirrors the +# non-MTP MI300X recipe: mandatory --block-size 128, --language-model-only for +# the text-only benchmark, --attention-backend TRITON_ATTN, --enforce-eager, +# and --no-enable-prefix-caching. The default BF16 KV cache is retained (unlike +# the MI355X recipe's FP8 KV cache): gfx942 has no calibrated q/prob scales for +# ROCm FP8 attention and vLLM's fallback scale of 1.0 corrupts accuracy. +# +# Unlike the CUDA recipes, the drafter needs no attention_backend override: +# the FlashInfer "page size 128 requires GQA/MQA" limitation that forced +# FLASH_ATTN for the EAGLE3 MHA head on Blackwell is FlashInfer/CUDA-specific. +# Here the whole server runs on TRITON_ATTN (set globally below), which serves +# the MHA draft fine. 
+# +# [AI generated draft test] The shipped vllm/vllm-openai-rocm:minimax-m3 image +# does NOT implement SupportsEagle3 on the AMD MiniMax-M3 model, so EAGLE3 +# engine init fails with "Model does not support EAGLE3 interface but +# aux_hidden_state_outputs was requested". This recipe applies that fix +# (functionstackx/vllm#1 — ported from nvidia/model.py, upstreamed as +# vllm-project/vllm#45546) in-place to the installed vllm before serving, so we +# can validate EAGLE3 on real MI300X hardware ahead of an image rebuild. The +# same patch is validated green on MI355X. It is idempotent and fails the job +# loudly if the installed amd/model.py has drifted from the expected base. + +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + EP_SIZE \ + DP_ATTENTION \ + CONC \ + ISL \ + OSL \ + MAX_MODEL_LEN \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +DRAFT_MODEL="Inferact/MiniMax-M3-EAGLE3" + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +# MODEL is a bare HF id on the mi300x single-node runner (a fast cache hit when +# pre-staged). The EAGLE3 draft is not staged; fetch it into the same cache. 
+if [[ "$MODEL" != /* ]]; then + hf download "$MODEL" + hf download "$DRAFT_MODEL" +fi + +if [ -n "$ROCR_VISIBLE_DEVICES" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + +SERVER_LOG=/workspace/server.log +export VLLM_ENGINE_READY_TIMEOUT_S=3600 + +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context +fi + +PARALLEL_ARGS=(--tensor-parallel-size "$TP") +if [ "${DP_ATTENTION}" = "true" ]; then + PARALLEL_ARGS=( + --tensor-parallel-size 1 + --data-parallel-size "$TP" + --enable-expert-parallel + ) +elif [ "$EP_SIZE" -gt 1 ]; then + PARALLEL_ARGS+=(--enable-expert-parallel) +fi + +# use 3 speculative tokens for all configs for now +NUM_SPEC_TOKENS=3 + +# [AI generated draft test] Patch the installed AMD MiniMax-M3 model to add the +# SupportsEagle3 interface (functionstackx/vllm#1, upstream vllm-project/vllm#45546). +# Mirrors nvidia/model.py: adds EagleModelMixin to the inner model + +# aux-hidden-state emission, and SupportsEagle3 to the two outer classes. +# Idempotent; hard-fails if the installed file has drifted from the expected +# base (so we never silently run unpatched and mislabel the result). 
+python3 - <<'PYEOF' || { echo "EAGLE3 in-place patch failed" >&2; exit 1; } +import ast, importlib.util, pathlib, sys + +spec = importlib.util.find_spec("vllm") +root = pathlib.Path(spec.submodule_search_locations[0]) +target = root / "models" / "minimax_m3" / "amd" / "model.py" +src = target.read_text() + +if "EagleModelMixin" in src and "class MiniMaxM3Model(nn.Module, EagleModelMixin):" in src: + print(f"[eagle3-patch] already applied: {target}") + sys.exit(0) + +edits = [ + ( + "from vllm.model_executor.models.interfaces import (\n" + " MultiModalEmbeddings,\n" + " SupportsMultiModal,\n" + ")", + "from vllm.model_executor.models.interfaces import (\n" + " EagleModelMixin,\n" + " MultiModalEmbeddings,\n" + " SupportsEagle3,\n" + " SupportsMultiModal,\n" + ")", + ), + ( + "class MiniMaxM3Model(nn.Module):", + "class MiniMaxM3Model(nn.Module, EagleModelMixin):", + ), + ( + " inputs_embeds: torch.Tensor | None = None,\n" + " ) -> torch.Tensor:\n" + " if inputs_embeds is not None:", + " inputs_embeds: torch.Tensor | None = None,\n" + " ) -> torch.Tensor | tuple[torch.Tensor, list[torch.Tensor]]:\n" + " if inputs_embeds is not None:", + ), + ( + " residual = None\n\n" + " for layer in self.layers[self.start_layer : self.end_layer]:\n" + " hidden_states, residual = layer(positions, hidden_states, residual)\n\n" + " hidden_states, _ = self.norm(hidden_states, residual)\n" + " return hidden_states", + " residual = None\n\n" + " # EAGLE3 is not yet compatible with pipeline parallel\n" + " aux_hidden_states = self._maybe_add_hidden_state([], 0, hidden_states, residual)\n" + " for idx, layer in enumerate(self.layers[self.start_layer : self.end_layer]):\n" + " hidden_states, residual = layer(positions, hidden_states, residual)\n" + " self._maybe_add_hidden_state(\n" + " aux_hidden_states, idx + 1, hidden_states, residual\n" + " )\n\n" + " hidden_states, _ = self.norm(hidden_states, residual)\n\n" + " if len(aux_hidden_states) > 0:\n" + " return hidden_states, 
aux_hidden_states\n" + " return hidden_states", + ), + ( + "class MiniMaxM3SparseForCausalLM(nn.Module):", + "class MiniMaxM3SparseForCausalLM(nn.Module, SupportsEagle3):", + ), + ( + "class MiniMaxM3SparseForConditionalGeneration(nn.Module, SupportsMultiModal):", + "class MiniMaxM3SparseForConditionalGeneration(\n" + " nn.Module, SupportsMultiModal, SupportsEagle3\n" + "):", + ), +] + +for old, new in edits: + count = src.count(old) + if count != 1: + sys.exit( + f"[eagle3-patch] anchor matched {count} times (expected 1); " + f"installed {target} has drifted from the expected base — aborting" + ) + src = src.replace(old, new) + +ast.parse(src) +target.write_text(src) +print(f"[eagle3-patch] applied EAGLE3 support to {target}") +PYEOF + +start_gpu_monitor + +set -x +vllm serve "$MODEL" --port "$PORT" \ + "${PARALLEL_ARGS[@]}" \ + --block-size 128 \ + --no-enable-prefix-caching \ + --language-model-only \ + --max-model-len "$MAX_MODEL_LEN" \ + --attention-backend TRITON_ATTN \ + --enforce-eager \ + --speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \ + --tool-call-parser minimax_m3 \ + --reasoning-parser minimax_m3 \ + --enable-auto-tool-choice > "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +# Spec-decode acceptance rate degrades on raw random tokens; route prompts +# through the chat template as the other MTP recipes do. 
+run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --trust-remote-code \ + --use-chat-template + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +stop_gpu_monitor +set +x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 1029c5700..168ce234c 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3758,3 +3758,13 @@ - "H100-aligned layouts and concurrency ranges: TP8 and TP8+EP8 across 1k1k and 8k1k" - "Fix launch_mi300x-amds.sh node exclusion to use the current short Slurm node name" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1746 + +- config-keys: + - minimaxm3-fp8-mi300x-vllm-mtp + description: + - "Initial submission: MiniMax-M3 MXFP8 MI300X (gfx942) vLLM benchmark with EAGLE3 speculative decoding (target: MiniMaxAI/MiniMax-M3-MXFP8, draft: Inferact/MiniMax-M3-EAGLE3, 3 speculative tokens) — spec-decoding=mtp variant of the MI300X day-zero recipe" + - "Image: vllm/vllm-openai-rocm:minimax-m3 (same day-zero ROCm build as the non-MTP entry)" + - "Serve shape follows minimaxm3-fp8-mi300x-vllm: --block-size 128, --no-enable-prefix-caching, --language-model-only, --attention-backend TRITON_ATTN, --enforce-eager, minimax_m3 parsers, and the default BF16 KV cache (gfx942 lacks calibrated ROCm FP8 attention scales); prompts routed through the chat template for realistic acceptance" + - "TP8-only search space (gfx942 192 GB is memory-tight, like H100): TP8 latency rows started at conc 1, TP8+EP8 (TEP) at high concurrency, across 1k1k and 8k1k" + - "[AI generated draft test] The shipped ROCm image's AMD MiniMax-M3 model lacks SupportsEagle3, so the recipe patches it in-place at runtime (functionstackx/vllm#1, 
upstream vllm-project/vllm#45546; validated green on MI355X) before serving — validates EAGLE3 on MI300X ahead of an image rebuild" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1749 diff --git a/runners/launch_mi300x-amds.sh b/runners/launch_mi300x-amds.sh index ce04ceadd..b0c1e22c8 100644 --- a/runners/launch_mi300x-amds.sh +++ b/runners/launch_mi300x-amds.sh @@ -7,6 +7,10 @@ PARTITION="compute" SQUASH_FILE="/home/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" LOCK_FILE="${SQUASH_FILE}.lock" +# Route spec-decoding=mtp configs to the _mtp benchmark script (parity with +# the h200 launchers, which have carried SPEC_SUFFIX since #392). +SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') + set -x # Exclude known-bad nodes; let Slurm pick from anything else: @@ -37,6 +41,6 @@ srun --jobid=$JOB_ID \ --container-remap-root \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_mi300x.sh +bash benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_mi300x${SPEC_SUFFIX}.sh scancel $JOB_ID From 2bf5851d65bc68cb8510e2a0f545b694f4d1b5c3 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sun, 14 Jun 2026 01:43:22 -0500 Subject: [PATCH 12/14] [AMD] perf: enable MiniMax M3 CUDA graphs on MI300X (#1750) * feat: add MiniMax M3 MI300X day-zero benchmark * chore: link MiniMax M3 MI300X changelog * fix: mount ROCm devices on MI300X * fix: disable prefix caching for MI300X MiniMax M3 * fix: use bf16 kv cache for MI300X MiniMax M3 * perf: enable MI300X MiniMax M3 CUDA graphs * chore: link MI300X CUDA graph changelog --- .../single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh | 2 +- perf-changelog.yaml | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh index 
e3522e00a..f2cdaf284 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh @@ -32,6 +32,7 @@ fi SERVER_LOG=/workspace/server.log export VLLM_ENGINE_READY_TIMEOUT_S=3600 +export VLLM_USE_BREAKABLE_CUDAGRAPH=0 if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context @@ -58,7 +59,6 @@ vllm serve "$MODEL" --port "$PORT" \ --language-model-only \ --max-model-len "$MAX_MODEL_LEN" \ --attention-backend TRITON_ATTN \ - --enforce-eager \ --tool-call-parser minimax_m3 \ --reasoning-parser minimax_m3 \ --enable-auto-tool-choice > "$SERVER_LOG" 2>&1 & diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 168ce234c..65c43c0d7 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3768,3 +3768,9 @@ - "TP8-only search space (gfx942 192 GB is memory-tight, like H100): TP8 latency rows started at conc 1, TP8+EP8 (TEP) at high concurrency, across 1k1k and 8k1k" - "[AI generated draft test] The shipped ROCm image's AMD MiniMax-M3 model lacks SupportsEagle3, so the recipe patches it in-place at runtime (functionstackx/vllm#1, upstream vllm-project/vllm#45546; validated green on MI355X) before serving — validates EAGLE3 on MI300X ahead of an image rebuild" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1749 + +- config-keys: + - minimaxm3-fp8-mi300x-vllm + description: + - "Enable CUDA graphs for MiniMax-M3 MXFP8 on MI300X and set VLLM_USE_BREAKABLE_CUDAGRAPH=0 per AMD guidance" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1750 From fd922a6a524c1d31e9dac8e6c9dce10b7748a6af Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Sun, 14 Jun 2026 04:29:37 -0400 Subject: [PATCH 13/14] [Klaud Cold] minimaxm3-fp8-mi300x-vllm-mtp: run with CUDA graphs (drop --enforce-eager, VLLM_USE_BREAKABLE_CUDAGRAPH=0) (#1756) * minimaxm3-fp8-mi300x-vllm-mtp: run with CUDA graphs (drop --enforce-eager) Remove --enforce-eager from 
the MI300X EAGLE3 MTP recipe and set VLLM_USE_BREAKABLE_CUDAGRAPH=0, matching the non-MTP MI300X recipe (#1750). Avoids the M3-decode breakable-cudagraph path that previously forced eager execution. Re-sweeps minimaxm3-fp8-mi300x-vllm-mtp. Co-Authored-By: Claude Fable 5 * perf-changelog: fill in PR link for minimaxm3-fp8-mi300x-vllm-mtp cudagraphs Co-Authored-By: Claude Fable 5 --------- Co-authored-by: Claude Fable 5 --- .../single_node/fixed_seq_len/minimaxm3_fp8_mi300x_mtp.sh | 8 +++++--- perf-changelog.yaml | 7 +++++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x_mtp.sh index 9dd10b30a..40fbab536 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x_mtp.sh @@ -5,8 +5,10 @@ # minimaxm3_fp8_mi300x.sh. Adds the Inferact/MiniMax-M3-EAGLE3 draft head via # --speculative-config with 3 speculative tokens. Everything else mirrors the # non-MTP MI300X recipe: mandatory --block-size 128, --language-model-only for -# the text-only benchmark, --attention-backend TRITON_ATTN, --enforce-eager, -# and --no-enable-prefix-caching. The default BF16 KV cache is retained (unlike +# the text-only benchmark, --attention-backend TRITON_ATTN, and +# --no-enable-prefix-caching. Runs with CUDA graphs (no --enforce-eager); +# VLLM_USE_BREAKABLE_CUDAGRAPH=0 avoids the M3-decode breakable-cudagraph path. +# The default BF16 KV cache is retained (unlike # the MI355X recipe's FP8 KV cache): gfx942 has no calibrated q/prob scales for # ROCm FP8 attention and vLLM's fallback scale of 1.0 corrupts accuracy. 
# @@ -59,6 +61,7 @@ fi SERVER_LOG=/workspace/server.log export VLLM_ENGINE_READY_TIMEOUT_S=3600 +export VLLM_USE_BREAKABLE_CUDAGRAPH=0 if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context @@ -176,7 +179,6 @@ vllm serve "$MODEL" --port "$PORT" \ --language-model-only \ --max-model-len "$MAX_MODEL_LEN" \ --attention-backend TRITON_ATTN \ - --enforce-eager \ --speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \ --tool-call-parser minimax_m3 \ --reasoning-parser minimax_m3 \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 65c43c0d7..0b9067114 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3774,3 +3774,10 @@ description: - "Enable CUDA graphs for MiniMax-M3 MXFP8 on MI300X and set VLLM_USE_BREAKABLE_CUDAGRAPH=0 per AMD guidance" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1750 + +- config-keys: + - minimaxm3-fp8-mi300x-vllm-mtp + description: + - "Run the MiniMax-M3 MXFP8 MI300X EAGLE3 MTP recipe with CUDA graphs instead of --enforce-eager" + - "Drop --enforce-eager and set VLLM_USE_BREAKABLE_CUDAGRAPH=0 (matching the non-MTP MI300X recipe, #1750), which avoids the M3-decode breakable-cudagraph path that previously forced eager execution" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1756 From 805dc1ce4614ee0f5690aa48d973d5839e06295a Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sun, 14 Jun 2026 01:55:51 -0700 Subject: [PATCH 14/14] M3 GB300: drop dominated configs, restore 1P1D full range MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Data from run 27489709722 showed: - 1P4D (20 GPU) strictly dominated by 1P1D (8 GPU): 320 vs 974 out/s/gpu @ conc 128 (1k1k). Single prefill can't feed 4 decode workers — 1P:4D ratio is too decode-heavy. - 8k1k 5P12D (68 GPU) dominated by 10P7D: 567 vs 874 out/s/gpu @ conc 1024. 
Prefill-heavy ratio is correct for long context. Changes: - Remove 1P4D recipes (both 1k1k and 8k1k) - Remove 8k1k 5P12D recipe (dominated by 10P7D) - Restore 1P1D to full concurrency range [8-512] 1k1k, [8-256] 8k1k (was truncated to [4-32] to avoid 1P4D overlap) Final GB300 configs: 1P1D (latency-to-mid) + rack-saturating (max tput) 1k1k: 1P1D [8-512] + 5P12D [2048-8192] 8k1k: 1P1D [8-256] + 10P7D [1024-4096] --- .github/configs/nvidia-master.yaml | 58 ++-------- .../1k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml | 98 ----------------- .../8k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml | 98 ----------------- .../8k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml | 102 ------------------ 4 files changed, 6 insertions(+), 350 deletions(-) delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 0fa0079b3..46772c123 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11797,8 +11797,9 @@ minimaxm3-fp8-gb300-dynamo-vllm: # out tok/s/gpu @ conc 64/128/256 (1P1D). GB300-only win — B300's 8-GPU # IB islands cannot move KV over multi-node NVLink. GB300-cw is 4 GPU/node, # so prefill and decode each take whole 4-GPU nodes (no 8-GPU collocation). - # Low latency: 1P+1D TP4+EP4 split, 2 nodes (8 GPU). - - conc-list: [4, 8, 16, 32] + # 1P+1D TP4+EP4 split, 2 nodes (8 GPU). Best per-GPU efficiency: + # peaks at ~1646 out/s/gpu @ conc 256 (1k1k). Covers latency-to-mid. 
+ - conc-list: [8, 16, 32, 64, 128, 256, 512] prefill: num-worker: 1 tp: 4 @@ -11812,23 +11813,6 @@ minimaxm3-fp8-gb300-dynamo-vllm: ep: 4 dp-attn: false - # Mid curve: 1P + 4D, TP4 ep1, 5 nodes (20 GPU). DSR1 pattern: multiple - # independent TP4 decode workers instead of wide-EP decode (M3's MoE - # all-to-all makes wide EP slower). - - conc-list: [64, 128, 256, 512] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml" - decode: - num-worker: 4 - tp: 4 - ep: 1 - dp-attn: false - # Rack-saturating max throughput: 5 prefill + 12 decode, TP4 ep1 (17 nodes). # Balanced prefill:decode ratio (single prefill starved the decode pool) # + TP-only decode (Qwen3.5-A17B pattern; M3 wide-EP all-to-all was slower). @@ -11851,8 +11835,9 @@ minimaxm3-fp8-gb300-dynamo-vllm: search-space: # 8k1k long context: prefill is heavier and KV is larger, so concurrency # is lower than 1k1k. Same disagg shapes + multi-node-NVLink KV transfer. - # Low latency: 1P+1D TP4+EP4 split, 2 nodes (8 GPU). - - conc-list: [4, 8, 16] + # 1P+1D TP4+EP4 split, 2 nodes (8 GPU). Best per-GPU efficiency: + # peaks at ~1209 out/s/gpu @ conc 256 (8k1k). Covers latency-to-mid. + - conc-list: [8, 16, 32, 64, 128, 256] prefill: num-worker: 1 tp: 4 @@ -11866,37 +11851,6 @@ minimaxm3-fp8-gb300-dynamo-vllm: ep: 4 dp-attn: false - # Mid curve: 1P + 4D, TP4 ep1, 5 nodes (20 GPU). Multiple independent - # TP4 decode workers instead of wide-EP decode. - - conc-list: [32, 64, 128, 256] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml" - decode: - num-worker: 4 - tp: 4 - ep: 1 - dp-attn: false - - # Rack-saturating: 5 prefill + 12 decode, TP4 ep1 (17 nodes). 
- - conc-list: [1024, 2048, 4096] - prefill: - num-worker: 5 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml" - decode: - num-worker: 12 - tp: 4 - ep: 1 - dp-attn: false - # Prefill-heavy rack-saturating: 10P + 7D, TP4 ep1 (17 nodes). # At 8k context, prefill is 8x heavier — 5P:12D showed 616-req prefill # backlog. DSR1/DSV4 GB300 patterns use 6-10 prefill workers for 8k1k. diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml deleted file mode 100644 index c9bb30163..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml +++ /dev/null @@ -1,98 +0,0 @@ -name: "minimax-m3-vllm-disagg-gb300-1p4d-tp4ep1-1k1k" - -# MiniMax-M3 GB300 mid-curve disagg: 1 prefill + 4 decode, TP4 ep1, 5 nodes. -# DSR1 low-latency pattern adapted for M3: multiple independent TP4 decode -# workers instead of wide-EP decode (dep16dec). M3's MoE all-to-all overhead -# makes wide EP slower than independent TP4 workers. -# MNNVL KV transfer + cumem. --block-size 128 mandatory (MSA cache). 
- -model: - path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3" - precision: "fp8" -dynamo: - install: true - wheel: "1.2.0.dev20260526" -sbatch_directives: - mem: "0" - cpus-per-task: "72" -srun_options: - mem: "0" -slurm: - time_limit: "8:00:00" -health_check: - max_attempts: 720 - interval_seconds: 10 -extra_mount: - - "__M3_HF_HOME__:__M3_HF_HOME__" - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 4 - prefill_workers: 1 - decode_workers: 4 - gpus_per_prefill: 4 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - HF_HOME: "__M3_HF_HOME__" - HF_HUB_OFFLINE: "1" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - HF_HOME: "__M3_HF_HOME__" - HF_HUB_OFFLINE: "1" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-cumem-allocator: true - enforce-eager: true - max-num-seqs: 16 - max-num-batched-tokens: 16384 - max-model-len: 2304 - block-size: 128 - language-model-only: true - gpu-memory-utilization: 0.9 - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-cumem-allocator: true - max-num-seqs: 256 - max-num-batched-tokens: 256 - max-cudagraph-capture-size: 512 - max-model-len: 2304 - block-size: 128 - language-model-only: true - gpu-memory-utilization: 0.9 - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - 
concurrencies: "32x64x128x256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml deleted file mode 100644 index 86dbfca17..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml +++ /dev/null @@ -1,98 +0,0 @@ -name: "minimax-m3-vllm-disagg-gb300-1p4d-tp4ep1-8k1k" - -# MiniMax-M3 GB300 mid-curve disagg: 1 prefill + 4 decode, TP4 ep1, 5 nodes. -# DSR1 low-latency pattern adapted for M3: multiple independent TP4 decode -# workers instead of wide-EP decode (dep16dec). M3's MoE all-to-all overhead -# makes wide EP slower than independent TP4 workers. -# MNNVL KV transfer + cumem. --block-size 128 mandatory (MSA cache). - -model: - path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3" - precision: "fp8" -dynamo: - install: true - wheel: "1.2.0.dev20260526" -sbatch_directives: - mem: "0" - cpus-per-task: "72" -srun_options: - mem: "0" -slurm: - time_limit: "8:00:00" -health_check: - max_attempts: 720 - interval_seconds: 10 -extra_mount: - - "__M3_HF_HOME__:__M3_HF_HOME__" - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 4 - prefill_workers: 1 - decode_workers: 4 - gpus_per_prefill: 4 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - HF_HOME: "__M3_HF_HOME__" - HF_HUB_OFFLINE: "1" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - HF_HOME: "__M3_HF_HOME__" - HF_HUB_OFFLINE: "1" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", 
"kv_role": "kv_both"}' - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-cumem-allocator: true - enforce-eager: true - max-num-seqs: 16 - max-num-batched-tokens: 16384 - max-model-len: 9472 - block-size: 128 - language-model-only: true - gpu-memory-utilization: 0.9 - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-cumem-allocator: true - max-num-seqs: 256 - max-num-batched-tokens: 256 - max-cudagraph-capture-size: 512 - max-model-len: 9472 - block-size: 128 - language-model-only: true - gpu-memory-utilization: 0.9 - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "32x64x128x256" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml deleted file mode 100644 index 7cd84fbf1..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml +++ /dev/null @@ -1,102 +0,0 @@ -name: "minimax-m3-vllm-disagg-gb300-5p12d-tp4ep1-8k1k" - -# MiniMax-M3 GB300 max-throughput disagg, rack-saturating (17 nodes / 68 GPU). -# Lessons baked in (validated on gb300-cw): (1) TP-only decode (ep1, NO expert -# parallelism) per Qwen3.5-397B-A17B recipes — M3's MoE all-to-all overhead -# made wide-EP/DP-attention slower; (2) PREFILL is the bottleneck, not decode -# or KV transfer — a single prefill starved 64 decode GPUs (3967 reqs queued), -# so this balances 5 prefill : 12 decode TP4 workers (prefill backlog -> 0, -# +57% tok/s/gpu vs the 1-prefill ratio at conc 2048). NixlConnector KV -# transfer stays on multi-node NVLink (MNNVL + cumem). 
--block-size 128 -# mandatory (MSA cache); text-only -> language-model-only. - -model: - path: "hf:MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3" - precision: "fp8" -dynamo: - install: true - wheel: "1.2.0.dev20260526" -sbatch_directives: - mem: "0" - cpus-per-task: "72" -srun_options: - mem: "0" -slurm: - time_limit: "8:00:00" -health_check: - max_attempts: 720 - interval_seconds: 10 -extra_mount: - - "__M3_HF_HOME__:__M3_HF_HOME__" - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 5 - decode_nodes: 12 - prefill_workers: 5 - decode_workers: 12 - gpus_per_prefill: 4 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - HF_HOME: "__M3_HF_HOME__" - HF_HUB_OFFLINE: "1" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - HF_HOME: "__M3_HF_HOME__" - HF_HUB_OFFLINE: "1" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-cumem-allocator: true - enforce-eager: true - max-num-seqs: 16 - max-num-batched-tokens: 16384 - max-model-len: 9472 - block-size: 128 - language-model-only: true - gpu-memory-utilization: 0.9 - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-cumem-allocator: true - max-num-seqs: 256 - max-num-batched-tokens: 256 - max-cudagraph-capture-size: 512 - max-model-len: 9472 - block-size: 128 - language-model-only: true - gpu-memory-utilization: 0.9 - trust-remote-code: true - no-enable-prefix-caching: true - 
stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1024x2048x4096"