From 965b046386dc8e24e25cefc75a2d3c996b981f17 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Fri, 12 Jun 2026 18:05:57 -0700
Subject: [PATCH 01/14] feat: MiniMax-M3 MXFP8 full sweep config for GB300

Add minimaxm3-fp8-gb300-dynamo-vllm to nvidia-master.yaml with 7
topologies covering the full concurrency range:
- TP4/TP8 (low latency, conc 4-64)
- TP4+EP4 agg + 1P+1D disagg 2-node + 1P+1D collocated (mid, conc 64-512)
- DEP4/DEP8 (high throughput, conc 256-2048)

All recipe YAMLs included under minimax-m3-gb300-fp8/{1k1k,8k1k}/.
GB300 recipes include srun_options mem=0 (CW DefMemPerCPU cgroup fix)
and omit safetensors-load-strategy prefetch (host-memory limit).
---
 .github/configs/nvidia-master.yaml            | 122 ++++++++++++++++++
 .../workflows/benchmark-multinode-tmpl.yml    |   5 +
 .../1k1k/agg-gb300-dep4-1n.yaml               |  82 ++++++++++++
 .../1k1k/agg-gb300-dep8-2n.yaml               |  82 ++++++++++++
 .../1k1k/agg-gb300-tp4-1n.yaml                |  79 ++++++++++++
 .../1k1k/agg-gb300-tp4ep4-1n.yaml             |  80 ++++++++++++
 .../1k1k/agg-gb300-tp8-2n.yaml                |  79 ++++++++++++
 .../1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml     |  97 ++++++++++++++
 .../1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml     |  95 ++++++++++++++
 .../8k1k/agg-gb300-dep8-2n.yaml               |  82 ++++++++++++
 .../8k1k/agg-gb300-tp4-1n.yaml                |  79 ++++++++++++
 .../8k1k/agg-gb300-tp4ep4-1n.yaml             |  80 ++++++++++++
 .../8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml     |  97 ++++++++++++++
 perf-changelog.yaml                           |  13 ++
 runners/launch_gb300-cw.sh                    |  51 +++++++-
 15 files changed, 1122 insertions(+), 1 deletion(-)
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 187824347..4f66dd392 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -11679,6 +11679,128 @@ minimaxm2.5-fp8-gb300-dynamo-vllm:
           ep: 4
           dp-attn: true
 
+# MiniMax-M3 GB300 full sweep — safetensors-load-strategy removed from all
+# GB300 recipes (host-memory OOM with prefetch on CW Grace Blackwell nodes).
+# srun_options mem=0 required (DefMemPerCPU=4096 cgroup limit).
+minimaxm3-fp8-gb300-dynamo-vllm:
+  image: vllm/vllm-openai:minimax-m3
+  model: MiniMaxAI/MiniMax-M3-MXFP8
+  model-prefix: minimaxm3
+  runner: gb300
+  precision: fp8
+  framework: dynamo-vllm
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # Low latency: TP=4 aggregated, 1 node (4 GPU).
+      - conc-list: [4, 8, 16, 32, 64]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml"
+        decode:
+          num-worker: 0
+          tp: 4
+          ep: 1
+          dp-attn: false
+
+      # Low latency: TP=8 aggregated, 2 nodes (8 GPU).
+      - conc-list: [4, 8, 16, 32, 64]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml"
+        decode:
+          num-worker: 0
+          tp: 8
+          ep: 1
+          dp-attn: false
+
+      # Mid curve: TP4+EP4 aggregated, 1 node (4 GPU).
+      - conc-list: [128, 256, 512]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml"
+        decode:
+          num-worker: 0
+          tp: 4
+          ep: 4
+          dp-attn: false
+
+      # Mid curve: 1P+1D disagg TP4+EP4, 2 nodes (4 GPU each).
+      - conc-list: [64, 128, 256, 512]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: false
+
+      # Mid curve: 1P+1D disagg TP4+EP4, collocated 1 node (8 GPU).
+      - conc-list: [64, 128, 256, 512]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: false
+
+      # High throughput: DEP=4 aggregated, 1 node (4 GPU).
+      - conc-list: [256, 512, 1024]
+        prefill:
+          num-worker: 1
+          tp: 1
+          ep: 4
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml"
+        decode:
+          num-worker: 0
+          tp: 1
+          ep: 4
+          dp-attn: true
+
+      # Max throughput: DEP=8 aggregated, 2 nodes (8 GPU).
+      - conc-list: [512, 1024, 2048]
+        prefill:
+          num-worker: 1
+          tp: 1
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml"
+        decode:
+          num-worker: 0
+          tp: 1
+          ep: 8
+          dp-attn: true
+
 # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
 # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
 # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor
diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml
index 81727ef39..85b399e6c 100644
--- a/.github/workflows/benchmark-multinode-tmpl.yml
+++ b/.github/workflows/benchmark-multinode-tmpl.yml
@@ -123,6 +123,11 @@ on:
 
 env:
   RANDOM_RANGE_RATIO: 0.8
+  # Day-zero models resolved via hf: ids download from the Hub inside the
+  # slurm job (srtctl pre-download + dynamo hub fetch). Anonymous requests
+  # get 429-rate-limited when several workers pull a 444 GB snapshot at
+  # once; sbatch/srun inherit this env so the token reaches the workers.
+  HF_TOKEN: ${{ secrets.INFERENCEX_OFFICIAL_RO_HF_TOKEN }}
   EXP_NAME: ${{ inputs.exp-name }}
   IMAGE: ${{ inputs.image }}
   MODEL_PREFIX: ${{ inputs.model-prefix }}
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml
new file mode 100644
index 000000000..58467b48d
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml
@@ -0,0 +1,82 @@
+name: "minimax-m3-vllm-agg-gb300-dep4-1n-1k1k"
+
+# MiniMax-M3 day-zero aggregated recipe
+# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). This recipe is an
+# aggregated shape. NixlConnector KV transfer against M3's MSA index
+# cache is exercised only by the sibling disagg-gb300-*.yaml recipes,
+# not here. model.path uses the hf: prefix — srtctl pre-downloads
+# the 444 GB MXFP8 snapshot once, on a compute node, into the shared
+# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable
+# shared-FS path and bind-mounts it via extra_mount).
+# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
+# benchmark is text-only, so language-model-only skips the vision
+# encoder. max-model-len / cudagraph capture / batched tokens are
+# trimmed to the fixed-seq-len scenario instead of the 1M default.
+
+model:
+  path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+srun_options:
+  mem: "0"
+
+
+slurm:
+  time_limit: "8:00:00"
+
+# 444 GB of weights off shared FS (cold HF cache on the first run):
+# allow up to 2 h for engine readiness.
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+extra_mount:
+  - "__M3_HF_HOME__:__M3_HF_HOME__"
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  agg_nodes: 1
+  agg_workers: 1
+  gpus_per_agg: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  aggregated_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    HF_HOME: "__M3_HF_HOME__"
+
+  vllm_config:
+    aggregated:
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 4
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 2304
+      max-num-batched-tokens: 2048
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 128
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "256x512x1024"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml
new file mode 100644
index 000000000..5842a2aec
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml
@@ -0,0 +1,82 @@
+name: "minimax-m3-vllm-agg-gb300-dep8-2n-1k1k"
+
+# MiniMax-M3 day-zero aggregated recipe
+# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). This recipe is an
+# aggregated shape. NixlConnector KV transfer against M3's MSA index
+# cache is exercised only by the sibling disagg-gb300-*.yaml recipes,
+# not here. model.path uses the hf: prefix — srtctl pre-downloads
+# the 444 GB MXFP8 snapshot once, on a compute node, into the shared
+# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable
+# shared-FS path and bind-mounts it via extra_mount).
+# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
+# benchmark is text-only, so language-model-only skips the vision
+# encoder. max-model-len / cudagraph capture / batched tokens are
+# trimmed to the fixed-seq-len scenario instead of the 1M default.
+
+model:
+  path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+srun_options:
+  mem: "0"
+
+
+slurm:
+  time_limit: "8:00:00"
+
+# 444 GB of weights off shared FS (cold HF cache on the first run):
+# allow up to 2 h for engine readiness.
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+extra_mount:
+  - "__M3_HF_HOME__:__M3_HF_HOME__"
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  agg_nodes: 2
+  agg_workers: 1
+  gpus_per_agg: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  aggregated_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    HF_HOME: "__M3_HF_HOME__"
+
+  vllm_config:
+    aggregated:
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 2304
+      max-num-batched-tokens: 2048
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 128
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "512x1024x2048"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml
new file mode 100644
index 000000000..d6f981ab6
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml
@@ -0,0 +1,79 @@
+name: "minimax-m3-vllm-agg-gb300-tp4-1n-1k1k"
+
+# MiniMax-M3 day-zero aggregated recipe
+# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). This recipe is an
+# aggregated shape. NixlConnector KV transfer against M3's MSA index
+# cache is exercised only by the sibling disagg-gb300-*.yaml recipes,
+# not here. model.path uses the hf: prefix — srtctl pre-downloads
+# the 444 GB MXFP8 snapshot once, on a compute node, into the shared
+# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable
+# shared-FS path and bind-mounts it via extra_mount).
+# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
+# benchmark is text-only, so language-model-only skips the vision
+# encoder. max-model-len / cudagraph capture / batched tokens are
+# trimmed to the fixed-seq-len scenario instead of the 1M default.
+
+model:
+  path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+srun_options:
+  mem: "0"
+
+
+slurm:
+  time_limit: "8:00:00"
+
+# 444 GB of weights off shared FS (cold HF cache on the first run):
+# allow up to 2 h for engine readiness.
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+extra_mount:
+  - "__M3_HF_HOME__:__M3_HF_HOME__"
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  agg_nodes: 1
+  agg_workers: 1
+  gpus_per_agg: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  aggregated_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    HF_HOME: "__M3_HF_HOME__"
+
+  vllm_config:
+    aggregated:
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      max-model-len: 2304
+      max-num-batched-tokens: 2048
+      max-cudagraph-capture-size: 64
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4x8x16x32x64"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml
new file mode 100644
index 000000000..8f5bc8675
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml
@@ -0,0 +1,80 @@
+name: "minimax-m3-vllm-agg-gb300-tp4ep4-1n-1k1k"
+
+# MiniMax-M3 day-zero aggregated recipe
+# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). This recipe is an
+# aggregated shape. NixlConnector KV transfer against M3's MSA index
+# cache is exercised only by the sibling disagg-gb300-*.yaml recipes,
+# not here. model.path uses the hf: prefix — srtctl pre-downloads
+# the 444 GB MXFP8 snapshot once, on a compute node, into the shared
+# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable
+# shared-FS path and bind-mounts it via extra_mount).
+# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
+# benchmark is text-only, so language-model-only skips the vision
+# encoder. max-model-len / cudagraph capture / batched tokens are
+# trimmed to the fixed-seq-len scenario instead of the 1M default.
+
+model:
+  path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+srun_options:
+  mem: "0"
+
+
+slurm:
+  time_limit: "8:00:00"
+
+# 444 GB of weights off shared FS (cold HF cache on the first run):
+# allow up to 2 h for engine readiness.
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+extra_mount:
+  - "__M3_HF_HOME__:__M3_HF_HOME__"
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  agg_nodes: 1
+  agg_workers: 1
+  gpus_per_agg: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  aggregated_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    HF_HOME: "__M3_HF_HOME__"
+
+  vllm_config:
+    aggregated:
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      max-model-len: 2304
+      max-num-batched-tokens: 2048
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "128x256x512"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml
new file mode 100644
index 000000000..6ab7b8a61
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml
@@ -0,0 +1,79 @@
+name: "minimax-m3-vllm-agg-gb300-tp8-2n-1k1k"
+
+# MiniMax-M3 day-zero aggregated recipe
+# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). This recipe is an
+# aggregated shape. NixlConnector KV transfer against M3's MSA index
+# cache is exercised only by the sibling disagg-gb300-*.yaml recipes,
+# not here. model.path uses the hf: prefix — srtctl pre-downloads
+# the 444 GB MXFP8 snapshot once, on a compute node, into the shared
+# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable
+# shared-FS path and bind-mounts it via extra_mount).
+# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
+# benchmark is text-only, so language-model-only skips the vision
+# encoder. max-model-len / cudagraph capture / batched tokens are
+# trimmed to the fixed-seq-len scenario instead of the 1M default.
+
+model:
+  path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+srun_options:
+  mem: "0"
+
+
+slurm:
+  time_limit: "8:00:00"
+
+# 444 GB of weights off shared FS (cold HF cache on the first run):
+# allow up to 2 h for engine readiness.
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+extra_mount:
+  - "__M3_HF_HOME__:__M3_HF_HOME__"
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  agg_nodes: 2
+  agg_workers: 1
+  gpus_per_agg: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  aggregated_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    HF_HOME: "__M3_HF_HOME__"
+
+  vllm_config:
+    aggregated:
+      tensor-parallel-size: 8
+      pipeline-parallel-size: 1
+      max-model-len: 2304
+      max-num-batched-tokens: 2048
+      max-cudagraph-capture-size: 64
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4x8x16x32x64"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml
new file mode 100644
index 000000000..ac80ba8e9
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml
@@ -0,0 +1,97 @@
+name: "minimax-m3-vllm-disagg-gb300-1p1d-tp4ep4-1n-1k1k"
+
+# MiniMax-M3 disaggregated 1P+1D recipe for GB300, collocated on 1 node.
+# Assumes 8 GPUs/node (4 prefill + 4 decode); siblings use 4 — TODO confirm.
+# Prefill (TP4+EP4) → NixlConnector → Decode (TP4+EP4).
+# --block-size 128 is mandatory (MSA sparse/index cache alignment).
+
+model:
+  path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+srun_options:
+  mem: "0"
+
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+extra_mount:
+  - "__M3_HF_HOME__:__M3_HF_HOME__"
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 8
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+  allow_prefill_decode_colocation: true
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    HF_HOME: "__M3_HF_HOME__"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    HF_HOME: "__M3_HF_HOME__"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 2304
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      max-model-len: 2304
+      max-num-seqs: 256
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "64x128x256x512"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml
new file mode 100644
index 000000000..1bcbc0ac2
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml
@@ -0,0 +1,95 @@
+name: "minimax-m3-vllm-disagg-gb300-1p1d-tp4ep4-2n-1k1k"
+
+# MiniMax-M3 disaggregated 1P+1D recipe for GB300, 2 nodes.
+# 1 node prefill (4 GPUs, TP4+EP4) → NixlConnector → 1 node decode (4 GPUs, TP4+EP4).
+# --block-size 128 is mandatory (MSA sparse/index cache alignment).
+# safetensors-load-strategy omitted — prefetch OOMs on CW GB300 host memory.
+
+model:
+  path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+srun_options:
+  mem: "0"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+extra_mount:
+  - "__M3_HF_HOME__:__M3_HF_HOME__"
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    HF_HOME: "__M3_HF_HOME__"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    HF_HOME: "__M3_HF_HOME__"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 2304
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      max-model-len: 2304
+      max-num-seqs: 256
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "64x128x256x512"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml
new file mode 100644
index 000000000..d4d4af392
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml
@@ -0,0 +1,82 @@
+name: "minimax-m3-vllm-agg-gb300-dep8-2n-8k1k"
+
+# MiniMax-M3 day-zero aggregated recipe
+# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). This recipe is an
+# aggregated shape. NixlConnector KV transfer against M3's MSA index
+# cache is exercised only by the sibling disagg-gb300-*.yaml recipes,
+# not here. model.path uses the hf: prefix — srtctl pre-downloads
+# the 444 GB MXFP8 snapshot once, on a compute node, into the shared
+# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable
+# shared-FS path and bind-mounts it via extra_mount).
+# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
+# benchmark is text-only, so language-model-only skips the vision
+# encoder. max-model-len / cudagraph capture / batched tokens are
+# trimmed to the fixed-seq-len scenario instead of the 1M default.
+
+model:
+  path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+srun_options:
+  mem: "0"
+
+
+slurm:
+  time_limit: "8:00:00"
+
+# 444 GB of weights off shared FS (cold HF cache on the first run):
+# allow up to 2 h for engine readiness.
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+extra_mount:
+  - "__M3_HF_HOME__:__M3_HF_HOME__"
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  agg_nodes: 2
+  agg_workers: 1
+  gpus_per_agg: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  aggregated_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    HF_HOME: "__M3_HF_HOME__"
+
+  vllm_config:
+    aggregated:
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 9472
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 256
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 128
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "256x512x1024"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml
new file mode 100644
index 000000000..7d268187d
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml
@@ -0,0 +1,79 @@
+name: "minimax-m3-vllm-agg-gb300-tp4-1n-8k1k"
+
+# MiniMax-M3 day-zero aggregated recipe
+# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). This recipe is an
+# aggregated shape. NixlConnector KV transfer against M3's MSA index
+# cache is exercised only by the sibling disagg-gb300-*.yaml recipes,
+# not here. model.path uses the hf: prefix — srtctl pre-downloads
+# the 444 GB MXFP8 snapshot once, on a compute node, into the shared
+# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable
+# shared-FS path and bind-mounts it via extra_mount).
+# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
+# benchmark is text-only, so language-model-only skips the vision
+# encoder. max-model-len / cudagraph capture / batched tokens are
+# trimmed to the fixed-seq-len scenario instead of the 1M default.
+
+model:
+  path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+srun_options:
+  mem: "0"
+
+
+slurm:
+  time_limit: "8:00:00"
+
+# 444 GB of weights off shared FS (cold HF cache on the first run):
+# allow up to 2 h for engine readiness.
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+extra_mount:
+  - "__M3_HF_HOME__:__M3_HF_HOME__"
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  agg_nodes: 1
+  agg_workers: 1
+  gpus_per_agg: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  aggregated_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    HF_HOME: "__M3_HF_HOME__"
+
+  vllm_config:
+    aggregated:
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      max-model-len: 9472
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 64
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "4x8x16x32x64"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml
new file mode 100644
index 000000000..196981472
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml
@@ -0,0 +1,80 @@
+name: "minimax-m3-vllm-agg-gb300-tp4ep4-1n-8k1k"
+
+# MiniMax-M3 day-zero aggregated recipe
+# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). This file is the
+# aggregated shape; NixlConnector KV transfer against M3's MSA index
+# cache is untested so far; see the disagg-gb300-1p1d-* recipes.
+# model.path uses the hf: prefix — srtctl pre-downloads
+# the 444 GB MXFP8 snapshot once, on a compute node, into the shared
+# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable
+# shared-FS path and bind-mounts it via extra_mount).
+# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
+# benchmark is text-only, so language-model-only skips the vision
+# encoder. max-model-len / cudagraph capture / batched tokens are
+# trimmed to the fixed-seq-len scenario instead of the 1M default.
+
+model:
+  path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+srun_options:
+  mem: "0"
+
+
+slurm:
+  time_limit: "8:00:00"
+
+# 444 GB of weights off shared FS (cold HF cache on the first run):
+# allow up to 2 h for engine readiness.
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+extra_mount:
+  - "__M3_HF_HOME__:__M3_HF_HOME__"
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  agg_nodes: 1
+  agg_workers: 1
+  gpus_per_agg: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  aggregated_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    HF_HOME: "__M3_HF_HOME__"
+
+  vllm_config:
+    aggregated:
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      max-model-len: 9472
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "128x256x512"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml
new file mode 100644
index 000000000..e9c60933c
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml
@@ -0,0 +1,97 @@
+name: "minimax-m3-vllm-disagg-gb300-1p1d-tp4ep4-8k1k"
+
+# MiniMax-M3 disaggregated 1P+1D recipe for GB300, collocated on 1 node.
+# 8 GPUs per GB300 node: 4 for prefill, 4 for decode.
+# Prefill (TP4+EP4) → NixlConnector → Decode (TP4+EP4).
+# --block-size 128 is mandatory (MSA sparse/index cache alignment).
+
+model:
+  path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+srun_options:
+  mem: "0"
+
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+extra_mount:
+  - "__M3_HF_HOME__:__M3_HF_HOME__"
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 8
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+  allow_prefill_decode_colocation: true
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    HF_HOME: "__M3_HF_HOME__"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    HF_HOME: "__M3_HF_HOME__"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 9472
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      max-model-len: 9472
+      max-num-seqs: 256
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "64x128x256"
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index d29c9a5d3..fe804d0cc 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3646,3 +3646,16 @@
     - "Layouts: TP8 and TP4 (latency), TP4+EP4 / TP8+EP8 (TEP throughput), tp2-ep2, TP8+EP8 dp-attn (DEP) across 1k1k and 8k1k"
     - "Serves from the launch_b300-nv.sh MODEL/MODEL_PATH split (model not in the SRE-staged /scratch/models list -> writable /data/models)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1724
+
+- config-keys:
+    - minimaxm3-fp8-gb300-dynamo-vllm
+  description:
+    - "Initial submission: MiniMax-M3 MXFP8 day-zero vLLM sweep for GB300 via Dynamo"
+    - "Model: MiniMaxAI/MiniMax-M3-MXFP8 (427B total / 26B active MoE, MSA sparse attention, ~444 GB MXFP8 checkpoint)"
+    - "Image: vllm/vllm-openai:minimax-m3 (multi-arch arm64+amd64 from m3_release branch, vllm-project/vllm#45381)"
+    - "Dynamo orchestration with NixlConnector for disaggregated prefill/decode"
+    - "7 topologies: TP4 (1n), TP8 (2n), TP4+EP4 (1n), 1P+1D disagg TP4+EP4 (2n), 1P+1D disagg collocated (1n), DEP4 (1n), DEP8 (2n)"
+    - "Concurrency sweep: TP 4-64, TEP 128-512, disagg 64-512, DEP4 256-1024, DEP8 512-2048"
+    - "GB300 recipes require srun_options mem=0 (CW DefMemPerCPU=4096 causes host-RAM OOM) and omit safetensors-load-strategy prefetch (~600 GB host-memory limit)"
+    - "--block-size 128 mandatory everywhere (MSA sparse/index cache alignment); --language-model-only for text-only benchmarks"
+  pr-link: TBD
diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh
index 6a5c50e38..50dae4465 100644
--- a/runners/launch_gb300-cw.sh
+++ b/runners/launch_gb300-cw.sh
@@ -59,8 +59,24 @@ elif [[ $MODEL_PREFIX == "glm5" && $PRECISION == "fp8" ]]; then
         echo "Unsupported framework on gb300-cw for glm5/fp8: $FRAMEWORK. Currently supported: dynamo-sglang"
         exit 1
     fi
+elif [[ $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp8" ]]; then
+    # Day-zero: MiniMax-M3-MXFP8 is not staged on this cluster. The recipes
+    # carry an hf: model id directly, so srtctl pre-downloads the snapshot
+    # into the shared HF_HOME sed-injected below; MODEL_PATH only feeds the
+    # (unreferenced) model_paths aliases in srtslurm.yaml.
+    export MODEL_PATH="hf:MiniMaxAI/MiniMax-M3-MXFP8"
+
+    if [[ $FRAMEWORK == "dynamo-vllm" ]]; then
+        SRT_SLURM_RECIPES_REPO="https://github.com/NVIDIA/srt-slurm.git"
+        SRT_SLURM_RECIPES_REF="main"
+        SRT_RECIPE_SRC="$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8"
+        SRT_RECIPE_DST="recipes/vllm/minimax-m3-gb300-fp8"
+    else
+        echo "Unsupported framework on gb300-cw for minimaxm3/fp8: $FRAMEWORK. Currently supported: dynamo-vllm"
+        exit 1
+    fi
 else
-    echo "Unsupported model prefix/precision combination on gb300-cw: $MODEL_PREFIX/$PRECISION. Currently supported: dsv4/fp4, glm5/fp8"
+    echo "Unsupported model prefix/precision combination on gb300-cw: $MODEL_PREFIX/$PRECISION. Currently supported: dsv4/fp4, glm5/fp8, minimaxm3/fp8"
     exit 1
 fi
 
@@ -276,6 +292,39 @@ else
     mv "$TMP_CONFIG_FILE" "$CONFIG_FILE"
 fi
 
+# MiniMax-M3 day-zero: the recipes use an hf: model id and need a shared-FS
+# HF_HOME visible (and writable) on compute nodes for srtctl's one-time
+# pre-download of the 444 GB snapshot. /mnt/vast is the shared NFS that
+# already hosts models and squash files on this cluster.
+if [[ $MODEL_PREFIX == "minimaxm3" ]]; then
+    M3_HF_HOME="/mnt/vast/hf-home"
+    mkdir -p "$M3_HF_HOME"
+    sed -i "s|__M3_HF_HOME__|${M3_HF_HOME}|g" "$CONFIG_FILE"
+    # Dynamo's rust fetch_model gives up within seconds on ANY held .lock
+    # file ("Lock acquisition failed") — it doesn't wait like Python's
+    # huggingface_hub.  Concurrent GHA jobs race on the 444 GB download
+    # and create fresh locks that survive the old "mmin +10" cleanup.
+    # Fix: nuke ALL locks (safe — HF uses atomic rename from .incomplete),
+    # then force-download with the Python client (which DOES wait for
+    # locks) so srtctl's pre-download is a no-op and dynamo sees a fully
+    # cached snapshot with zero lock files.
+    find "$M3_HF_HOME" -name '*.lock' -delete 2>/dev/null || true
+    export HF_HOME="$M3_HF_HOME"
+    DL_CMD="huggingface-cli download"
+    command -v huggingface-cli >/dev/null 2>&1 || DL_CMD="hf download"
+    for _attempt in 1 2 3; do
+        if HF_HUB_OFFLINE=1 $DL_CMD MiniMaxAI/MiniMax-M3-MXFP8 --quiet 2>/dev/null; then
+            echo "MiniMax-M3-MXFP8 fully cached (verified offline)"
+            break
+        fi
+        echo "MiniMax-M3 cache incomplete, downloading (attempt $_attempt)..."
+        find "$M3_HF_HOME" -name '*.lock' -delete 2>/dev/null || true
+        $DL_CMD MiniMaxAI/MiniMax-M3-MXFP8 --quiet 2>&1 | tail -5 || true
+        sleep 5
+    done
+    find "$M3_HF_HOME" -name '*.lock' -delete 2>/dev/null || true
+fi
+
 SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "gb300,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1)
 echo "$SRTCTL_OUTPUT"
 

From e3fa89f3996ec115bae5a53cba57c1635c8b6cf0 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Fri, 12 Jun 2026 18:06:25 -0700
Subject: [PATCH 02/14] chore: update perf-changelog pr-link to #1735

---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index fe804d0cc..2c1292116 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3658,4 +3658,4 @@
     - "Concurrency sweep: TP 4-64, TEP 128-512, disagg 64-512, DEP4 256-1024, DEP8 512-2048"
     - "GB300 recipes require srun_options mem=0 (CW DefMemPerCPU=4096 causes host-RAM OOM) and omit safetensors-load-strategy prefetch (~600 GB host-memory limit)"
     - "--block-size 128 mandatory everywhere (MSA sparse/index cache alignment); --language-model-only for text-only benchmarks"
-  pr-link: TBD
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1735

From afc3f921abfb7ce333dcd3d8fe67f9dbbd299963 Mon Sep 17 00:00:00 2001
From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com>
Date: Fri, 12 Jun 2026 18:20:14 -0700
Subject: [PATCH 03/14] Update runner name in nvidia-master.yaml

---
 .github/configs/nvidia-master.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 7dbaa8f2e..5e85e9f3c 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -11686,7 +11686,7 @@ minimaxm3-fp8-gb300-dynamo-vllm:
   image: vllm/vllm-openai:minimax-m3
   model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
-  runner: gb300
+  runner: gb300-cw
   precision: fp8
   framework: dynamo-vllm
   multinode: true

From 99a075bfeeb7962d92b8b35fc02aa0cadbba6575 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Fri, 12 Jun 2026 18:48:09 -0700
Subject: [PATCH 04/14] fix: add sbatch_directives mem=0 + cpus-per-task=72 to
 M3 GB300 recipes

srun_options.mem=0 only grants a step the job's existing allocation; on
gb300-cw (DefMemPerCPU=4096, no DefCpuPerGPU) the job itself was only
allocated 4 GB/node and workers were cgroup-OOM-killed during engine
init (run 27452273567: oom_kill in StepId=7409.7 on slurm-gb300-133-193,
worker RLIMIT showed 4194304 KB). The canary passed because it landed on
gb300-nv, which doesn't enforce the cap. Mirrors the sbatch_directives
block of the DSV4 agentic recipes.
---
 .../minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml     | 9 +++++++++
 .../minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml     | 9 +++++++++
 .../vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml | 9 +++++++++
 .../minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml   | 9 +++++++++
 .../vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml | 9 +++++++++
 .../1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml                | 9 +++++++++
 .../1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml                | 9 +++++++++
 .../minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml     | 9 +++++++++
 .../vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml | 9 +++++++++
 .../minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml   | 9 +++++++++
 .../8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml                | 9 +++++++++
 11 files changed, 99 insertions(+)

diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml
index 58467b48d..0ce1485b2 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml
@@ -22,6 +22,15 @@ dynamo:
   install: true
   wheel: "1.2.0.dev20260526"
 
+# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the
+# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups
+# OOM-kill during engine init; srun_options.mem=0 alone only grants a
+# step what the job already holds. cpus-per-task=72 (one NUMA socket)
+# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default.
+# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml.
+sbatch_directives:
+  mem: "0"
+  cpus-per-task: "72"
 srun_options:
   mem: "0"
 
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml
index 5842a2aec..1afd90c56 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml
@@ -22,6 +22,15 @@ dynamo:
   install: true
   wheel: "1.2.0.dev20260526"
 
+# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the
+# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups
+# OOM-kill during engine init; srun_options.mem=0 alone only grants a
+# step what the job already holds. cpus-per-task=72 (one NUMA socket)
+# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default.
+# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml.
+sbatch_directives:
+  mem: "0"
+  cpus-per-task: "72"
 srun_options:
   mem: "0"
 
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml
index d6f981ab6..14596993f 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml
@@ -22,6 +22,15 @@ dynamo:
   install: true
   wheel: "1.2.0.dev20260526"
 
+# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the
+# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups
+# OOM-kill during engine init; srun_options.mem=0 alone only grants a
+# step what the job already holds. cpus-per-task=72 (one NUMA socket)
+# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default.
+# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml.
+sbatch_directives:
+  mem: "0"
+  cpus-per-task: "72"
 srun_options:
   mem: "0"
 
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml
index 8f5bc8675..d0e543c28 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml
@@ -22,6 +22,15 @@ dynamo:
   install: true
   wheel: "1.2.0.dev20260526"
 
+# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the
+# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups
+# OOM-kill during engine init; srun_options.mem=0 alone only grants a
+# step what the job already holds. cpus-per-task=72 (one NUMA socket)
+# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default.
+# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml.
+sbatch_directives:
+  mem: "0"
+  cpus-per-task: "72"
 srun_options:
   mem: "0"
 
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml
index 6ab7b8a61..31e706fb7 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml
@@ -22,6 +22,15 @@ dynamo:
   install: true
   wheel: "1.2.0.dev20260526"
 
+# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the
+# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups
+# OOM-kill during engine init; srun_options.mem=0 alone only grants a
+# step what the job already holds. cpus-per-task=72 (one NUMA socket)
+# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default.
+# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml.
+sbatch_directives:
+  mem: "0"
+  cpus-per-task: "72"
 srun_options:
   mem: "0"
 
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml
index ac80ba8e9..417c0958c 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml
@@ -14,6 +14,15 @@ dynamo:
   install: true
   wheel: "1.2.0.dev20260526"
 
+# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the
+# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups
+# OOM-kill during engine init; srun_options.mem=0 alone only grants a
+# step what the job already holds. cpus-per-task=72 (one NUMA socket)
+# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default.
+# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml.
+sbatch_directives:
+  mem: "0"
+  cpus-per-task: "72"
 srun_options:
   mem: "0"
 
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml
index 1bcbc0ac2..7ce075f5c 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml
@@ -14,6 +14,15 @@ dynamo:
   install: true
   wheel: "1.2.0.dev20260526"
 
+# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the
+# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups
+# OOM-kill during engine init; srun_options.mem=0 alone only grants a
+# step what the job already holds. cpus-per-task=72 (one NUMA socket)
+# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default.
+# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml.
+sbatch_directives:
+  mem: "0"
+  cpus-per-task: "72"
 srun_options:
   mem: "0"
 
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml
index d4d4af392..8451e0ad1 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml
@@ -22,6 +22,15 @@ dynamo:
   install: true
   wheel: "1.2.0.dev20260526"
 
+# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the
+# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups
+# OOM-kill during engine init; srun_options.mem=0 alone only grants a
+# step what the job already holds. cpus-per-task=72 (one NUMA socket)
+# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default.
+# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml.
+sbatch_directives:
+  mem: "0"
+  cpus-per-task: "72"
 srun_options:
   mem: "0"
 
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml
index 7d268187d..6bf18b5a7 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml
@@ -22,6 +22,15 @@ dynamo:
   install: true
   wheel: "1.2.0.dev20260526"
 
+# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the
+# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups
+# OOM-kill during engine init; srun_options.mem=0 alone only grants a
+# step what the job already holds. cpus-per-task=72 (one NUMA socket)
+# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default.
+# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml.
+sbatch_directives:
+  mem: "0"
+  cpus-per-task: "72"
 srun_options:
   mem: "0"
 
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml
index 196981472..a11b5c405 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml
@@ -22,6 +22,15 @@ dynamo:
   install: true
   wheel: "1.2.0.dev20260526"
 
+# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the
+# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups
+# OOM-kill during engine init; srun_options.mem=0 alone only grants a
+# step what the job already holds. cpus-per-task=72 (one NUMA socket)
+# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default.
+# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml.
+sbatch_directives:
+  mem: "0"
+  cpus-per-task: "72"
 srun_options:
   mem: "0"
 
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml
index e9c60933c..53d2b049b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml
@@ -14,6 +14,15 @@ dynamo:
   install: true
   wheel: "1.2.0.dev20260526"
 
+# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the
+# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups
+# OOM-kill during engine init; srun_options.mem=0 alone only grants a
+# step what the job already holds. cpus-per-task=72 (one NUMA socket)
+# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default.
+# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml.
+sbatch_directives:
+  mem: "0"
+  cpus-per-task: "72"
 srun_options:
   mem: "0"
 

From 26e2005bc27b25415735b1a51f2deaf203ff4177 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Fri, 12 Jun 2026 19:07:17 -0700
Subject: [PATCH 05/14] fix: run M3 GB300 workers cache-only (HF_HUB_OFFLINE=1)
 to avoid fetch_model lock race

With the mem fix in place, run 27452976271 cleared the OOM but hit a new
failure: both nodes of the TP8-2n job called dynamo fetch_model within
200ms (191 @ :23.637, 193 @ :23.833), 191 took the per-blob .lock on the
shared /mnt/vast/hf-home cache and held it verifying the 444 GB snapshot,
193 retried ~6.4s and died 'Lock acquisition failed' (dynamo's rust hub
doesn't wait like Python hf_hub). The launcher already pre-stages and
verifies the snapshot offline before submit, so the workers never need to
fetch. Setting HF_HUB_OFFLINE=1 in every worker env block makes dynamo
serve cache-only and skip the download lock entirely, so co-fetching
workers no longer collide. Applied to all agg + disagg (prefill/decode)
env blocks across the 11 recipes.
---
 .../minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml |  6 ++++++
 .../minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml |  6 ++++++
 .../minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml  |  6 ++++++
 .../1k1k/agg-gb300-tp4ep4-1n.yaml                    |  6 ++++++
 .../minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml  |  6 ++++++
 .../1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml            | 12 ++++++++++++
 .../1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml            | 12 ++++++++++++
 .../minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml |  6 ++++++
 .../minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml  |  6 ++++++
 .../8k1k/agg-gb300-tp4ep4-1n.yaml                    |  6 ++++++
 .../8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml            | 12 ++++++++++++
 11 files changed, 84 insertions(+)

diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml
index 0ce1485b2..f8cb4c161 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml
@@ -66,6 +66,12 @@ backend:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
     HF_HOME: "__M3_HF_HOME__"
+    # Cache-only at runtime: the launcher pre-stages the full snapshot and
+    # verifies it offline before submit, so workers must NOT re-fetch. Without
+    # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache;
+    # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
+    # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
+    HF_HUB_OFFLINE: "1"
 
   vllm_config:
     aggregated:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml
index 1afd90c56..3b380e36d 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml
@@ -66,6 +66,12 @@ backend:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
     HF_HOME: "__M3_HF_HOME__"
+    # Cache-only at runtime: the launcher pre-stages the full snapshot and
+    # verifies it offline before submit, so workers must NOT re-fetch. Without
+    # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache;
+    # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
+    # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
+    HF_HUB_OFFLINE: "1"
 
   vllm_config:
     aggregated:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml
index 14596993f..c156fb59d 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml
@@ -66,6 +66,12 @@ backend:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
     HF_HOME: "__M3_HF_HOME__"
+    # Cache-only at runtime: the launcher pre-stages the full snapshot and
+    # verifies it offline before submit, so workers must NOT re-fetch. Without
+    # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache;
+    # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
+    # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
+    HF_HUB_OFFLINE: "1"
 
   vllm_config:
     aggregated:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml
index d0e543c28..0e0b2280f 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml
@@ -66,6 +66,12 @@ backend:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
     HF_HOME: "__M3_HF_HOME__"
+    # Cache-only at runtime: the launcher pre-stages the full snapshot and
+    # verifies it offline before submit, so workers must NOT re-fetch. Without
+    # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache;
+    # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
+    # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
+    HF_HUB_OFFLINE: "1"
 
   vllm_config:
     aggregated:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml
index 31e706fb7..5884b7f6b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml
@@ -66,6 +66,12 @@ backend:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
     HF_HOME: "__M3_HF_HOME__"
+    # Cache-only at runtime: the launcher pre-stages the full snapshot and
+    # verifies it offline before submit, so workers must NOT re-fetch. Without
+    # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache;
+    # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
+    # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
+    HF_HUB_OFFLINE: "1"
 
   vllm_config:
     aggregated:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml
index 417c0958c..133b54846 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml
@@ -60,11 +60,23 @@ backend:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
     HF_HOME: "__M3_HF_HOME__"
+    # Cache-only at runtime: the launcher pre-stages the full snapshot and
+    # verifies it offline before submit, so workers must NOT re-fetch. Without
+    # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache;
+    # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
+    # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
+    HF_HUB_OFFLINE: "1"
 
   decode_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
     HF_HOME: "__M3_HF_HOME__"
+    # Cache-only at runtime: the launcher pre-stages the full snapshot and
+    # verifies it offline before submit, so workers must NOT re-fetch. Without
+    # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache;
+    # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
+    # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
+    HF_HUB_OFFLINE: "1"
 
   vllm_config:
     prefill:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml
index 7ce075f5c..d2a98b286 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml
@@ -58,11 +58,23 @@ backend:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
     HF_HOME: "__M3_HF_HOME__"
+    # Cache-only at runtime: the launcher pre-stages the full snapshot and
+    # verifies it offline before submit, so workers must NOT re-fetch. Without
+    # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache;
+    # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
+    # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
+    HF_HUB_OFFLINE: "1"
 
   decode_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
     HF_HOME: "__M3_HF_HOME__"
+    # Cache-only at runtime: the launcher pre-stages the full snapshot and
+    # verifies it offline before submit, so workers must NOT re-fetch. Without
+    # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache;
+    # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
+    # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
+    HF_HUB_OFFLINE: "1"
 
   vllm_config:
     prefill:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml
index 8451e0ad1..5d2330626 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml
@@ -66,6 +66,12 @@ backend:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
     HF_HOME: "__M3_HF_HOME__"
+    # Cache-only at runtime: the launcher pre-stages the full snapshot and
+    # verifies it offline before submit, so workers must NOT re-fetch. Without
+    # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache;
+    # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
+    # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
+    HF_HUB_OFFLINE: "1"
 
   vllm_config:
     aggregated:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml
index 6bf18b5a7..d32879cbd 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml
@@ -66,6 +66,12 @@ backend:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
     HF_HOME: "__M3_HF_HOME__"
+    # Cache-only at runtime: the launcher pre-stages the full snapshot and
+    # verifies it offline before submit, so workers must NOT re-fetch. Without
+    # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache;
+    # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
+    # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
+    HF_HUB_OFFLINE: "1"
 
   vllm_config:
     aggregated:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml
index a11b5c405..06c01619f 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml
@@ -66,6 +66,12 @@ backend:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
     HF_HOME: "__M3_HF_HOME__"
+    # Cache-only at runtime: the launcher pre-stages the full snapshot and
+    # verifies it offline before submit, so workers must NOT re-fetch. Without
+    # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache;
+    # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
+    # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
+    HF_HUB_OFFLINE: "1"
 
   vllm_config:
     aggregated:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml
index 53d2b049b..0c671cd62 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml
@@ -60,11 +60,23 @@ backend:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
     HF_HOME: "__M3_HF_HOME__"
+    # Cache-only at runtime: the launcher pre-stages the full snapshot and
+    # verifies it offline before submit, so workers must NOT re-fetch. Without
+    # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache;
+    # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
+    # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
+    HF_HUB_OFFLINE: "1"
 
   decode_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
     HF_HOME: "__M3_HF_HOME__"
+    # Cache-only at runtime: the launcher pre-stages the full snapshot and
+    # verifies it offline before submit, so workers must NOT re-fetch. Without
+    # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache;
+    # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
+    # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
+    HF_HUB_OFFLINE: "1"
 
   vllm_config:
     prefill:

From b660ddde1f46aa0ea68022b4f93a1758f943ee10 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Fri, 12 Jun 2026 21:54:51 -0700
Subject: [PATCH 06/14] fix: re-pin utils/aiperf to live cjq/agentx-v0.3 tip
 (ff2b646c)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous pin 062a5de9 (set by #1571 "chore: agentx v0.3") was the
cjq/agentx-v0.3 tip on 2026-06-02, but that branch was later rebased/
force-pushed (now at ff2b646c) which orphaned 062a5de9; GitHub has since
garbage-collected it. It is now unfetchable ("upload-pack: not our ref")
and absent from every CI runner cache, so actions/checkout fails on any
cold runner with "Unable to find current revision in submodule path
utils/aiperf" (e.g. the newly-added gb300-cw runner-4, run 27453693856).

Re-pin to the current tip of cjq/agentx-v0.3, the branch that .gitmodules
already declares; that tip is live/fetchable and contains the prior aiperf
history as an ancestor. This makes the pin and the declared branch
consistent again.
---
 utils/aiperf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/aiperf b/utils/aiperf
index 062a5de92..ff2b646c0 160000
--- a/utils/aiperf
+++ b/utils/aiperf
@@ -1 +1 @@
-Subproject commit 062a5de92c8ac8a0a6dd5d2a7fb9a539a147f3d9
+Subproject commit ff2b646c0425aff9307a0e73161b23d77003a357

From ef7c65080b99fa7596363cb3c769b88673196b9a Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Sat, 13 Jun 2026 14:14:26 -0700
Subject: [PATCH 07/14] MiniMax-M3 GB300: disagg-only sweep + multi-node-NVLink
 KV transfer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the aggregated M3 GB300 topologies with disaggregated-only, and
enable NixlConnector KV transfer over multi-node NVLink on every disagg
recipe. On gb300-cw the cross-node prefill->decode KV handoff was silently
falling back to RDMA/TCP (~268 MB/s, ~1400 tiny descriptors for M3's MSA
cache), and that fallback was the disagg throughput ceiling. Setting
UCX_CUDA_IPC_ENABLE_MNNVL=y plus --enable-cumem-allocator (which
VMM-registers KV so NIXL uses cuda_ipc across the NVL fabric) lifts the
transfer rate to ~1.4-1.7 GB/s and gives +17% / +23% / +49% output
tok/s/gpu at conc 64 / 128 / 256 (job 7490 baseline vs job 7493 with MNNVL,
1P1D TP4EP4). This is a GB300-only win: B300's 8-GPU IB islands cannot move
KV over multi-node NVLink.

Sweep (1k1k), all MNNVL:
- 1P1D TP4+EP4 collocated 1n (8 GPU), conc 8-256  - low/mid latency
- 1P1D TP4+EP4 split 2n (8 GPU),       conc 64-512 - mid throughput
- 1P (TP4+EP4) + wide DP16+EP decode 5n (20 GPU), conc 512-2048 - max throughput
  (decode keeps scaling on NVL where 1P1D saturates: ~1213 vs ~810
   output tok/s/gpu @ conc 1024)

Removes all agg-gb300 recipes (1k1k + 8k1k); applies MNNVL to the 8k1k
disagg recipe too for consistency.
---
 .github/configs/nvidia-master.yaml            |  86 +++-----------
 .../1k1k/agg-gb300-dep4-1n.yaml               |  97 ---------------
 .../1k1k/agg-gb300-dep8-2n.yaml               |  97 ---------------
 .../1k1k/agg-gb300-tp4-1n.yaml                |  94 ---------------
 .../1k1k/agg-gb300-tp4ep4-1n.yaml             |  95 ---------------
 .../1k1k/agg-gb300-tp8-2n.yaml                |  94 ---------------
 .../1k1k/disagg-gb300-1p-dep16dec-5n.yaml     | 112 ++++++++++++++++++
 .../1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml     |  10 ++
 .../1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml     |  10 ++
 .../8k1k/agg-gb300-dep8-2n.yaml               |  97 ---------------
 .../8k1k/agg-gb300-tp4-1n.yaml                |  94 ---------------
 .../8k1k/agg-gb300-tp4ep4-1n.yaml             |  95 ---------------
 .../8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml     |  10 ++
 13 files changed, 159 insertions(+), 832 deletions(-)
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p-dep16dec-5n.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index f3f720180..871518e5c 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -11790,52 +11790,28 @@ minimaxm3-fp8-gb300-dynamo-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-      # Low latency: TP=4 aggregated, 1 node (4 GPU).
-      - conc-list: [4, 8, 16, 32, 64]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml"
-        decode:
-          num-worker: 0
-          tp: 4
-          ep: 1
-          dp-attn: false
-
-      # Low latency: TP=8 aggregated, 2 nodes (8 GPU).
-      - conc-list: [4, 8, 16, 32, 64]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml"
-        decode:
-          num-worker: 0
-          tp: 8
-          ep: 1
-          dp-attn: false
-
-      # Mid curve: TP4+EP4 aggregated, 1 node (4 GPU).
-      - conc-list: [128, 256, 512]
+      # Disagg-only sweep. Every recipe enables NixlConnector KV transfer over
+      # multi-node NVLink (UCX_CUDA_IPC_ENABLE_MNNVL=y + --enable-cumem-allocator),
+      # which moved the cross-node prefill->decode KV handoff off the RDMA/TCP
+      # fallback (~268 MB/s) onto the NVL fabric (~1.4-1.7 GB/s) for +17-49%
+      # tok/s/gpu on M3 — a GB300-only win (B300's 8-GPU IB islands cannot move
+      # KV over multi-node NVLink).
+      # Low / mid latency: 1P+1D TP4+EP4 collocated, 1 node (8 GPU).
+      - conc-list: [8, 16, 32, 64, 128, 256]
         prefill:
           num-worker: 1
           tp: 4
           ep: 4
           dp-attn: false
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml"
         decode:
-          num-worker: 0
+          num-worker: 1
           tp: 4
           ep: 4
           dp-attn: false
 
-      # Mid curve: 1P+1D disagg TP4+EP4, 2 nodes (4 GPU each).
+      # Mid throughput: 1P+1D TP4+EP4 split across 2 nodes (8 GPU).
       - conc-list: [64, 128, 256, 512]
         prefill:
           num-worker: 1
@@ -11850,49 +11826,21 @@ minimaxm3-fp8-gb300-dynamo-vllm:
           ep: 4
           dp-attn: false
 
-      # Mid curve: 1P+1D disagg TP4+EP4, collocated 1 node (8 GPU).
-      - conc-list: [64, 128, 256, 512]
+      # Max throughput: 1 prefill (TP4+EP4) + wide DP16+EP decode, 5 nodes (20 GPU).
+      # Decode keeps scaling on the NVL fabric where 1P1D saturates:
+      # ~1213 out tok/s/gpu @ conc 1024 vs the 1P1D ~810 plateau.
+      - conc-list: [512, 1024, 2048]
         prefill:
           num-worker: 1
           tp: 4
           ep: 4
           dp-attn: false
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml"
-        decode:
-          num-worker: 1
-          tp: 4
-          ep: 4
-          dp-attn: false
-
-      # High throughput: DEP=4 aggregated, 1 node (4 GPU).
-      - conc-list: [256, 512, 1024]
-        prefill:
-          num-worker: 1
-          tp: 1
-          ep: 4
-          dp-attn: true
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p-dep16dec-5n.yaml"
         decode:
-          num-worker: 0
-          tp: 1
-          ep: 4
-          dp-attn: true
-
-      # Max throughput: DEP=8 aggregated, 2 nodes (8 GPU).
-      - conc-list: [512, 1024, 2048]
-        prefill:
           num-worker: 1
           tp: 1
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml"
-        decode:
-          num-worker: 0
-          tp: 1
-          ep: 8
+          ep: 16
           dp-attn: true
 
 # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml
deleted file mode 100644
index f8cb4c161..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep4-1n.yaml
+++ /dev/null
@@ -1,97 +0,0 @@
-name: "minimax-m3-vllm-agg-gb300-dep4-1n-1k1k"
-
-# MiniMax-M3 day-zero aggregated recipe
-# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on
-# day zero: NixlConnector KV transfer has never been exercised against
-# M3's MSA index cache, so disagg shapes are deferred until agg
-# baselines exist. model.path uses the hf: prefix — srtctl pre-downloads
-# the 444 GB MXFP8 snapshot once, on a compute node, into the shared
-# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable
-# shared-FS path and bind-mounts it via extra_mount).
-# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
-# benchmark is text-only, so language-model-only skips the vision
-# encoder. max-model-len / cudagraph capture / batched tokens are
-# trimmed to the fixed-seq-len scenario instead of the 1M default.
-
-model:
-  path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
-  container: "vllm/vllm-openai:minimax-m3"
-  precision: "fp8"
-
-dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
-
-# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the
-# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups
-# OOM-kill during engine init; srun_options.mem=0 alone only grants a
-# step what the job already holds. cpus-per-task=72 (one NUMA socket)
-# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default.
-# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml.
-sbatch_directives:
-  mem: "0"
-  cpus-per-task: "72"
-srun_options:
-  mem: "0"
-
-
-slurm:
-  time_limit: "8:00:00"
-
-# 444 GB of weights off shared FS (cold HF cache on the first run):
-# allow up to 2 h for engine readiness.
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-extra_mount:
-  - "__M3_HF_HOME__:__M3_HF_HOME__"
-
-resources:
-  gpu_type: "gb300"
-  gpus_per_node: 4
-  agg_nodes: 1
-  agg_workers: 1
-  gpus_per_agg: 4
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  aggregated_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    HF_HOME: "__M3_HF_HOME__"
-    # Cache-only at runtime: the launcher pre-stages the full snapshot and
-    # verifies it offline before submit, so workers must NOT re-fetch. Without
-    # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache;
-    # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
-    # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
-    HF_HUB_OFFLINE: "1"
-
-  vllm_config:
-    aggregated:
-      tensor-parallel-size: 1
-      pipeline-parallel-size: 1
-      data-parallel-size: 4
-      data-parallel-rpc-port: 13345
-      enable-expert-parallel: true
-      max-model-len: 2304
-      max-num-batched-tokens: 2048
-      max-cudagraph-capture-size: 512
-      block-size: 128
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 128
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "256x512x1024"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml
deleted file mode 100644
index 3b380e36d..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-dep8-2n.yaml
+++ /dev/null
@@ -1,97 +0,0 @@
-name: "minimax-m3-vllm-agg-gb300-dep8-2n-1k1k"
-
-# MiniMax-M3 day-zero aggregated recipe
-# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on
-# day zero: NixlConnector KV transfer has never been exercised against
-# M3's MSA index cache, so disagg shapes are deferred until agg
-# baselines exist. model.path uses the hf: prefix — srtctl pre-downloads
-# the 444 GB MXFP8 snapshot once, on a compute node, into the shared
-# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable
-# shared-FS path and bind-mounts it via extra_mount).
-# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
-# benchmark is text-only, so language-model-only skips the vision
-# encoder. max-model-len / cudagraph capture / batched tokens are
-# trimmed to the fixed-seq-len scenario instead of the 1M default.
-
-model:
-  path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
-  container: "vllm/vllm-openai:minimax-m3"
-  precision: "fp8"
-
-dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
-
-# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the
-# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups
-# OOM-kill during engine init; srun_options.mem=0 alone only grants a
-# step what the job already holds. cpus-per-task=72 (one NUMA socket)
-# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default.
-# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml.
-sbatch_directives:
-  mem: "0"
-  cpus-per-task: "72"
-srun_options:
-  mem: "0"
-
-
-slurm:
-  time_limit: "8:00:00"
-
-# 444 GB of weights off shared FS (cold HF cache on the first run):
-# allow up to 2 h for engine readiness.
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-extra_mount:
-  - "__M3_HF_HOME__:__M3_HF_HOME__"
-
-resources:
-  gpu_type: "gb300"
-  gpus_per_node: 4
-  agg_nodes: 2
-  agg_workers: 1
-  gpus_per_agg: 8
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  aggregated_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    HF_HOME: "__M3_HF_HOME__"
-    # Cache-only at runtime: the launcher pre-stages the full snapshot and
-    # verifies it offline before submit, so workers must NOT re-fetch. Without
-    # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache;
-    # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
-    # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
-    HF_HUB_OFFLINE: "1"
-
-  vllm_config:
-    aggregated:
-      tensor-parallel-size: 1
-      pipeline-parallel-size: 1
-      data-parallel-size: 8
-      data-parallel-rpc-port: 13345
-      enable-expert-parallel: true
-      max-model-len: 2304
-      max-num-batched-tokens: 2048
-      max-cudagraph-capture-size: 512
-      block-size: 128
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 128
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "512x1024x2048"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml
deleted file mode 100644
index c156fb59d..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4-1n.yaml
+++ /dev/null
@@ -1,94 +0,0 @@
-name: "minimax-m3-vllm-agg-gb300-tp4-1n-1k1k"
-
-# MiniMax-M3 day-zero aggregated recipe
-# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on
-# day zero: NixlConnector KV transfer has never been exercised against
-# M3's MSA index cache, so disagg shapes are deferred until agg
-# baselines exist. model.path uses the hf: prefix — srtctl pre-downloads
-# the 444 GB MXFP8 snapshot once, on a compute node, into the shared
-# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable
-# shared-FS path and bind-mounts it via extra_mount).
-# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
-# benchmark is text-only, so language-model-only skips the vision
-# encoder. max-model-len / cudagraph capture / batched tokens are
-# trimmed to the fixed-seq-len scenario instead of the 1M default.
-
-model:
-  path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
-  container: "vllm/vllm-openai:minimax-m3"
-  precision: "fp8"
-
-dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
-
-# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the
-# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups
-# OOM-kill during engine init; srun_options.mem=0 alone only grants a
-# step what the job already holds. cpus-per-task=72 (one NUMA socket)
-# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default.
-# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml.
-sbatch_directives:
-  mem: "0"
-  cpus-per-task: "72"
-srun_options:
-  mem: "0"
-
-
-slurm:
-  time_limit: "8:00:00"
-
-# 444 GB of weights off shared FS (cold HF cache on the first run):
-# allow up to 2 h for engine readiness.
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-extra_mount:
-  - "__M3_HF_HOME__:__M3_HF_HOME__"
-
-resources:
-  gpu_type: "gb300"
-  gpus_per_node: 4
-  agg_nodes: 1
-  agg_workers: 1
-  gpus_per_agg: 4
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  aggregated_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    HF_HOME: "__M3_HF_HOME__"
-    # Cache-only at runtime: the launcher pre-stages the full snapshot and
-    # verifies it offline before submit, so workers must NOT re-fetch. Without
-    # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache;
-    # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
-    # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
-    HF_HUB_OFFLINE: "1"
-
-  vllm_config:
-    aggregated:
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
-      max-model-len: 2304
-      max-num-batched-tokens: 2048
-      max-cudagraph-capture-size: 64
-      block-size: 128
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 32
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4x8x16x32x64"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml
deleted file mode 100644
index 0e0b2280f..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp4ep4-1n.yaml
+++ /dev/null
@@ -1,95 +0,0 @@
-name: "minimax-m3-vllm-agg-gb300-tp4ep4-1n-1k1k"
-
-# MiniMax-M3 day-zero aggregated recipe
-# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on
-# day zero: NixlConnector KV transfer has never been exercised against
-# M3's MSA index cache, so disagg shapes are deferred until agg
-# baselines exist. model.path uses the hf: prefix — srtctl pre-downloads
-# the 444 GB MXFP8 snapshot once, on a compute node, into the shared
-# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable
-# shared-FS path and bind-mounts it via extra_mount).
-# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
-# benchmark is text-only, so language-model-only skips the vision
-# encoder. max-model-len / cudagraph capture / batched tokens are
-# trimmed to the fixed-seq-len scenario instead of the 1M default.
-
-model:
-  path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
-  container: "vllm/vllm-openai:minimax-m3"
-  precision: "fp8"
-
-dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
-
-# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the
-# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups
-# OOM-kill during engine init; srun_options.mem=0 alone only grants a
-# step what the job already holds. cpus-per-task=72 (one NUMA socket)
-# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default.
-# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml.
-sbatch_directives:
-  mem: "0"
-  cpus-per-task: "72"
-srun_options:
-  mem: "0"
-
-
-slurm:
-  time_limit: "8:00:00"
-
-# 444 GB of weights off shared FS (cold HF cache on the first run):
-# allow up to 2 h for engine readiness.
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-extra_mount:
-  - "__M3_HF_HOME__:__M3_HF_HOME__"
-
-resources:
-  gpu_type: "gb300"
-  gpus_per_node: 4
-  agg_nodes: 1
-  agg_workers: 1
-  gpus_per_agg: 4
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  aggregated_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    HF_HOME: "__M3_HF_HOME__"
-    # Cache-only at runtime: the launcher pre-stages the full snapshot and
-    # verifies it offline before submit, so workers must NOT re-fetch. Without
-    # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache;
-    # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
-    # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
-    HF_HUB_OFFLINE: "1"
-
-  vllm_config:
-    aggregated:
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
-      enable-expert-parallel: true
-      max-model-len: 2304
-      max-num-batched-tokens: 2048
-      max-cudagraph-capture-size: 512
-      block-size: 128
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 32
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "128x256x512"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml
deleted file mode 100644
index 5884b7f6b..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/agg-gb300-tp8-2n.yaml
+++ /dev/null
@@ -1,94 +0,0 @@
-name: "minimax-m3-vllm-agg-gb300-tp8-2n-1k1k"
-
-# MiniMax-M3 day-zero aggregated recipe
-# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on
-# day zero: NixlConnector KV transfer has never been exercised against
-# M3's MSA index cache, so disagg shapes are deferred until agg
-# baselines exist. model.path uses the hf: prefix — srtctl pre-downloads
-# the 444 GB MXFP8 snapshot once, on a compute node, into the shared
-# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable
-# shared-FS path and bind-mounts it via extra_mount).
-# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
-# benchmark is text-only, so language-model-only skips the vision
-# encoder. max-model-len / cudagraph capture / batched tokens are
-# trimmed to the fixed-seq-len scenario instead of the 1M default.
-
-model:
-  path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
-  container: "vllm/vllm-openai:minimax-m3"
-  precision: "fp8"
-
-dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
-
-# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the
-# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups
-# OOM-kill during engine init; srun_options.mem=0 alone only grants a
-# step what the job already holds. cpus-per-task=72 (one NUMA socket)
-# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default.
-# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml.
-sbatch_directives:
-  mem: "0"
-  cpus-per-task: "72"
-srun_options:
-  mem: "0"
-
-
-slurm:
-  time_limit: "8:00:00"
-
-# 444 GB of weights off shared FS (cold HF cache on the first run):
-# allow up to 2 h for engine readiness.
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-extra_mount:
-  - "__M3_HF_HOME__:__M3_HF_HOME__"
-
-resources:
-  gpu_type: "gb300"
-  gpus_per_node: 4
-  agg_nodes: 2
-  agg_workers: 1
-  gpus_per_agg: 8
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  aggregated_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    HF_HOME: "__M3_HF_HOME__"
-    # Cache-only at runtime: the launcher pre-stages the full snapshot and
-    # verifies it offline before submit, so workers must NOT re-fetch. Without
-    # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache;
-    # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
-    # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
-    HF_HUB_OFFLINE: "1"
-
-  vllm_config:
-    aggregated:
-      tensor-parallel-size: 8
-      pipeline-parallel-size: 1
-      max-model-len: 2304
-      max-num-batched-tokens: 2048
-      max-cudagraph-capture-size: 64
-      block-size: 128
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 32
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4x8x16x32x64"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p-dep16dec-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p-dep16dec-5n.yaml
new file mode 100644
index 000000000..434007451
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p-dep16dec-5n.yaml
@@ -0,0 +1,112 @@
+name: "minimax-m3-vllm-disagg-gb300-1p-dep16dec-5n-1k1k"
+
+# Combined fix + scale-up: MNNVL KV transfer ON + wide-EP decode.
+# Iteration 2 showed DP16 decode starving because cross-node KV ran over
+# RDMA/TCP (~40-120 MB/s, decode idle). Iteration 3 showed that setting
+# UCX_CUDA_IPC_ENABLE_MNNVL=y plus --enable-cumem-allocator lifts 1P1D KV
+# transfer 5-6x (+17-23% tok/s/gpu). This recipe combines both: one compact
+# TP4+EP4 prefill -> NixlConnector over multi-node NVLink -> wide DP16+EP
+# decode (16 GPUs / 4 nodes). It tests whether fast MNNVL KV transfer makes
+# the wide decode pool (which B300 cannot build on one NVLink island) pay off.
+# --block-size 128 mandatory (MSA cache); text-only -> language-model-only.
+
+model:
+  path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+sbatch_directives:
+  mem: "0"
+  cpus-per-task: "72"
+srun_options:
+  mem: "0"
+
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+extra_mount:
+  - "__M3_HF_HOME__:__M3_HF_HOME__"
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 4
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 16
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    HF_HOME: "__M3_HF_HOME__"
+    HF_HUB_OFFLINE: "1"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    HF_HOME: "__M3_HF_HOME__"
+    HF_HUB_OFFLINE: "1"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      enable-cumem-allocator: true
+      enforce-eager: true
+      max-model-len: 2304
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 16
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      enable-cumem-allocator: true
+      max-model-len: 2304
+      max-num-seqs: 256
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "512x1024x2048"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml
index 133b54846..f4bd604f6 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml
@@ -66,6 +66,10 @@ backend:
     # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
     # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
     HF_HUB_OFFLINE: "1"
+    # Multi-node-NVLink KV transfer: keep cross-node prefill->decode KV on the
+    # NVL fabric (5-6x vs RDMA/TCP fallback, +17-49% tok/s/gpu). Needs VMM-
+    # registered KV (enable-cumem-allocator below).
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
 
   decode_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
@@ -77,6 +81,10 @@ backend:
     # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
     # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
     HF_HUB_OFFLINE: "1"
+    # Multi-node-NVLink KV transfer: keep cross-node prefill->decode KV on the
+    # NVL fabric (5-6x vs RDMA/TCP fallback, +17-49% tok/s/gpu). Needs VMM-
+    # registered KV (enable-cumem-allocator below).
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
 
   vllm_config:
     prefill:
@@ -84,6 +92,7 @@ backend:
       tensor-parallel-size: 4
       pipeline-parallel-size: 1
       enable-expert-parallel: true
+      enable-cumem-allocator: true
       enforce-eager: true
       max-model-len: 2304
       max-num-seqs: 16
@@ -100,6 +109,7 @@ backend:
       tensor-parallel-size: 4
       pipeline-parallel-size: 1
       enable-expert-parallel: true
+      enable-cumem-allocator: true
       max-model-len: 2304
       max-num-seqs: 256
       max-num-batched-tokens: 256
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml
index d2a98b286..8e13b522f 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml
@@ -64,6 +64,10 @@ backend:
     # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
     # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
     HF_HUB_OFFLINE: "1"
+    # Multi-node-NVLink KV transfer: keep cross-node prefill->decode KV on the
+    # NVL fabric (5-6x vs RDMA/TCP fallback, +17-49% tok/s/gpu). Needs VMM-
+    # registered KV (enable-cumem-allocator below).
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
 
   decode_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
@@ -75,6 +79,10 @@ backend:
     # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
     # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
     HF_HUB_OFFLINE: "1"
+    # Multi-node-NVLink KV transfer: keep cross-node prefill->decode KV on the
+    # NVL fabric (5-6x vs RDMA/TCP fallback, +17-49% tok/s/gpu). Needs VMM-
+    # registered KV (enable-cumem-allocator below).
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
 
   vllm_config:
     prefill:
@@ -82,6 +90,7 @@ backend:
       tensor-parallel-size: 4
       pipeline-parallel-size: 1
       enable-expert-parallel: true
+      enable-cumem-allocator: true
       enforce-eager: true
       max-model-len: 2304
       max-num-seqs: 16
@@ -98,6 +107,7 @@ backend:
       tensor-parallel-size: 4
       pipeline-parallel-size: 1
       enable-expert-parallel: true
+      enable-cumem-allocator: true
       max-model-len: 2304
       max-num-seqs: 256
       max-num-batched-tokens: 256
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml
deleted file mode 100644
index 5d2330626..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-dep8-2n.yaml
+++ /dev/null
@@ -1,97 +0,0 @@
-name: "minimax-m3-vllm-agg-gb300-dep8-2n-8k1k"
-
-# MiniMax-M3 day-zero aggregated recipe
-# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on
-# day zero: NixlConnector KV transfer has never been exercised against
-# M3's MSA index cache, so disagg shapes are deferred until agg
-# baselines exist. model.path uses the hf: prefix — srtctl pre-downloads
-# the 444 GB MXFP8 snapshot once, on a compute node, into the shared
-# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable
-# shared-FS path and bind-mounts it via extra_mount).
-# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
-# benchmark is text-only, so language-model-only skips the vision
-# encoder. max-model-len / cudagraph capture / batched tokens are
-# trimmed to the fixed-seq-len scenario instead of the 1M default.
-
-model:
-  path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
-  container: "vllm/vllm-openai:minimax-m3"
-  precision: "fp8"
-
-dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
-
-# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the
-# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups
-# OOM-kill during engine init; srun_options.mem=0 alone only grants a
-# step what the job already holds. cpus-per-task=72 (one NUMA socket)
-# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default.
-# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml.
-sbatch_directives:
-  mem: "0"
-  cpus-per-task: "72"
-srun_options:
-  mem: "0"
-
-
-slurm:
-  time_limit: "8:00:00"
-
-# 444 GB of weights off shared FS (cold HF cache on the first run):
-# allow up to 2 h for engine readiness.
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-extra_mount:
-  - "__M3_HF_HOME__:__M3_HF_HOME__"
-
-resources:
-  gpu_type: "gb300"
-  gpus_per_node: 4
-  agg_nodes: 2
-  agg_workers: 1
-  gpus_per_agg: 8
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  aggregated_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    HF_HOME: "__M3_HF_HOME__"
-    # Cache-only at runtime: the launcher pre-stages the full snapshot and
-    # verifies it offline before submit, so workers must NOT re-fetch. Without
-    # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache;
-    # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
-    # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
-    HF_HUB_OFFLINE: "1"
-
-  vllm_config:
-    aggregated:
-      tensor-parallel-size: 1
-      pipeline-parallel-size: 1
-      data-parallel-size: 8
-      data-parallel-rpc-port: 13345
-      enable-expert-parallel: true
-      max-model-len: 9472
-      max-num-batched-tokens: 16384
-      max-cudagraph-capture-size: 256
-      block-size: 128
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 128
-
-benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "256x512x1024"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml
deleted file mode 100644
index d32879cbd..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4-1n.yaml
+++ /dev/null
@@ -1,94 +0,0 @@
-name: "minimax-m3-vllm-agg-gb300-tp4-1n-8k1k"
-
-# MiniMax-M3 day-zero aggregated recipe
-# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on
-# day zero: NixlConnector KV transfer has never been exercised against
-# M3's MSA index cache, so disagg shapes are deferred until agg
-# baselines exist. model.path uses the hf: prefix — srtctl pre-downloads
-# the 444 GB MXFP8 snapshot once, on a compute node, into the shared
-# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable
-# shared-FS path and bind-mounts it via extra_mount).
-# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
-# benchmark is text-only, so language-model-only skips the vision
-# encoder. max-model-len / cudagraph capture / batched tokens are
-# trimmed to the fixed-seq-len scenario instead of the 1M default.
-
-model:
-  path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
-  container: "vllm/vllm-openai:minimax-m3"
-  precision: "fp8"
-
-dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
-
-# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the
-# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups
-# OOM-kill during engine init; srun_options.mem=0 alone only grants a
-# step what the job already holds. cpus-per-task=72 (one NUMA socket)
-# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default.
-# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml.
-sbatch_directives:
-  mem: "0"
-  cpus-per-task: "72"
-srun_options:
-  mem: "0"
-
-
-slurm:
-  time_limit: "8:00:00"
-
-# 444 GB of weights off shared FS (cold HF cache on the first run):
-# allow up to 2 h for engine readiness.
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-extra_mount:
-  - "__M3_HF_HOME__:__M3_HF_HOME__"
-
-resources:
-  gpu_type: "gb300"
-  gpus_per_node: 4
-  agg_nodes: 1
-  agg_workers: 1
-  gpus_per_agg: 4
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  aggregated_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    HF_HOME: "__M3_HF_HOME__"
-    # Cache-only at runtime: the launcher pre-stages the full snapshot and
-    # verifies it offline before submit, so workers must NOT re-fetch. Without
-    # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache;
-    # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
-    # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
-    HF_HUB_OFFLINE: "1"
-
-  vllm_config:
-    aggregated:
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
-      max-model-len: 9472
-      max-num-batched-tokens: 16384
-      max-cudagraph-capture-size: 64
-      block-size: 128
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 32
-
-benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "4x8x16x32x64"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml
deleted file mode 100644
index 06c01619f..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/agg-gb300-tp4ep4-1n.yaml
+++ /dev/null
@@ -1,95 +0,0 @@
-name: "minimax-m3-vllm-agg-gb300-tp4ep4-1n-8k1k"
-
-# MiniMax-M3 day-zero aggregated recipe
-# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on
-# day zero: NixlConnector KV transfer has never been exercised against
-# M3's MSA index cache, so disagg shapes are deferred until agg
-# baselines exist. model.path uses the hf: prefix — srtctl pre-downloads
-# the 444 GB MXFP8 snapshot once, on a compute node, into the shared
-# HF_HOME below (launcher seds __M3_HF_HOME__ to a cluster-writable
-# shared-FS path and bind-mounts it via extra_mount).
-# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
-# benchmark is text-only, so language-model-only skips the vision
-# encoder. max-model-len / cudagraph capture / batched tokens are
-# trimmed to the fixed-seq-len scenario instead of the 1M default.
-
-model:
-  path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
-  container: "vllm/vllm-openai:minimax-m3"
-  precision: "fp8"
-
-dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
-
-# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the
-# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups
-# OOM-kill during engine init; srun_options.mem=0 alone only grants a
-# step what the job already holds. cpus-per-task=72 (one NUMA socket)
-# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default.
-# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml.
-sbatch_directives:
-  mem: "0"
-  cpus-per-task: "72"
-srun_options:
-  mem: "0"
-
-
-slurm:
-  time_limit: "8:00:00"
-
-# 444 GB of weights off shared FS (cold HF cache on the first run):
-# allow up to 2 h for engine readiness.
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-extra_mount:
-  - "__M3_HF_HOME__:__M3_HF_HOME__"
-
-resources:
-  gpu_type: "gb300"
-  gpus_per_node: 4
-  agg_nodes: 1
-  agg_workers: 1
-  gpus_per_agg: 4
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  aggregated_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    HF_HOME: "__M3_HF_HOME__"
-    # Cache-only at runtime: the launcher pre-stages the full snapshot and
-    # verifies it offline before submit, so workers must NOT re-fetch. Without
-    # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache;
-    # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
-    # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
-    HF_HUB_OFFLINE: "1"
-
-  vllm_config:
-    aggregated:
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
-      enable-expert-parallel: true
-      max-model-len: 9472
-      max-num-batched-tokens: 16384
-      max-cudagraph-capture-size: 512
-      block-size: 128
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 32
-
-benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "128x256x512"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml
index 0c671cd62..843f47951 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml
@@ -66,6 +66,10 @@ backend:
     # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
     # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
     HF_HUB_OFFLINE: "1"
+    # Multi-node-NVLink KV transfer: keep cross-node prefill->decode KV on the
+    # NVL fabric (5-6x vs RDMA/TCP fallback, +17-49% tok/s/gpu). Needs VMM-
+    # registered KV (enable-cumem-allocator below).
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
 
   decode_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
@@ -77,6 +81,10 @@ backend:
     # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
     # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
     HF_HUB_OFFLINE: "1"
+    # Multi-node-NVLink KV transfer: keep cross-node prefill->decode KV on the
+    # NVL fabric (5-6x vs RDMA/TCP fallback, +17-49% tok/s/gpu). Needs VMM-
+    # registered KV (enable-cumem-allocator below).
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
 
   vllm_config:
     prefill:
@@ -84,6 +92,7 @@ backend:
       tensor-parallel-size: 4
       pipeline-parallel-size: 1
       enable-expert-parallel: true
+      enable-cumem-allocator: true
       enforce-eager: true
       max-model-len: 9472
       max-num-seqs: 16
@@ -100,6 +109,7 @@ backend:
       tensor-parallel-size: 4
       pipeline-parallel-size: 1
       enable-expert-parallel: true
+      enable-cumem-allocator: true
       max-model-len: 9472
       max-num-seqs: 256
       max-num-batched-tokens: 256

From 7fd890415772d9cbb4b4ecd37465b67c29368fc3 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Sat, 13 Jun 2026 14:47:31 -0700
Subject: [PATCH 08/14] M3 GB300: add 8k1k disagg sweep; drop unschedulable
 collocated-1n
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The collocated-1n topology (disagg-gb300-1p1d-tp4ep4-1n) declared
gpus_per_node: 8, but gb300-cw nodes have 4 GPUs — sbatch rejects it with
"Requested node configuration is not available" even on a fully idle
cluster (confirmed: fails standalone with 28 nodes free; the split-2n and
wide-decode at gpus_per_node 4 schedule fine). It was an 8-GPU-node
template artifact that never reached sbatch before. Remove it (the 1k1k
recipe is deleted; the 8k1k one is renamed and reworked into the split-2n
recipe) and let the split-2n cover the low-latency end (1k1k conc extended
down to 8).

Add the 8k1k (isl 8192) scenario mirroring 1k1k with the two valid disagg
shapes (split-2n + wide DP16 decode), MNNVL KV transfer on both, seq params
retuned for long context (max-model-len 9472) and lower concurrency.
---
 .github/configs/nvidia-master.yaml            |  47 +++++--
 .../1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml     | 128 ------------------
 .../8k1k/disagg-gb300-1p-dep16dec-5n.yaml     | 112 +++++++++++++++
 ....yaml => disagg-gb300-1p1d-tp4ep4-2n.yaml} |  14 +-
 4 files changed, 151 insertions(+), 150 deletions(-)
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p-dep16dec-5n.yaml
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/{disagg-gb300-1p1d-tp4ep4-1n.yaml => disagg-gb300-1p1d-tp4ep4-2n.yaml} (91%)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 7abaf7160..d349b9ed5 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -11790,36 +11790,57 @@ minimaxm3-fp8-gb300-dynamo-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-      # Disagg-only sweep. Every recipe enables NixlConnector KV transfer over
+      # Disagg-only. Every recipe enables NixlConnector KV transfer over
       # multi-node NVLink (UCX_CUDA_IPC_ENABLE_MNNVL=y + --enable-cumem-allocator),
       # which moved the cross-node prefill->decode KV handoff off the RDMA/TCP
-      # fallback (~268 MB/s) onto the NVL fabric (~1.4-1.7 GB/s) for +17-49%
-      # tok/s/gpu on M3 — a GB300-only win (B300's 8-GPU IB islands cannot move
-      # KV over multi-node NVLink).
-      # Low / mid latency: 1P+1D TP4+EP4 collocated, 1 node (8 GPU).
-      - conc-list: [8, 16, 32, 64, 128, 256]
+      # fallback (~268 MB/s) onto the NVL fabric (~1.4-1.7 GB/s): +17/+23/+49%
+      # out tok/s/gpu @ conc 64/128/256 (1P1D). GB300-only win — B300's 8-GPU
+      # IB islands cannot move KV over multi-node NVLink. GB300-cw is 4 GPU/node,
+      # so prefill and decode each take whole 4-GPU nodes (no 8-GPU collocation).
+      # 1P+1D TP4+EP4 split, 2 nodes (8 GPU) - low -> mid.
+      - conc-list: [8, 16, 32, 64, 128, 256, 512]
         prefill:
           num-worker: 1
           tp: 4
           ep: 4
           dp-attn: false
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml"
         decode:
           num-worker: 1
           tp: 4
           ep: 4
           dp-attn: false
 
-      # Mid throughput: 1P+1D TP4+EP4 split across 2 nodes (8 GPU).
-      - conc-list: [64, 128, 256, 512]
+      # Max throughput: 1 prefill (TP4+EP4) + wide DP16+EP decode, 5 nodes (20 GPU). Decode keeps scaling on the NVL fabric where 1P1D saturates: ~1213 vs ~810 out tok/s/gpu @ conc 1024.
+      - conc-list: [512, 1024, 2048]
         prefill:
           num-worker: 1
           tp: 4
           ep: 4
           dp-attn: false
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p-dep16dec-5n.yaml"
+        decode:
+          num-worker: 1
+          tp: 1
+          ep: 16
+          dp-attn: true
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # 8k1k long context: prefill is heavier and KV is larger, so concurrency
+      # is lower than 1k1k. Same disagg shapes + multi-node-NVLink KV transfer.
+      # 1P+1D TP4+EP4 split, 2 nodes (8 GPU).
+      - conc-list: [16, 32, 64, 128, 256]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml"
         decode:
           num-worker: 1
           tp: 4
@@ -11827,16 +11848,14 @@ minimaxm3-fp8-gb300-dynamo-vllm:
           dp-attn: false
 
       # Max throughput: 1 prefill (TP4+EP4) + wide DP16+EP decode, 5 nodes (20 GPU).
-      # Decode keeps scaling on the NVL fabric where 1P1D saturates:
-      # ~1213 out tok/s/gpu @ conc 1024 vs the 1P1D ~810 plateau.
-      - conc-list: [512, 1024, 2048]
+      - conc-list: [256, 512, 1024]
         prefill:
           num-worker: 1
           tp: 4
           ep: 4
           dp-attn: false
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p-dep16dec-5n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p-dep16dec-5n.yaml"
         decode:
           num-worker: 1
           tp: 1
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml
deleted file mode 100644
index f4bd604f6..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml
+++ /dev/null
@@ -1,128 +0,0 @@
-name: "minimax-m3-vllm-disagg-gb300-1p1d-tp4ep4-1k1k"
-
-# MiniMax-M3 disaggregated 1P+1D recipe for GB300, collocated on 1 node.
-# 8 GPUs per GB300 node: 4 for prefill, 4 for decode.
-# Prefill (TP4+EP4) → NixlConnector → Decode (TP4+EP4).
-# --block-size 128 is mandatory (MSA sparse/index cache alignment).
-
-model:
-  path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
-  container: "vllm/vllm-openai:minimax-m3"
-  precision: "fp8"
-
-dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
-
-# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the
-# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups
-# OOM-kill during engine init; srun_options.mem=0 alone only grants a
-# step what the job already holds. cpus-per-task=72 (one NUMA socket)
-# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default.
-# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml.
-sbatch_directives:
-  mem: "0"
-  cpus-per-task: "72"
-srun_options:
-  mem: "0"
-
-
-slurm:
-  time_limit: "8:00:00"
-
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-extra_mount:
-  - "__M3_HF_HOME__:__M3_HF_HOME__"
-
-resources:
-  gpu_type: "gb300"
-  gpus_per_node: 8
-  prefill_nodes: 1
-  decode_nodes: 1
-  prefill_workers: 1
-  decode_workers: 1
-  gpus_per_prefill: 4
-  gpus_per_decode: 4
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-  allow_prefill_decode_colocation: true
-
-  prefill_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    HF_HOME: "__M3_HF_HOME__"
-    # Cache-only at runtime: the launcher pre-stages the full snapshot and
-    # verifies it offline before submit, so workers must NOT re-fetch. Without
-    # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache;
-    # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
-    # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
-    HF_HUB_OFFLINE: "1"
-    # Multi-node-NVLink KV transfer: keep cross-node prefill->decode KV on the
-    # NVL fabric (5-6x vs RDMA/TCP fallback, +17-49% tok/s/gpu). Needs VMM-
-    # registered KV (enable-cumem-allocator below).
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-
-  decode_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    HF_HOME: "__M3_HF_HOME__"
-    # Cache-only at runtime: the launcher pre-stages the full snapshot and
-    # verifies it offline before submit, so workers must NOT re-fetch. Without
-    # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache;
-    # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
-    # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
-    HF_HUB_OFFLINE: "1"
-    # Multi-node-NVLink KV transfer: keep cross-node prefill->decode KV on the
-    # NVL fabric (5-6x vs RDMA/TCP fallback, +17-49% tok/s/gpu). Needs VMM-
-    # registered KV (enable-cumem-allocator below).
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-
-  vllm_config:
-    prefill:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
-      enable-expert-parallel: true
-      enable-cumem-allocator: true
-      enforce-eager: true
-      max-model-len: 2304
-      max-num-seqs: 16
-      max-num-batched-tokens: 16384
-      block-size: 128
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 32
-
-    decode:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
-      enable-expert-parallel: true
-      enable-cumem-allocator: true
-      max-model-len: 2304
-      max-num-seqs: 256
-      max-num-batched-tokens: 256
-      max-cudagraph-capture-size: 512
-      block-size: 128
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 32
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "64x128x256x512"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p-dep16dec-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p-dep16dec-5n.yaml
new file mode 100644
index 000000000..2771039e8
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p-dep16dec-5n.yaml
@@ -0,0 +1,112 @@
+name: "minimax-m3-vllm-disagg-gb300-1p-dep16dec-5n-8k1k"
+
+# Combined fix + scale-up: MNNVL KV transfer ON + wide-EP decode.
+# Iter 2 showed DP16 decode starved because cross-node KV ran on RDMA/TCP
+# (~40-120 MB/s, decode idle). Iter 3 showed UCX_CUDA_IPC_ENABLE_MNNVL=y +
+# --enable-cumem-allocator lifts 1P1D KV transfer 5-6x (+17-23% tok/s/gpu).
+# This recipe puts both together: 1 compact TP4 prefill -> NixlConnector
+# over multi-node NVLink -> wide DP16+EP decode (16 GPUs / 4 nodes). Tests
+# whether fast MNNVL KV transfer makes the wide decode pool (which B300
+# cannot build on one NVLink island) actually pay off.
+# --block-size 128 mandatory (MSA cache); text-only -> language-model-only.
+
+model:
+  path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+sbatch_directives:
+  mem: "0"
+  cpus-per-task: "72"
+srun_options:
+  mem: "0"
+
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+extra_mount:
+  - "__M3_HF_HOME__:__M3_HF_HOME__"
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 4
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 16
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    HF_HOME: "__M3_HF_HOME__"
+    HF_HUB_OFFLINE: "1"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    HF_HOME: "__M3_HF_HOME__"
+    HF_HUB_OFFLINE: "1"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      enable-cumem-allocator: true
+      enforce-eager: true
+      max-model-len: 9472
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 16
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      enable-cumem-allocator: true
+      max-model-len: 9472
+      max-num-seqs: 256
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "256x512x1024"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml
similarity index 91%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml
index 843f47951..1e0d67e3c 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml
@@ -1,9 +1,9 @@
-name: "minimax-m3-vllm-disagg-gb300-1p1d-tp4ep4-8k1k"
+name: "minimax-m3-vllm-disagg-gb300-1p1d-tp4ep4-2n-8k1k"
 
-# MiniMax-M3 disaggregated 1P+1D recipe for GB300, collocated on 1 node.
-# 8 GPUs per GB300 node: 4 for prefill, 4 for decode.
-# Prefill (TP4+EP4) → NixlConnector → Decode (TP4+EP4).
+# MiniMax-M3 disaggregated 1P+1D recipe for GB300, 2 nodes.
+# 1 node prefill (4 GPUs, TP4+EP4) → NixlConnector → 1 node decode (4 GPUs, TP4+EP4).
 # --block-size 128 is mandatory (MSA sparse/index cache alignment).
+# safetensors-load-strategy omitted — prefetch OOMs on CW GB300 host memory.
 
 model:
   path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
@@ -26,7 +26,6 @@ sbatch_directives:
 srun_options:
   mem: "0"
 
-
 slurm:
   time_limit: "8:00:00"
 
@@ -39,7 +38,7 @@ extra_mount:
 
 resources:
   gpu_type: "gb300"
-  gpus_per_node: 8
+  gpus_per_node: 4
   prefill_nodes: 1
   decode_nodes: 1
   prefill_workers: 1
@@ -54,7 +53,6 @@ frontend:
 backend:
   type: vllm
   connector: null
-  allow_prefill_decode_colocation: true
 
   prefill_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
@@ -125,4 +123,4 @@ benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: "64x128x256"
+  concurrencies: "16x32x64x128x256"

From 5df0669a3e04ea4427cb92c66691c576940f706a Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Sat, 13 Jun 2026 18:53:25 -0700
Subject: [PATCH 09/14] M3 GB300: add rack-saturating balanced-ratio TP-ep1
 max-throughput disagg config

Adds a 17-node (full-rack) disagg topology to the M3 GB300 sweep (1k1k +
8k1k) from on-cluster tuning (gb300-cw):

- PREFILL is the binding bottleneck, not decode width or KV transfer:
  a single prefill worker left ~3967 reqs queued and starved 64 decode
  GPUs. Balancing to 5 prefill : 12 decode (TP4) cleared the backlog and
  lifted throughput +57% (535 -> 843 out tok/s/gpu @ conc 2048).
- TP-only decode (ep1, no expert parallelism) per the Qwen3.5-397B-A17B
  recipes (closest M3 analog); M3 wide-EP/DP-attention all-to-all was
  slower and DP32 < DP16 per-GPU.
- Kept the existing 1p1d (low/mid latency) and dep16dec (wide-decode)
  topologies so CI measures the full Pareto rather than replacing them.

NixlConnector KV transfer stays on multi-node NVLink (MNNVL + cumem);
note KV transfer was verified NOT to bottleneck throughput (doubling its
bandwidth via num_threads changed end-to-end tok/s/gpu by ~0). Recipe
YAMLs line up 1:1 with the nvidia-master.yaml CONFIG_FILE references.
---
 .github/configs/nvidia-master.yaml            |  34 ++++++
 .../1k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml   | 102 ++++++++++++++++++
 .../8k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml   | 102 ++++++++++++++++++
 3 files changed, 238 insertions(+)
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index d349b9ed5..fd235392b 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -11827,6 +11827,23 @@ minimaxm3-fp8-gb300-dynamo-vllm:
           ep: 16
           dp-attn: true
 
+      # Rack-saturating max throughput: 5 prefill + 12 decode, TP4 ep1 (17 nodes).
+      # Balanced prefill:decode ratio (single prefill starved the decode pool)
+      # + TP-only decode (Qwen3.5-A17B pattern; M3 wide-EP all-to-all was slower).
+      - conc-list: [2048, 4096, 8192]
+        prefill:
+          num-worker: 5
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml"
+        decode:
+          num-worker: 12
+          tp: 4
+          ep: 1
+          dp-attn: false
+
     - isl: 8192
       osl: 1024
       search-space:
@@ -11862,6 +11879,23 @@ minimaxm3-fp8-gb300-dynamo-vllm:
           ep: 16
           dp-attn: true
 
+      # Rack-saturating max throughput: 5 prefill + 12 decode, TP4 ep1 (17 nodes).
+      # Balanced prefill:decode ratio (single prefill starved the decode pool)
+      # + TP-only decode (Qwen3.5-A17B pattern; M3 wide-EP all-to-all was slower).
+      - conc-list: [1024, 2048, 4096]
+        prefill:
+          num-worker: 5
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml"
+        decode:
+          num-worker: 12
+          tp: 4
+          ep: 1
+          dp-attn: false
+
 # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
 # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
 # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml
new file mode 100644
index 000000000..eb39f71f9
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml
@@ -0,0 +1,102 @@
+name: "minimax-m3-vllm-disagg-gb300-5p12d-tp4ep1-1k1k"
+
+# MiniMax-M3 GB300 max-throughput disagg, rack-saturating (17 nodes / 68 GPU).
+# Lessons baked in (validated on gb300-cw): (1) TP-only decode (ep1, NO expert
+# parallelism) per Qwen3.5-397B-A17B recipes — M3's MoE all-to-all overhead
+# made wide-EP/DP-attention slower; (2) PREFILL is the bottleneck, not decode
+# or KV transfer — a single prefill starved 64 decode GPUs (3967 reqs queued),
+# so this balances 5 prefill : 12 decode TP4 workers (prefill backlog -> 0,
+# +57% tok/s/gpu vs the 1-prefill ratio at conc 2048). NixlConnector KV
+# transfer stays on multi-node NVLink (MNNVL + cumem). --block-size 128
+# mandatory (MSA cache); text-only -> language-model-only.
+
+model:
+  path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+sbatch_directives:
+  mem: "0"
+  cpus-per-task: "72"
+srun_options:
+  mem: "0"
+slurm:
+  time_limit: "8:00:00"
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+extra_mount:
+  - "__M3_HF_HOME__:__M3_HF_HOME__"
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 5
+  decode_nodes: 12
+  prefill_workers: 5
+  decode_workers: 12
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    HF_HOME: "__M3_HF_HOME__"
+    HF_HUB_OFFLINE: "1"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    HF_HOME: "__M3_HF_HOME__"
+    HF_HUB_OFFLINE: "1"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-cumem-allocator: true
+      enforce-eager: true
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      max-model-len: 2304
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-cumem-allocator: true
+      max-num-seqs: 256
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 512
+      max-model-len: 2304
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "2048x4096x8192"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml
new file mode 100644
index 000000000..7cd84fbf1
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml
@@ -0,0 +1,102 @@
+name: "minimax-m3-vllm-disagg-gb300-5p12d-tp4ep1-8k1k"
+
+# MiniMax-M3 GB300 max-throughput disagg, rack-saturating (17 nodes / 68 GPU).
+# Lessons baked in (validated on gb300-cw): (1) TP-only decode (ep1, NO expert
+# parallelism) per Qwen3.5-397B-A17B recipes — M3's MoE all-to-all overhead
+# made wide-EP/DP-attention slower; (2) PREFILL is the bottleneck, not decode
+# or KV transfer — a single prefill starved 64 decode GPUs (3967 reqs queued),
+# so this balances 5 prefill : 12 decode TP4 workers (prefill backlog -> 0,
+# +57% tok/s/gpu vs the 1-prefill ratio at conc 2048). NixlConnector KV
+# transfer stays on multi-node NVLink (MNNVL + cumem). --block-size 128
+# mandatory (MSA cache); text-only -> language-model-only.
+
+model:
+  path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+sbatch_directives:
+  mem: "0"
+  cpus-per-task: "72"
+srun_options:
+  mem: "0"
+slurm:
+  time_limit: "8:00:00"
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+extra_mount:
+  - "__M3_HF_HOME__:__M3_HF_HOME__"
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 5
+  decode_nodes: 12
+  prefill_workers: 5
+  decode_workers: 12
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    HF_HOME: "__M3_HF_HOME__"
+    HF_HUB_OFFLINE: "1"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    HF_HOME: "__M3_HF_HOME__"
+    HF_HUB_OFFLINE: "1"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-cumem-allocator: true
+      enforce-eager: true
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      max-model-len: 9472
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-cumem-allocator: true
+      max-num-seqs: 256
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 512
+      max-model-len: 9472
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1024x2048x4096"

From 62fe18d3784f89f4ff2271a6da34cdd7dddbbe14 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Sat, 13 Jun 2026 22:38:01 -0700
Subject: [PATCH 10/14] M3 GB300: replace dep16dec with 1P4D TP4-ep1; add
 prefill-heavy 10P7D for 8k1k
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Following the DSR1 GB300 pattern: M3's MoE all-to-all overhead makes
wide-EP decode slower, so independent TP4 decode workers are strictly
better. Also, 8k1k is prefill-bound (616-req backlog at 5P:12D), so add
a prefill-heavy 10P:7D topology per DSR1/DSV4's long-context ratios.

Changes:
- Replace dep16dec (EP16 single decode) with 1P+4D (4x TP4 ep1 decode)
  for both 1k1k and 8k1k, same 5 nodes
- Add 10P+7D TP4 ep1 (17 nodes) for 8k1k max throughput
- Tighten concurrency ranges in nvidia-master.yaml:
  1k1k: 1P1D [4-32], 1P4D [64-512], 5P12D [2048-8192] (unchanged)
  8k1k: 1P1D [4-16], 1P4D [32-256], 5P12D/10P7D [1024-4096]
---
 .github/configs/nvidia-master.yaml            | 64 +++++++-----
 ....yaml => disagg-gb300-1p4d-tp4ep1-5n.yaml} | 38 +++----
 .../8k1k/disagg-gb300-10p7d-tp4ep1-17n.yaml   | 99 +++++++++++++++++++
 ....yaml => disagg-gb300-1p4d-tp4ep1-5n.yaml} | 38 +++----
 4 files changed, 164 insertions(+), 75 deletions(-)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/{disagg-gb300-1p-dep16dec-5n.yaml => disagg-gb300-1p4d-tp4ep1-5n.yaml} (68%)
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-10p7d-tp4ep1-17n.yaml
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/{disagg-gb300-1p-dep16dec-5n.yaml => disagg-gb300-1p4d-tp4ep1-5n.yaml} (68%)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index fd235392b..0fa0079b3 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -11797,8 +11797,8 @@ minimaxm3-fp8-gb300-dynamo-vllm:
       # out tok/s/gpu @ conc 64/128/256 (1P1D). GB300-only win — B300's 8-GPU
       # IB islands cannot move KV over multi-node NVLink. GB300-cw is 4 GPU/node,
       # so prefill and decode each take whole 4-GPU nodes (no 8-GPU collocation).
-      # 1P+1D TP4+EP4 split, 2 nodes (8 GPU) - low -> mid.
-      - conc-list: [8, 16, 32, 64, 128, 256, 512]
+      # Low latency: 1P+1D TP4+EP4 split, 2 nodes (8 GPU).
+      - conc-list: [4, 8, 16, 32]
         prefill:
           num-worker: 1
           tp: 4
@@ -11812,20 +11812,22 @@ minimaxm3-fp8-gb300-dynamo-vllm:
           ep: 4
           dp-attn: false
 
-      # Max throughput: 1 prefill (TP4+EP4) + wide DP16+EP decode, 5 nodes (20 GPU). Decode keeps scaling on the NVL fabric where 1P1D saturates: ~1213 vs ~810 out tok/s/gpu @ conc 1024.
-      - conc-list: [512, 1024, 2048]
+      # Mid curve: 1P + 4D, TP4 ep1, 5 nodes (20 GPU). DSR1 pattern: multiple
+      # independent TP4 decode workers instead of wide-EP decode (M3's MoE
+      # all-to-all makes wide EP slower).
+      - conc-list: [64, 128, 256, 512]
         prefill:
           num-worker: 1
           tp: 4
-          ep: 4
+          ep: 1
           dp-attn: false
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p-dep16dec-5n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml"
         decode:
-          num-worker: 1
-          tp: 1
-          ep: 16
-          dp-attn: true
+          num-worker: 4
+          tp: 4
+          ep: 1
+          dp-attn: false
 
       # Rack-saturating max throughput: 5 prefill + 12 decode, TP4 ep1 (17 nodes).
       # Balanced prefill:decode ratio (single prefill starved the decode pool)
@@ -11849,8 +11851,8 @@ minimaxm3-fp8-gb300-dynamo-vllm:
       search-space:
       # 8k1k long context: prefill is heavier and KV is larger, so concurrency
       # is lower than 1k1k. Same disagg shapes + multi-node-NVLink KV transfer.
-      # 1P+1D TP4+EP4 split, 2 nodes (8 GPU).
-      - conc-list: [16, 32, 64, 128, 256]
+      # Low latency: 1P+1D TP4+EP4 split, 2 nodes (8 GPU).
+      - conc-list: [4, 8, 16]
         prefill:
           num-worker: 1
           tp: 4
@@ -11864,24 +11866,23 @@ minimaxm3-fp8-gb300-dynamo-vllm:
           ep: 4
           dp-attn: false
 
-      # Max throughput: 1 prefill (TP4+EP4) + wide DP16+EP decode, 5 nodes (20 GPU).
-      - conc-list: [256, 512, 1024]
+      # Mid curve: 1P + 4D, TP4 ep1, 5 nodes (20 GPU). Multiple independent
+      # TP4 decode workers instead of wide-EP decode.
+      - conc-list: [32, 64, 128, 256]
         prefill:
           num-worker: 1
           tp: 4
-          ep: 4
+          ep: 1
           dp-attn: false
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p-dep16dec-5n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml"
         decode:
-          num-worker: 1
-          tp: 1
-          ep: 16
-          dp-attn: true
+          num-worker: 4
+          tp: 4
+          ep: 1
+          dp-attn: false
 
-      # Rack-saturating max throughput: 5 prefill + 12 decode, TP4 ep1 (17 nodes).
-      # Balanced prefill:decode ratio (single prefill starved the decode pool)
-      # + TP-only decode (Qwen3.5-A17B pattern; M3 wide-EP all-to-all was slower).
+      # Rack-saturating: 5 prefill + 12 decode, TP4 ep1 (17 nodes).
       - conc-list: [1024, 2048, 4096]
         prefill:
           num-worker: 5
@@ -11896,6 +11897,23 @@ minimaxm3-fp8-gb300-dynamo-vllm:
           ep: 1
           dp-attn: false
 
+      # Prefill-heavy rack-saturating: 10P + 7D, TP4 ep1 (17 nodes).
+      # At 8k context, prefill is 8x heavier — 5P:12D showed 616-req prefill
+      # backlog. DSR1/DSV4 GB300 patterns use 6-10 prefill workers for 8k1k.
+      - conc-list: [1024, 2048, 4096]
+        prefill:
+          num-worker: 10
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-10p7d-tp4ep1-17n.yaml"
+        decode:
+          num-worker: 7
+          tp: 4
+          ep: 1
+          dp-attn: false
+
 # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
 # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
 # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p-dep16dec-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml
similarity index 68%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p-dep16dec-5n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml
index 434007451..c9bb30163 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p-dep16dec-5n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml
@@ -1,38 +1,28 @@
-name: "minimax-m3-vllm-disagg-gb300-1p-dep16dec-5n-1k1k"
+name: "minimax-m3-vllm-disagg-gb300-1p4d-tp4ep1-1k1k"
 
-# Combined fix + scale-up: MNNVL KV transfer ON + wide-EP decode.
-# Iter 2 showed DP16 decode starved because cross-node KV ran on RDMA/TCP
-# (~40-120 MB/s, decode idle). Iter 3 showed UCX_CUDA_IPC_ENABLE_MNNVL=y +
-# --enable-cumem-allocator lifts 1P1D KV transfer 5-6x (+17-23% tok/s/gpu).
-# This recipe puts both together: 1 compact TP4 prefill -> NixlConnector
-# over multi-node NVLink -> wide DP16+EP decode (16 GPUs / 4 nodes). Tests
-# whether fast MNNVL KV transfer makes the wide decode pool (which B300
-# cannot build on one NVLink island) actually pay off.
-# --block-size 128 mandatory (MSA cache); text-only -> language-model-only.
+# MiniMax-M3 GB300 mid-curve disagg: 1 prefill + 4 decode, TP4 ep1, 5 nodes.
+# DSR1 low-latency pattern adapted for M3: multiple independent TP4 decode
+# workers instead of wide-EP decode (dep16dec). M3's MoE all-to-all overhead
+# makes wide EP slower than independent TP4 workers.
+# MNNVL KV transfer + cumem. --block-size 128 mandatory (MSA cache).
 
 model:
   path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
   container: "vllm/vllm-openai:minimax-m3"
   precision: "fp8"
-
 dynamo:
   install: true
   wheel: "1.2.0.dev20260526"
-
 sbatch_directives:
   mem: "0"
   cpus-per-task: "72"
 srun_options:
   mem: "0"
-
-
 slurm:
   time_limit: "8:00:00"
-
 health_check:
   max_attempts: 720
   interval_seconds: 10
-
 extra_mount:
   - "__M3_HF_HOME__:__M3_HF_HOME__"
 
@@ -42,9 +32,9 @@ resources:
   prefill_nodes: 1
   decode_nodes: 4
   prefill_workers: 1
-  decode_workers: 1
+  decode_workers: 4
   gpus_per_prefill: 4
-  gpus_per_decode: 16
+  gpus_per_decode: 4
 
 frontend:
   type: dynamo
@@ -73,12 +63,11 @@ backend:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 4
       pipeline-parallel-size: 1
-      enable-expert-parallel: true
       enable-cumem-allocator: true
       enforce-eager: true
-      max-model-len: 2304
       max-num-seqs: 16
       max-num-batched-tokens: 16384
+      max-model-len: 2304
       block-size: 128
       language-model-only: true
       gpu-memory-utilization: 0.9
@@ -88,16 +77,13 @@ backend:
 
     decode:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 1
+      tensor-parallel-size: 4
       pipeline-parallel-size: 1
-      data-parallel-size: 16
-      data-parallel-rpc-port: 13345
-      enable-expert-parallel: true
       enable-cumem-allocator: true
-      max-model-len: 2304
       max-num-seqs: 256
       max-num-batched-tokens: 256
       max-cudagraph-capture-size: 512
+      max-model-len: 2304
       block-size: 128
       language-model-only: true
       gpu-memory-utilization: 0.9
@@ -109,4 +95,4 @@ benchmark:
   type: "sa-bench"
   isl: 1024
   osl: 1024
-  concurrencies: "512x1024x2048"
+  concurrencies: "32x64x128x256x512"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-10p7d-tp4ep1-17n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-10p7d-tp4ep1-17n.yaml
new file mode 100644
index 000000000..f005aa6dc
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-10p7d-tp4ep1-17n.yaml
@@ -0,0 +1,99 @@
+name: "minimax-m3-vllm-disagg-gb300-10p7d-tp4ep1-8k1k"
+
+# MiniMax-M3 GB300 prefill-heavy max-throughput disagg, 17 nodes / 68 GPU.
+# At 8k context, prefill is 8x heavier per request: the 5P:12D ratio (tuned
+# for light 1k1k prefill) is prefill-bound — 616 reqs queued behind 5 workers.
+# This config rebalances to 10P:7D per DSR1/DSV4 GB300 patterns (DSR1 8k1k
+# uses 6P:1D; DSV4 uses 10P:8D at 18 nodes). TP4 ep1 decode (no wide EP —
+# M3's MoE all-to-all overhead makes it slower). MNNVL KV + cumem.
+
+model:
+  path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+sbatch_directives:
+  mem: "0"
+  cpus-per-task: "72"
+srun_options:
+  mem: "0"
+slurm:
+  time_limit: "8:00:00"
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+extra_mount:
+  - "__M3_HF_HOME__:__M3_HF_HOME__"
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 10
+  decode_nodes: 7
+  prefill_workers: 10
+  decode_workers: 7
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    HF_HOME: "__M3_HF_HOME__"
+    HF_HUB_OFFLINE: "1"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    HF_HOME: "__M3_HF_HOME__"
+    HF_HUB_OFFLINE: "1"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-cumem-allocator: true
+      enforce-eager: true
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      max-model-len: 9472
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-cumem-allocator: true
+      max-num-seqs: 256
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 512
+      max-model-len: 9472
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1024x2048x4096"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p-dep16dec-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml
similarity index 68%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p-dep16dec-5n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml
index 2771039e8..86dbfca17 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p-dep16dec-5n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml
@@ -1,38 +1,28 @@
-name: "minimax-m3-vllm-disagg-gb300-1p-dep16dec-5n-8k1k"
+name: "minimax-m3-vllm-disagg-gb300-1p4d-tp4ep1-8k1k"
 
-# Combined fix + scale-up: MNNVL KV transfer ON + wide-EP decode.
-# Iter 2 showed DP16 decode starved because cross-node KV ran on RDMA/TCP
-# (~40-120 MB/s, decode idle). Iter 3 showed UCX_CUDA_IPC_ENABLE_MNNVL=y +
-# --enable-cumem-allocator lifts 1P1D KV transfer 5-6x (+17-23% tok/s/gpu).
-# This recipe puts both together: 1 compact TP4 prefill -> NixlConnector
-# over multi-node NVLink -> wide DP16+EP decode (16 GPUs / 4 nodes). Tests
-# whether fast MNNVL KV transfer makes the wide decode pool (which B300
-# cannot build on one NVLink island) actually pay off.
-# --block-size 128 mandatory (MSA cache); text-only -> language-model-only.
+# MiniMax-M3 GB300 mid-curve disagg: 1 prefill + 4 decode, TP4 ep1, 5 nodes.
+# DSR1 low-latency pattern adapted for M3: multiple independent TP4 decode
+# workers instead of wide-EP decode (dep16dec). M3's MoE all-to-all overhead
+# makes wide EP slower than independent TP4 workers.
+# MNNVL KV transfer + cumem. --block-size 128 mandatory (MSA cache).
 
 model:
   path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
   container: "vllm/vllm-openai:minimax-m3"
   precision: "fp8"
-
 dynamo:
   install: true
   wheel: "1.2.0.dev20260526"
-
 sbatch_directives:
   mem: "0"
   cpus-per-task: "72"
 srun_options:
   mem: "0"
-
-
 slurm:
   time_limit: "8:00:00"
-
 health_check:
   max_attempts: 720
   interval_seconds: 10
-
 extra_mount:
   - "__M3_HF_HOME__:__M3_HF_HOME__"
 
@@ -42,9 +32,9 @@ resources:
   prefill_nodes: 1
   decode_nodes: 4
   prefill_workers: 1
-  decode_workers: 1
+  decode_workers: 4
   gpus_per_prefill: 4
-  gpus_per_decode: 16
+  gpus_per_decode: 4
 
 frontend:
   type: dynamo
@@ -73,12 +63,11 @@ backend:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 4
       pipeline-parallel-size: 1
-      enable-expert-parallel: true
       enable-cumem-allocator: true
       enforce-eager: true
-      max-model-len: 9472
       max-num-seqs: 16
       max-num-batched-tokens: 16384
+      max-model-len: 9472
       block-size: 128
       language-model-only: true
       gpu-memory-utilization: 0.9
@@ -88,16 +77,13 @@ backend:
 
     decode:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 1
+      tensor-parallel-size: 4
       pipeline-parallel-size: 1
-      data-parallel-size: 16
-      data-parallel-rpc-port: 13345
-      enable-expert-parallel: true
       enable-cumem-allocator: true
-      max-model-len: 9472
       max-num-seqs: 256
       max-num-batched-tokens: 256
       max-cudagraph-capture-size: 512
+      max-model-len: 9472
       block-size: 128
       language-model-only: true
       gpu-memory-utilization: 0.9
@@ -109,4 +95,4 @@ benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: "256x512x1024"
+  concurrencies: "32x64x128x256"

From 1d71f496cfbe400481bd56c91be63f72d9ace933 Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Sun, 14 Jun 2026 01:49:20 -0400
Subject: [PATCH 11/14] [Klaud Cold] minimaxm3-fp8-mi300x-vllm-mtp: day-zero
 MiniMax-M3 EAGLE3 (MTP) MI300X recipe (#1749)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* minimaxm3-fp8-mi300x-vllm-mtp: day-zero MiniMax-M3 EAGLE3 MI300X recipe

Adds the spec-decoding=mtp sibling of minimaxm3-fp8-mi300x-vllm, based
on the MI300X non-MTP recipe + the MI355X MTP recipe. Keeps the MI300X
serve shape (BF16 KV cache — gfx942 lacks calibrated ROCm FP8 attention
scales — plus --no-enable-prefix-caching, TRITON_ATTN, --enforce-eager,
minimax_m3 parsers) and adds the Inferact/MiniMax-M3-EAGLE3 draft via
--speculative-config (method eagle3, 3 spec tokens) + chat-template
prompts.

Carries the same in-place EAGLE3 patch as the MI355X MTP recipe: the
shipped ROCm image's AMD MiniMax-M3 model lacks SupportsEagle3, so the
recipe patches the installed amd/model.py before serving
(functionstackx/vllm#1, upstream vllm-project/vllm#45546; validated
green on MI355X). Idempotent; hard-fails on base drift.

TP8-only search space (gfx942 192 GB is memory-tight, like H100), TP8
latency rows started at conc 1, matching the H100/MI355X MTP recipes.
Also adds SPEC_SUFFIX to launch_mi300x-amds.sh so spec-decoding=mtp
routes to the _mtp script (the launcher hardcoded _mi300x.sh).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>

* perf-changelog: fill in PR link for minimaxm3-fp8-mi300x-vllm-mtp (#1749)

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>

---------

Co-authored-by: Claude Fable 5 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml               |  30 +++
 .../fixed_seq_len/minimaxm3_fp8_mi300x_mtp.sh | 212 ++++++++++++++++++
 perf-changelog.yaml                           |  10 +
 runners/launch_mi300x-amds.sh                 |   6 +-
 4 files changed, 257 insertions(+), 1 deletion(-)
 create mode 100644 benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x_mtp.sh

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 2528798b1..f18b3f94e 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2871,3 +2871,33 @@ minimaxm3-fp8-mi300x-vllm:
       search-space:
       - { tp: 8, conc-start: 4, conc-end: 64 }
       - { tp: 8, ep: 8, conc-start: 128, conc-end: 256 }
+
+# EAGLE3 speculative-decoding (spec-decoding: mtp) variant of
+# minimaxm3-fp8-mi300x-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the
+# Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens). Same TP8-only
+# search space as the non-MTP MI300X entry (gfx942 192 GB is memory-tight, like
+# H100), with the TP8 latency rows started at conc 1 to capture single-request
+# latency — matching the H100/MI355X MTP recipes. The shipped ROCm image lacks
+# SupportsEagle3 on the AMD MiniMax-M3 model, so the recipe applies that fix
+# in-place at runtime (functionstackx/vllm#1, upstream vllm-project/vllm#45546;
+# validated green on MI355X) before serving.
+minimaxm3-fp8-mi300x-vllm-mtp:
+  image: vllm/vllm-openai-rocm:minimax-m3
+  model: MiniMaxAI/MiniMax-M3-MXFP8
+  model-prefix: minimaxm3
+  runner: mi300x
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 1, conc-end: 128, spec-decoding: mtp }
+      - { tp: 8, ep: 8, conc-start: 256, conc-end: 256, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
+      - { tp: 8, ep: 8, conc-start: 128, conc-end: 256, spec-decoding: mtp }
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x_mtp.sh
new file mode 100644
index 000000000..9dd10b30a
--- /dev/null
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x_mtp.sh
@@ -0,0 +1,212 @@
+#!/usr/bin/env bash
+
+# MiniMax-M3 MXFP8 MI300X (gfx942) single-node vLLM recipe with EAGLE3
+# speculative decoding — the spec-decoding=mtp variant of
+# minimaxm3_fp8_mi300x.sh. Adds the Inferact/MiniMax-M3-EAGLE3 draft head via
+# --speculative-config with 3 speculative tokens. Everything else mirrors the
+# non-MTP MI300X recipe: mandatory --block-size 128, --language-model-only for
+# the text-only benchmark, --attention-backend TRITON_ATTN, --enforce-eager,
+# and --no-enable-prefix-caching. The default BF16 KV cache is retained (unlike
+# the MI355X recipe's FP8 KV cache): gfx942 has no calibrated q/prob scales for
+# ROCm FP8 attention and vLLM's fallback scale of 1.0 corrupts accuracy.
+#
+# Unlike the CUDA recipes, the drafter needs no attention_backend override:
+# the FlashInfer "page size 128 requires GQA/MQA" limitation that forced
+# FLASH_ATTN for the EAGLE3 MHA head on Blackwell is FlashInfer/CUDA-specific.
+# Here the whole server runs on TRITON_ATTN (set globally below), which serves
+# the MHA draft fine.
+#
+# [AI generated draft test] The shipped vllm/vllm-openai-rocm:minimax-m3 image
+# does NOT implement SupportsEagle3 on the AMD MiniMax-M3 model, so EAGLE3
+# engine init fails with "Model does not support EAGLE3 interface but
+# aux_hidden_state_outputs was requested". This recipe applies that fix
+# (functionstackx/vllm#1 — ported from nvidia/model.py, upstreamed as
+# vllm-project/vllm#45546) in-place to the installed vllm before serving, so we
+# can validate EAGLE3 on real MI300X hardware ahead of an image rebuild. The
+# same patch is validated green on MI355X. It is idempotent and fails the job
+# loudly if the installed amd/model.py has drifted from the expected base.
+
+source "$(dirname "$0")/../../benchmark_lib.sh"
+
+check_env_vars \
+    MODEL \
+    TP \
+    EP_SIZE \
+    DP_ATTENTION \
+    CONC \
+    ISL \
+    OSL \
+    MAX_MODEL_LEN \
+    RANDOM_RANGE_RATIO \
+    RESULT_FILENAME
+
+DRAFT_MODEL="Inferact/MiniMax-M3-EAGLE3"
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+# MODEL is a bare HF id on the mi300x single-node runner (a fast cache hit when
+# pre-staged). The EAGLE3 draft is not staged; fetch it into the same cache.
+if [[ "$MODEL" != /* ]]; then
+  hf download "$MODEL"
+  hf download "$DRAFT_MODEL"
+fi
+
+if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
+    export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
+fi
+
+SERVER_LOG=/workspace/server.log
+export VLLM_ENGINE_READY_TIMEOUT_S=3600
+
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+fi
+
+PARALLEL_ARGS=(--tensor-parallel-size "$TP")
+if [ "${DP_ATTENTION}" = "true" ]; then
+    PARALLEL_ARGS=(
+        --tensor-parallel-size 1
+        --data-parallel-size "$TP"
+        --enable-expert-parallel
+    )
+elif [ "$EP_SIZE" -gt 1 ]; then
+    PARALLEL_ARGS+=(--enable-expert-parallel)
+fi
+
+# use 3 speculative tokens for all configs for now
+NUM_SPEC_TOKENS=3
+
+# [AI generated draft test] Patch the installed AMD MiniMax-M3 model to add the
+# SupportsEagle3 interface (functionstackx/vllm#1, upstream vllm-project/vllm#45546).
+# Mirrors nvidia/model.py: adds EagleModelMixin to the inner model +
+# aux-hidden-state emission, and SupportsEagle3 to the two outer classes.
+# Idempotent; hard-fails if the installed file has drifted from the expected
+# base (so we never silently run unpatched and mislabel the result).
+python3 - <<'PYEOF' || { echo "EAGLE3 in-place patch failed" >&2; exit 1; }
+import ast, importlib.util, pathlib, sys
+
+spec = importlib.util.find_spec("vllm")
+root = pathlib.Path(spec.submodule_search_locations[0])
+target = root / "models" / "minimax_m3" / "amd" / "model.py"
+src = target.read_text()
+
+if "EagleModelMixin" in src and "class MiniMaxM3Model(nn.Module, EagleModelMixin):" in src:
+    print(f"[eagle3-patch] already applied: {target}")
+    sys.exit(0)
+
+edits = [
+    (
+        "from vllm.model_executor.models.interfaces import (\n"
+        "    MultiModalEmbeddings,\n"
+        "    SupportsMultiModal,\n"
+        ")",
+        "from vllm.model_executor.models.interfaces import (\n"
+        "    EagleModelMixin,\n"
+        "    MultiModalEmbeddings,\n"
+        "    SupportsEagle3,\n"
+        "    SupportsMultiModal,\n"
+        ")",
+    ),
+    (
+        "class MiniMaxM3Model(nn.Module):",
+        "class MiniMaxM3Model(nn.Module, EagleModelMixin):",
+    ),
+    (
+        "        inputs_embeds: torch.Tensor | None = None,\n"
+        "    ) -> torch.Tensor:\n"
+        "        if inputs_embeds is not None:",
+        "        inputs_embeds: torch.Tensor | None = None,\n"
+        "    ) -> torch.Tensor | tuple[torch.Tensor, list[torch.Tensor]]:\n"
+        "        if inputs_embeds is not None:",
+    ),
+    (
+        "        residual = None\n\n"
+        "        for layer in self.layers[self.start_layer : self.end_layer]:\n"
+        "            hidden_states, residual = layer(positions, hidden_states, residual)\n\n"
+        "        hidden_states, _ = self.norm(hidden_states, residual)\n"
+        "        return hidden_states",
+        "        residual = None\n\n"
+        "        # EAGLE3 is not yet compatible with pipeline parallel\n"
+        "        aux_hidden_states = self._maybe_add_hidden_state([], 0, hidden_states, residual)\n"
+        "        for idx, layer in enumerate(self.layers[self.start_layer : self.end_layer]):\n"
+        "            hidden_states, residual = layer(positions, hidden_states, residual)\n"
+        "            self._maybe_add_hidden_state(\n"
+        "                aux_hidden_states, idx + 1, hidden_states, residual\n"
+        "            )\n\n"
+        "        hidden_states, _ = self.norm(hidden_states, residual)\n\n"
+        "        if len(aux_hidden_states) > 0:\n"
+        "            return hidden_states, aux_hidden_states\n"
+        "        return hidden_states",
+    ),
+    (
+        "class MiniMaxM3SparseForCausalLM(nn.Module):",
+        "class MiniMaxM3SparseForCausalLM(nn.Module, SupportsEagle3):",
+    ),
+    (
+        "class MiniMaxM3SparseForConditionalGeneration(nn.Module, SupportsMultiModal):",
+        "class MiniMaxM3SparseForConditionalGeneration(\n"
+        "    nn.Module, SupportsMultiModal, SupportsEagle3\n"
+        "):",
+    ),
+]
+
+for old, new in edits:
+    count = src.count(old)
+    if count != 1:
+        sys.exit(
+            f"[eagle3-patch] anchor matched {count} times (expected 1); "
+            f"installed {target} has drifted from the expected base — aborting"
+        )
+    src = src.replace(old, new)
+
+ast.parse(src)
+target.write_text(src)
+print(f"[eagle3-patch] applied EAGLE3 support to {target}")
+PYEOF
+
+start_gpu_monitor
+
+set -x
+vllm serve "$MODEL" --port "$PORT" \
+    "${PARALLEL_ARGS[@]}" \
+    --block-size 128 \
+    --no-enable-prefix-caching \
+    --language-model-only \
+    --max-model-len "$MAX_MODEL_LEN" \
+    --attention-backend TRITON_ATTN \
+    --enforce-eager \
+    --speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \
+    --tool-call-parser minimax_m3 \
+    --reasoning-parser minimax_m3 \
+    --enable-auto-tool-choice > "$SERVER_LOG" 2>&1 &
+
+SERVER_PID=$!
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+pip install -q datasets pandas
+
+# Spec-decode acceptance rate degrades on raw random tokens; route prompts
+# through the chat template as the other MTP recipes do.
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts "$((CONC * 10))" \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/ \
+    --trust-remote-code \
+    --use-chat-template
+
+if [ "${RUN_EVAL}" = "true" ]; then
+    run_eval --framework lm-eval --port "$PORT"
+    append_lm_eval_summary
+fi
+
+stop_gpu_monitor
+set +x
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 1029c5700..168ce234c 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3758,3 +3758,13 @@
     - "H100-aligned layouts and concurrency ranges: TP8 and TP8+EP8 across 1k1k and 8k1k"
     - "Fix launch_mi300x-amds.sh node exclusion to use the current short Slurm node name"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1746
+
+- config-keys:
+    - minimaxm3-fp8-mi300x-vllm-mtp
+  description:
+    - "Initial submission: MiniMax-M3 MXFP8 MI300X (gfx942) vLLM benchmark with EAGLE3 speculative decoding (target: MiniMaxAI/MiniMax-M3-MXFP8, draft: Inferact/MiniMax-M3-EAGLE3, 3 speculative tokens) — spec-decoding=mtp variant of the MI300X day-zero recipe"
+    - "Image: vllm/vllm-openai-rocm:minimax-m3 (same day-zero ROCm build as the non-MTP entry)"
+    - "Serve shape follows minimaxm3-fp8-mi300x-vllm: --block-size 128, --no-enable-prefix-caching, --language-model-only, --attention-backend TRITON_ATTN, --enforce-eager, minimax_m3 parsers, and the default BF16 KV cache (gfx942 lacks calibrated ROCm FP8 attention scales); prompts routed through the chat template for realistic acceptance"
+    - "TP8-only search space (gfx942 192 GB is memory-tight, like H100): TP8 latency rows started at conc 1, TP8+EP8 (TEP) at high concurrency, across 1k1k and 8k1k"
+    - "[AI generated draft test] The shipped ROCm image's AMD MiniMax-M3 model lacks SupportsEagle3, so the recipe patches it in-place at runtime (functionstackx/vllm#1, upstream vllm-project/vllm#45546; validated green on MI355X) before serving — validates EAGLE3 on MI300X ahead of an image rebuild"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1749
diff --git a/runners/launch_mi300x-amds.sh b/runners/launch_mi300x-amds.sh
index ce04ceadd..b0c1e22c8 100644
--- a/runners/launch_mi300x-amds.sh
+++ b/runners/launch_mi300x-amds.sh
@@ -7,6 +7,10 @@ PARTITION="compute"
 SQUASH_FILE="/home/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
 LOCK_FILE="${SQUASH_FILE}.lock"
 
+# Route spec-decoding=mtp configs to the _mtp benchmark script (parity with
+# the h200 launchers, which have carried SPEC_SUFFIX since #392).
+SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
+
 set -x
 
 # Exclude known-bad nodes; let Slurm pick from anything else:
@@ -37,6 +41,6 @@ srun --jobid=$JOB_ID \
 --container-remap-root \
 --container-workdir=/workspace/ \
 --no-container-entrypoint --export=ALL \
-bash benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_mi300x.sh
+bash benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_mi300x${SPEC_SUFFIX}.sh
 
 scancel $JOB_ID

From 2bf5851d65bc68cb8510e2a0f545b694f4d1b5c3 Mon Sep 17 00:00:00 2001
From: Cameron Quilici <cjquilici@gmail.com>
Date: Sun, 14 Jun 2026 01:43:22 -0500
Subject: [PATCH 12/14] [AMD] perf: enable MiniMax M3 CUDA graphs on MI300X
 (#1750)

* feat: add MiniMax M3 MI300X day-zero benchmark

* chore: link MiniMax M3 MI300X changelog

* fix: mount ROCm devices on MI300X

* fix: disable prefix caching for MI300X MiniMax M3

* fix: use bf16 kv cache for MI300X MiniMax M3

* perf: enable MI300X MiniMax M3 CUDA graphs

* chore: link MI300X CUDA graph changelog
---
 .../single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh       | 2 +-
 perf-changelog.yaml                                         | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh
index e3522e00a..f2cdaf284 100755
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh
@@ -32,6 +32,7 @@ fi
 
 SERVER_LOG=/workspace/server.log
 export VLLM_ENGINE_READY_TIMEOUT_S=3600
+export VLLM_USE_BREAKABLE_CUDAGRAPH=0
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
@@ -58,7 +59,6 @@ vllm serve "$MODEL" --port "$PORT" \
     --language-model-only \
     --max-model-len "$MAX_MODEL_LEN" \
     --attention-backend TRITON_ATTN \
-    --enforce-eager \
     --tool-call-parser minimax_m3 \
     --reasoning-parser minimax_m3 \
     --enable-auto-tool-choice > "$SERVER_LOG" 2>&1 &
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 168ce234c..65c43c0d7 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3768,3 +3768,9 @@
     - "TP8-only search space (gfx942 192 GB is memory-tight, like H100): TP8 latency rows started at conc 1, TP8+EP8 (TEP) at high concurrency, across 1k1k and 8k1k"
     - "[AI generated draft test] The shipped ROCm image's AMD MiniMax-M3 model lacks SupportsEagle3, so the recipe patches it in-place at runtime (functionstackx/vllm#1, upstream vllm-project/vllm#45546; validated green on MI355X) before serving — validates EAGLE3 on MI300X ahead of an image rebuild"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1749
+
+- config-keys:
+    - minimaxm3-fp8-mi300x-vllm
+  description:
+    - "Enable CUDA graphs for MiniMax-M3 MXFP8 on MI300X and set VLLM_USE_BREAKABLE_CUDAGRAPH=0 per AMD guidance"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1750

From fd922a6a524c1d31e9dac8e6c9dce10b7748a6af Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Sun, 14 Jun 2026 04:29:37 -0400
Subject: [PATCH 13/14] [Klaud Cold] minimaxm3-fp8-mi300x-vllm-mtp: run with
 CUDA graphs (drop --enforce-eager, VLLM_USE_BREAKABLE_CUDAGRAPH=0) (#1756)

* minimaxm3-fp8-mi300x-vllm-mtp: run with CUDA graphs (drop --enforce-eager)

Remove --enforce-eager from the MI300X EAGLE3 MTP recipe and set
VLLM_USE_BREAKABLE_CUDAGRAPH=0, matching the non-MTP MI300X recipe
(#1750). Avoids the M3-decode breakable-cudagraph path that previously
forced eager execution. Re-sweeps minimaxm3-fp8-mi300x-vllm-mtp.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>

* perf-changelog: fill in PR link for minimaxm3-fp8-mi300x-vllm-mtp cudagraphs

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>

---------

Co-authored-by: Claude Fable 5 <noreply@anthropic.com>
---
 .../single_node/fixed_seq_len/minimaxm3_fp8_mi300x_mtp.sh | 8 +++++---
 perf-changelog.yaml                                       | 7 +++++++
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x_mtp.sh
index 9dd10b30a..40fbab536 100644
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x_mtp.sh
@@ -5,8 +5,10 @@
 # minimaxm3_fp8_mi300x.sh. Adds the Inferact/MiniMax-M3-EAGLE3 draft head via
 # --speculative-config with 3 speculative tokens. Everything else mirrors the
 # non-MTP MI300X recipe: mandatory --block-size 128, --language-model-only for
-# the text-only benchmark, --attention-backend TRITON_ATTN, --enforce-eager,
-# and --no-enable-prefix-caching. The default BF16 KV cache is retained (unlike
+# the text-only benchmark, --attention-backend TRITON_ATTN, and
+# --no-enable-prefix-caching. Runs with CUDA graphs (no --enforce-eager);
+# VLLM_USE_BREAKABLE_CUDAGRAPH=0 avoids the M3-decode breakable-cudagraph path.
+# The default BF16 KV cache is retained (unlike
 # the MI355X recipe's FP8 KV cache): gfx942 has no calibrated q/prob scales for
 # ROCm FP8 attention and vLLM's fallback scale of 1.0 corrupts accuracy.
 #
@@ -59,6 +61,7 @@ fi
 
 SERVER_LOG=/workspace/server.log
 export VLLM_ENGINE_READY_TIMEOUT_S=3600
+export VLLM_USE_BREAKABLE_CUDAGRAPH=0
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
@@ -176,7 +179,6 @@ vllm serve "$MODEL" --port "$PORT" \
     --language-model-only \
     --max-model-len "$MAX_MODEL_LEN" \
     --attention-backend TRITON_ATTN \
-    --enforce-eager \
     --speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \
     --tool-call-parser minimax_m3 \
     --reasoning-parser minimax_m3 \
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 65c43c0d7..0b9067114 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3774,3 +3774,10 @@
   description:
     - "Enable CUDA graphs for MiniMax-M3 MXFP8 on MI300X and set VLLM_USE_BREAKABLE_CUDAGRAPH=0 per AMD guidance"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1750
+
+- config-keys:
+    - minimaxm3-fp8-mi300x-vllm-mtp
+  description:
+    - "Run the MiniMax-M3 MXFP8 MI300X EAGLE3 MTP recipe with CUDA graphs instead of --enforce-eager"
+    - "Drop --enforce-eager and set VLLM_USE_BREAKABLE_CUDAGRAPH=0 (matching the non-MTP MI300X recipe, #1750), which avoids the M3-decode breakable-cudagraph path that previously forced eager execution"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1756

From 805dc1ce4614ee0f5690aa48d973d5839e06295a Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Sun, 14 Jun 2026 01:55:51 -0700
Subject: [PATCH 14/14] M3 GB300: drop dominated configs, restore 1P1D full
 range
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Data from run 27489709722 showed:
- 1P4D (20 GPU) strictly dominated by 1P1D (8 GPU): 320 (1P4D) vs 974
  (1P1D) out/s/gpu @ conc 128 (1k1k). Single prefill can't feed 4 decode
  workers — 1P:4D ratio is too decode-heavy.
- 8k1k 5P12D (68 GPU) dominated by 10P7D: 567 (5P12D) vs 874 (10P7D)
  out/s/gpu @ conc 1024. Prefill-heavy ratio is correct for long context.

Changes:
- Remove 1P4D recipes (both 1k1k and 8k1k)
- Remove 8k1k 5P12D recipe (dominated by 10P7D)
- Restore 1P1D to full concurrency range [8-512] 1k1k, [8-256] 8k1k
  (was truncated to [4-32] for 1k1k and [4-16] for 8k1k to avoid 1P4D
  overlap; conc 4 is no longer swept)

Final GB300 configs: 1P1D (latency-to-mid) + rack-saturating (max tput)
  1k1k: 1P1D [8-512] + 5P12D [2048-8192]
  8k1k: 1P1D [8-256] + 10P7D [1024-4096]
---
 .github/configs/nvidia-master.yaml            |  58 ++--------
 .../1k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml     |  98 -----------------
 .../8k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml     |  98 -----------------
 .../8k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml   | 102 ------------------
 4 files changed, 6 insertions(+), 350 deletions(-)
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 0fa0079b3..46772c123 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -11797,8 +11797,9 @@ minimaxm3-fp8-gb300-dynamo-vllm:
       # out tok/s/gpu @ conc 64/128/256 (1P1D). GB300-only win — B300's 8-GPU
       # IB islands cannot move KV over multi-node NVLink. GB300-cw is 4 GPU/node,
       # so prefill and decode each take whole 4-GPU nodes (no 8-GPU collocation).
-      # Low latency: 1P+1D TP4+EP4 split, 2 nodes (8 GPU).
-      - conc-list: [4, 8, 16, 32]
+      # 1P+1D TP4+EP4 split, 2 nodes (8 GPU). Best per-GPU efficiency:
+      # peaks at ~1646 out/s/gpu @ conc 256 (1k1k). Covers latency-to-mid.
+      - conc-list: [8, 16, 32, 64, 128, 256, 512]
         prefill:
           num-worker: 1
           tp: 4
@@ -11812,23 +11813,6 @@ minimaxm3-fp8-gb300-dynamo-vllm:
           ep: 4
           dp-attn: false
 
-      # Mid curve: 1P + 4D, TP4 ep1, 5 nodes (20 GPU). DSR1 pattern: multiple
-      # independent TP4 decode workers instead of wide-EP decode (M3's MoE
-      # all-to-all makes wide EP slower).
-      - conc-list: [64, 128, 256, 512]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml"
-        decode:
-          num-worker: 4
-          tp: 4
-          ep: 1
-          dp-attn: false
-
       # Rack-saturating max throughput: 5 prefill + 12 decode, TP4 ep1 (17 nodes).
       # Balanced prefill:decode ratio (single prefill starved the decode pool)
       # + TP-only decode (Qwen3.5-A17B pattern; M3 wide-EP all-to-all was slower).
@@ -11851,8 +11835,9 @@ minimaxm3-fp8-gb300-dynamo-vllm:
       search-space:
       # 8k1k long context: prefill is heavier and KV is larger, so concurrency
       # is lower than 1k1k. Same disagg shapes + multi-node-NVLink KV transfer.
-      # Low latency: 1P+1D TP4+EP4 split, 2 nodes (8 GPU).
-      - conc-list: [4, 8, 16]
+      # 1P+1D TP4+EP4 split, 2 nodes (8 GPU). Best per-GPU efficiency:
+      # peaks at ~1209 out/s/gpu @ conc 256 (8k1k). Covers latency-to-mid.
+      - conc-list: [8, 16, 32, 64, 128, 256]
         prefill:
           num-worker: 1
           tp: 4
@@ -11866,37 +11851,6 @@ minimaxm3-fp8-gb300-dynamo-vllm:
           ep: 4
           dp-attn: false
 
-      # Mid curve: 1P + 4D, TP4 ep1, 5 nodes (20 GPU). Multiple independent
-      # TP4 decode workers instead of wide-EP decode.
-      - conc-list: [32, 64, 128, 256]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml"
-        decode:
-          num-worker: 4
-          tp: 4
-          ep: 1
-          dp-attn: false
-
-      # Rack-saturating: 5 prefill + 12 decode, TP4 ep1 (17 nodes).
-      - conc-list: [1024, 2048, 4096]
-        prefill:
-          num-worker: 5
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml"
-        decode:
-          num-worker: 12
-          tp: 4
-          ep: 1
-          dp-attn: false
-
       # Prefill-heavy rack-saturating: 10P + 7D, TP4 ep1 (17 nodes).
       # At 8k context, prefill is 8x heavier — 5P:12D showed 616-req prefill
       # backlog. DSR1/DSV4 GB300 patterns use 6-10 prefill workers for 8k1k.
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml
deleted file mode 100644
index c9bb30163..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml
+++ /dev/null
@@ -1,98 +0,0 @@
-name: "minimax-m3-vllm-disagg-gb300-1p4d-tp4ep1-1k1k"
-
-# MiniMax-M3 GB300 mid-curve disagg: 1 prefill + 4 decode, TP4 ep1, 5 nodes.
-# DSR1 low-latency pattern adapted for M3: multiple independent TP4 decode
-# workers instead of wide-EP decode (dep16dec). M3's MoE all-to-all overhead
-# makes wide EP slower than independent TP4 workers.
-# MNNVL KV transfer + cumem. --block-size 128 mandatory (MSA cache).
-
-model:
-  path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
-  container: "vllm/vllm-openai:minimax-m3"
-  precision: "fp8"
-dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
-sbatch_directives:
-  mem: "0"
-  cpus-per-task: "72"
-srun_options:
-  mem: "0"
-slurm:
-  time_limit: "8:00:00"
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-extra_mount:
-  - "__M3_HF_HOME__:__M3_HF_HOME__"
-
-resources:
-  gpu_type: "gb300"
-  gpus_per_node: 4
-  prefill_nodes: 1
-  decode_nodes: 4
-  prefill_workers: 1
-  decode_workers: 4
-  gpus_per_prefill: 4
-  gpus_per_decode: 4
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  prefill_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    HF_HOME: "__M3_HF_HOME__"
-    HF_HUB_OFFLINE: "1"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-
-  decode_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    HF_HOME: "__M3_HF_HOME__"
-    HF_HUB_OFFLINE: "1"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-
-  vllm_config:
-    prefill:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
-      enable-cumem-allocator: true
-      enforce-eager: true
-      max-num-seqs: 16
-      max-num-batched-tokens: 16384
-      max-model-len: 2304
-      block-size: 128
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 32
-
-    decode:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
-      enable-cumem-allocator: true
-      max-num-seqs: 256
-      max-num-batched-tokens: 256
-      max-cudagraph-capture-size: 512
-      max-model-len: 2304
-      block-size: 128
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 32
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "32x64x128x256x512"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml
deleted file mode 100644
index 86dbfca17..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p4d-tp4ep1-5n.yaml
+++ /dev/null
@@ -1,98 +0,0 @@
-name: "minimax-m3-vllm-disagg-gb300-1p4d-tp4ep1-8k1k"
-
-# MiniMax-M3 GB300 mid-curve disagg: 1 prefill + 4 decode, TP4 ep1, 5 nodes.
-# DSR1 low-latency pattern adapted for M3: multiple independent TP4 decode
-# workers instead of wide-EP decode (dep16dec). M3's MoE all-to-all overhead
-# makes wide EP slower than independent TP4 workers.
-# MNNVL KV transfer + cumem. --block-size 128 mandatory (MSA cache).
-
-model:
-  path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
-  container: "vllm/vllm-openai:minimax-m3"
-  precision: "fp8"
-dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
-sbatch_directives:
-  mem: "0"
-  cpus-per-task: "72"
-srun_options:
-  mem: "0"
-slurm:
-  time_limit: "8:00:00"
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-extra_mount:
-  - "__M3_HF_HOME__:__M3_HF_HOME__"
-
-resources:
-  gpu_type: "gb300"
-  gpus_per_node: 4
-  prefill_nodes: 1
-  decode_nodes: 4
-  prefill_workers: 1
-  decode_workers: 4
-  gpus_per_prefill: 4
-  gpus_per_decode: 4
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  prefill_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    HF_HOME: "__M3_HF_HOME__"
-    HF_HUB_OFFLINE: "1"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-
-  decode_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    HF_HOME: "__M3_HF_HOME__"
-    HF_HUB_OFFLINE: "1"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-
-  vllm_config:
-    prefill:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
-      enable-cumem-allocator: true
-      enforce-eager: true
-      max-num-seqs: 16
-      max-num-batched-tokens: 16384
-      max-model-len: 9472
-      block-size: 128
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 32
-
-    decode:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
-      enable-cumem-allocator: true
-      max-num-seqs: 256
-      max-num-batched-tokens: 256
-      max-cudagraph-capture-size: 512
-      max-model-len: 9472
-      block-size: 128
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 32
-
-benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "32x64x128x256"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml
deleted file mode 100644
index 7cd84fbf1..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml
+++ /dev/null
@@ -1,102 +0,0 @@
-name: "minimax-m3-vllm-disagg-gb300-5p12d-tp4ep1-8k1k"
-
-# MiniMax-M3 GB300 max-throughput disagg, rack-saturating (17 nodes / 68 GPU).
-# Lessons baked in (validated on gb300-cw): (1) TP-only decode (ep1, NO expert
-# parallelism) per Qwen3.5-397B-A17B recipes — M3's MoE all-to-all overhead
-# made wide-EP/DP-attention slower; (2) PREFILL is the bottleneck, not decode
-# or KV transfer — a single prefill starved 64 decode GPUs (3967 reqs queued),
-# so this balances 5 prefill : 12 decode TP4 workers (prefill backlog -> 0,
-# +57% tok/s/gpu vs the 1-prefill ratio at conc 2048). NixlConnector KV
-# transfer stays on multi-node NVLink (MNNVL + cumem). --block-size 128
-# mandatory (MSA cache); text-only -> language-model-only.
-
-model:
-  path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
-  container: "vllm/vllm-openai:minimax-m3"
-  precision: "fp8"
-dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
-sbatch_directives:
-  mem: "0"
-  cpus-per-task: "72"
-srun_options:
-  mem: "0"
-slurm:
-  time_limit: "8:00:00"
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-extra_mount:
-  - "__M3_HF_HOME__:__M3_HF_HOME__"
-
-resources:
-  gpu_type: "gb300"
-  gpus_per_node: 4
-  prefill_nodes: 5
-  decode_nodes: 12
-  prefill_workers: 5
-  decode_workers: 12
-  gpus_per_prefill: 4
-  gpus_per_decode: 4
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  prefill_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    HF_HOME: "__M3_HF_HOME__"
-    HF_HUB_OFFLINE: "1"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-
-  decode_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    HF_HOME: "__M3_HF_HOME__"
-    HF_HUB_OFFLINE: "1"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-
-  vllm_config:
-    prefill:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
-      enable-cumem-allocator: true
-      enforce-eager: true
-      max-num-seqs: 16
-      max-num-batched-tokens: 16384
-      max-model-len: 9472
-      block-size: 128
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 32
-
-    decode:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
-      enable-cumem-allocator: true
-      max-num-seqs: 256
-      max-num-batched-tokens: 256
-      max-cudagraph-capture-size: 512
-      max-model-len: 9472
-      block-size: 128
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 32
-
-benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "1024x2048x4096"