From bbdd923d53275ffd09b195baa7d2113da8fb521c Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Fri, 12 Jun 2026 17:59:43 -0700
Subject: [PATCH 1/8] feat: MiniMax-M3 MXFP8 full sweep config for GB200

Add minimaxm3-fp8-gb200-dynamo-vllm to nvidia-master.yaml with 6
topologies covering the full concurrency range:
- TP4/TP8 (low latency, conc 4-64)
- TP4+EP4 agg + 1P+1D disagg (mid curve, conc 64-512)
- DEP4/DEP8 (high throughput, conc 256-2048)

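Recipe YAMLs are included under minimax-m3-gb200-fp8/{1k1k,8k1k}/ (8k1k
covers TP4, TP4+EP4, 1P+1D disagg and DEP8 only); nvidia-master.yaml
wires up the 1k1k scenario in this patch.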
---
 .github/configs/nvidia-master.yaml            | 111 ++++++++++++++++++
 .../workflows/benchmark-multinode-tmpl.yml    |   5 +
 .../1k1k/agg-gb200-dep4-1n.yaml               |  74 ++++++++++++
 .../1k1k/agg-gb200-dep8-2n.yaml               |  74 ++++++++++++
 .../1k1k/agg-gb200-tp4-1n.yaml                |  71 +++++++++++
 .../1k1k/agg-gb200-tp4ep4-1n.yaml             |  72 ++++++++++++
 .../1k1k/agg-gb200-tp8-2n.yaml                |  71 +++++++++++
 .../1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml     |  89 ++++++++++++++
 .../8k1k/agg-gb200-dep8-2n.yaml               |  74 ++++++++++++
 .../8k1k/agg-gb200-tp4-1n.yaml                |  71 +++++++++++
 .../8k1k/agg-gb200-tp4ep4-1n.yaml             |  72 ++++++++++++
 .../8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml     |  89 ++++++++++++++
 perf-changelog.yaml                           |  12 ++
 runners/launch_gb200-nv.sh                    |  50 ++++++--
 14 files changed, 927 insertions(+), 8 deletions(-)
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 187824347..e68adb5f4 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -11679,6 +11679,117 @@ minimaxm2.5-fp8-gb300-dynamo-vllm:
           ep: 4
           dp-attn: true
 
+# MiniMax-M3 (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
+# 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
+# (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX
+# tensor cores on Blackwell. M3 has not shipped in a stable vLLM release;
+# vllm/vllm-openai:minimax-m3 is the dedicated multi-arch (arm64+amd64)
+# image built from the m3_release branch (vllm-project/vllm#45381).
+# GB200 full sweep: TP4/TP8 (low conc), TEP (mid), disagg (mid), DEP (high).
+minimaxm3-fp8-gb200-dynamo-vllm:
+  image: vllm/vllm-openai:minimax-m3
+  model: MiniMaxAI/MiniMax-M3-MXFP8
+  model-prefix: minimaxm3
+  runner: gb200
+  precision: fp8
+  framework: dynamo-vllm
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # Low latency: TP=4 aggregated, 1 node (4 GPU).
+      - conc-list: [4, 8, 16, 32, 64]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml"
+        decode:
+          num-worker: 0
+          tp: 4
+          ep: 1
+          dp-attn: false
+
+      # Low latency: TP=8 aggregated, 2 nodes (8 GPU).
+      - conc-list: [4, 8, 16, 32, 64]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml"
+        decode:
+          num-worker: 0
+          tp: 8
+          ep: 1
+          dp-attn: false
+
+      # Mid curve: TP4+EP4 aggregated, 1 node (4 GPU).
+      - conc-list: [128, 256, 512]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml"
+        decode:
+          num-worker: 0
+          tp: 4
+          ep: 4
+          dp-attn: false
+
+      # Mid curve: 1P+1D disagg TP4+EP4, 2 nodes (4 GPU each).
+      - conc-list: [64, 128, 256, 512]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: false
+
+      # High throughput: DEP=4 aggregated, 1 node (4 GPU).
+      - conc-list: [256, 512, 1024]
+        prefill:
+          num-worker: 1
+          tp: 1
+          ep: 4
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml"
+        decode:
+          num-worker: 0
+          tp: 1
+          ep: 4
+          dp-attn: true
+
+      # Max throughput: DEP=8 aggregated, 2 nodes (8 GPU).
+      - conc-list: [512, 1024, 2048]
+        prefill:
+          num-worker: 1
+          tp: 1
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml"
+        decode:
+          num-worker: 0
+          tp: 1
+          ep: 8
+          dp-attn: true
+
 # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
 # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
 # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor
diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml
index 81727ef39..85b399e6c 100644
--- a/.github/workflows/benchmark-multinode-tmpl.yml
+++ b/.github/workflows/benchmark-multinode-tmpl.yml
@@ -123,6 +123,11 @@ on:
 
 env:
   RANDOM_RANGE_RATIO: 0.8
+  # Day-zero models referenced by hf: ids are downloaded from the Hub inside
+  # the slurm job (srtctl pre-download + dynamo hub fetch). Anonymous requests
+  # get rate-limited (HTTP 429) when several workers pull a 444 GB snapshot
+  # at once; sbatch/srun inherit this env so the token reaches the workers.
+  HF_TOKEN: ${{ secrets.INFERENCEX_OFFICIAL_RO_HF_TOKEN }}
   EXP_NAME: ${{ inputs.exp-name }}
   IMAGE: ${{ inputs.image }}
   MODEL_PREFIX: ${{ inputs.model-prefix }}
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml
new file mode 100644
index 000000000..a95d2df41
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml
@@ -0,0 +1,74 @@
+name: "minimax-m3-vllm-agg-gb200-dep4-1n-1k1k"
+
+# MiniMax-M3 day-zero aggregated recipe
+# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated shape only:
+# NixlConnector KV transfer against M3's MSA index cache is exercised by
+# the sibling disagg-gb200-1p1d-tp4ep4-2n.yaml recipe, not by this one.
+# model.path uses a staged-model alias — srtctl resolves it via
+# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
+# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
+# benchmark is text-only, so language-model-only skips the vision
+# encoder. max-model-len / cudagraph capture / batched tokens are
+# trimmed to the fixed-seq-len scenario instead of the 1M default.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+# 444 GB of weights off shared FS (cold HF cache on the first run):
+# allow up to 2 h for engine readiness.
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  agg_nodes: 1
+  agg_workers: 1
+  gpus_per_agg: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  aggregated_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  vllm_config:
+    aggregated:
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 4
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 2304
+      max-num-batched-tokens: 2048
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 128
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "256x512x1024"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml
new file mode 100644
index 000000000..ab231e733
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml
@@ -0,0 +1,74 @@
+name: "minimax-m3-vllm-agg-gb200-dep8-2n-1k1k"
+
+# MiniMax-M3 day-zero aggregated recipe
+# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated shape only:
+# NixlConnector KV transfer against M3's MSA index cache is exercised by
+# the sibling disagg-gb200-1p1d-tp4ep4-2n.yaml recipe, not by this one.
+# model.path uses a staged-model alias — srtctl resolves it via
+# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
+# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
+# benchmark is text-only, so language-model-only skips the vision
+# encoder. max-model-len / cudagraph capture / batched tokens are
+# trimmed to the fixed-seq-len scenario instead of the 1M default.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+# 444 GB of weights off shared FS (cold HF cache on the first run):
+# allow up to 2 h for engine readiness.
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  agg_nodes: 2
+  agg_workers: 1
+  gpus_per_agg: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  aggregated_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  vllm_config:
+    aggregated:
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 2304
+      max-num-batched-tokens: 2048
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 128
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "512x1024x2048"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml
new file mode 100644
index 000000000..ce431c3c0
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml
@@ -0,0 +1,71 @@
+name: "minimax-m3-vllm-agg-gb200-tp4-1n-1k1k"
+
+# MiniMax-M3 day-zero aggregated recipe
+# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated shape only:
+# NixlConnector KV transfer against M3's MSA index cache is exercised by
+# the sibling disagg-gb200-1p1d-tp4ep4-2n.yaml recipe, not by this one.
+# model.path uses a staged-model alias — srtctl resolves it via
+# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
+# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
+# benchmark is text-only, so language-model-only skips the vision
+# encoder. max-model-len / cudagraph capture / batched tokens are
+# trimmed to the fixed-seq-len scenario instead of the 1M default.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+# 444 GB of weights off shared FS (cold HF cache on the first run):
+# allow up to 2 h for engine readiness.
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  agg_nodes: 1
+  agg_workers: 1
+  gpus_per_agg: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  aggregated_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  vllm_config:
+    aggregated:
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      max-model-len: 2304
+      max-num-batched-tokens: 2048
+      max-cudagraph-capture-size: 64
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4x8x16x32x64"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml
new file mode 100644
index 000000000..29efa7ecc
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml
@@ -0,0 +1,72 @@
+name: "minimax-m3-vllm-agg-gb200-tp4ep4-1n-1k1k"
+
+# MiniMax-M3 day-zero aggregated recipe
+# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated shape only:
+# NixlConnector KV transfer against M3's MSA index cache is exercised by
+# the sibling disagg-gb200-1p1d-tp4ep4-2n.yaml recipe, not by this one.
+# model.path uses a staged-model alias — srtctl resolves it via
+# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
+# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
+# benchmark is text-only, so language-model-only skips the vision
+# encoder. max-model-len / cudagraph capture / batched tokens are
+# trimmed to the fixed-seq-len scenario instead of the 1M default.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+# 444 GB of weights off shared FS (cold HF cache on the first run):
+# allow up to 2 h for engine readiness.
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  agg_nodes: 1
+  agg_workers: 1
+  gpus_per_agg: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  aggregated_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  vllm_config:
+    aggregated:
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      max-model-len: 2304
+      max-num-batched-tokens: 2048
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "128x256x512"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml
new file mode 100644
index 000000000..29a5934bd
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml
@@ -0,0 +1,71 @@
+name: "minimax-m3-vllm-agg-gb200-tp8-2n-1k1k"
+
+# MiniMax-M3 day-zero aggregated recipe
+# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated shape only:
+# NixlConnector KV transfer against M3's MSA index cache is exercised by
+# the sibling disagg-gb200-1p1d-tp4ep4-2n.yaml recipe, not by this one.
+# model.path uses a staged-model alias — srtctl resolves it via
+# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
+# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
+# benchmark is text-only, so language-model-only skips the vision
+# encoder. max-model-len / cudagraph capture / batched tokens are
+# trimmed to the fixed-seq-len scenario instead of the 1M default.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+# 444 GB of weights off shared FS (cold HF cache on the first run):
+# allow up to 2 h for engine readiness.
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  agg_nodes: 2
+  agg_workers: 1
+  gpus_per_agg: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  aggregated_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  vllm_config:
+    aggregated:
+      tensor-parallel-size: 8
+      pipeline-parallel-size: 1
+      max-model-len: 2304
+      max-num-batched-tokens: 2048
+      max-cudagraph-capture-size: 64
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4x8x16x32x64"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
new file mode 100644
index 000000000..17769abf3
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
@@ -0,0 +1,89 @@
+name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-1k1k"
+
+# MiniMax-M3 disaggregated 1P+1D recipe for GB200, 1k1k.
+# Prefill (TP4+EP4, 1 node) → NixlConnector → Decode (TP4+EP4, 1 node).
+# --block-size 128 is mandatory (MSA sparse/index cache alignment).
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 2304
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      max-model-len: 2304
+      max-num-seqs: 256
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "64x128x256x512"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml
new file mode 100644
index 000000000..db729764a
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml
@@ -0,0 +1,74 @@
+name: "minimax-m3-vllm-agg-gb200-dep8-2n-8k1k"
+
+# MiniMax-M3 day-zero aggregated recipe
+# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated shape only:
+# NixlConnector KV transfer against M3's MSA index cache is exercised by
+# the sibling disagg-gb200-1p1d-tp4ep4-2n.yaml recipe, not by this one.
+# model.path uses a staged-model alias — srtctl resolves it via
+# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
+# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
+# benchmark is text-only, so language-model-only skips the vision
+# encoder. max-model-len / cudagraph capture / batched tokens are
+# trimmed to the fixed-seq-len scenario instead of the 1M default.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+# 444 GB of weights off shared FS (cold HF cache on the first run):
+# allow up to 2 h for engine readiness.
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  agg_nodes: 2
+  agg_workers: 1
+  gpus_per_agg: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  aggregated_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  vllm_config:
+    aggregated:
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 9472
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 128
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "256x512"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml
new file mode 100644
index 000000000..8c7ecbe17
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml
@@ -0,0 +1,71 @@
+name: "minimax-m3-vllm-agg-gb200-tp4-1n-8k1k"
+
+# MiniMax-M3 day-zero aggregated recipe
+# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated shape only:
+# NixlConnector KV transfer against M3's MSA index cache is exercised by
+# the sibling disagg-gb200-1p1d-tp4ep4-2n.yaml recipe, not by this one.
+# model.path uses a staged-model alias — srtctl resolves it via
+# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
+# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
+# benchmark is text-only, so language-model-only skips the vision
+# encoder. max-model-len / cudagraph capture / batched tokens are
+# trimmed to the fixed-seq-len scenario instead of the 1M default.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+# 444 GB of weights off shared FS (cold HF cache on the first run):
+# allow up to 2 h for engine readiness.
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  agg_nodes: 1
+  agg_workers: 1
+  gpus_per_agg: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  aggregated_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  vllm_config:
+    aggregated:
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      max-model-len: 9472
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 64
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "4x8x16x32x64"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml
new file mode 100644
index 000000000..3e146af8b
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml
@@ -0,0 +1,72 @@
+name: "minimax-m3-vllm-agg-gb200-tp4ep4-1n-8k1k"
+
+# MiniMax-M3 day-zero aggregated recipe
+# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated shape only:
+# NixlConnector KV transfer against M3's MSA index cache is exercised by
+# the sibling disagg-gb200-1p1d-tp4ep4-2n.yaml recipe, not by this one.
+# model.path uses a staged-model alias — srtctl resolves it via
+# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
+# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
+# benchmark is text-only, so language-model-only skips the vision
+# encoder. max-model-len / cudagraph capture / batched tokens are
+# trimmed to the fixed-seq-len scenario instead of the 1M default.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+# 444 GB of weights off shared FS (cold HF cache on the first run):
+# allow up to 2 h for engine readiness.
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  agg_nodes: 1
+  agg_workers: 1
+  gpus_per_agg: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  aggregated_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  vllm_config:
+    aggregated:
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      max-model-len: 9472
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 256
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "128x256"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
new file mode 100644
index 000000000..54980f7d3
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
@@ -0,0 +1,89 @@
+name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-8k1k"
+
+# MiniMax-M3 disaggregated 1P+1D recipe for GB200, 8k1k.
+# Prefill (TP4+EP4, 1 node) → NixlConnector → Decode (TP4+EP4, 1 node).
+# --block-size 128 is mandatory (MSA sparse/index cache alignment).
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 9472
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      max-model-len: 9472
+      max-num-seqs: 256
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "64x128x256"
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index d29c9a5d3..647121c12 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3646,3 +3646,15 @@
     - "Layouts: TP8 and TP4 (latency), TP4+EP4 / TP8+EP8 (TEP throughput), tp2-ep2, TP8+EP8 dp-attn (DEP) across 1k1k and 8k1k"
     - "Serves from the launch_b300-nv.sh MODEL/MODEL_PATH split (model not in the SRE-staged /scratch/models list -> writable /data/models)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1724
+
+- config-keys:
+    - minimaxm3-fp8-gb200-dynamo-vllm
+  description:
+    - "Initial submission: MiniMax-M3 MXFP8 day-zero vLLM sweep for GB200 via Dynamo"
+    - "Model: MiniMaxAI/MiniMax-M3-MXFP8 (427B total / 26B active MoE, MSA sparse attention, ~444 GB MXFP8 checkpoint)"
+    - "Image: vllm/vllm-openai:minimax-m3 (multi-arch arm64+amd64 from m3_release branch, vllm-project/vllm#45381)"
+    - "Dynamo orchestration with NixlConnector for disaggregated prefill/decode"
+    - "6 topologies: TP4 (1n), TP8 (2n), TP4+EP4 (1n), 1P+1D disagg TP4+EP4 (2n), DEP4 (1n), DEP8 (2n)"
+    - "Concurrency sweep: TP 4-64, TEP 128-512, disagg 64-512, DEP4 256-1024, DEP8 512-2048"
+    - "--block-size 128 mandatory everywhere (MSA sparse/index cache alignment); --language-model-only for text-only benchmarks"
+  pr-link: TBD
diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
index 36c8af203..9c3430289 100755
--- a/runners/launch_gb200-nv.sh
+++ b/runners/launch_gb200-nv.sh
@@ -60,8 +60,11 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then
     elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp8" ]]; then
         export MODEL_PATH="/mnt/lustre01/models/MiniMax-M2.5"
         export SRT_SLURM_MODEL_PREFIX="minimax-m2.5-fp8"
+    elif [[ $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp8" ]]; then
+        export MODEL_PATH="/mnt/lustre01/models/MiniMax-M3-MXFP8"
+        export SRT_SLURM_MODEL_PREFIX="minimax-m3-mxfp8"
     else
-        echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4, minimaxm2.5/fp4, minimaxm2.5/fp8"
+        echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4, minimaxm2.5/fp4, minimaxm2.5/fp8, minimaxm3/fp8"
         exit 1
     fi
 else
@@ -81,7 +84,7 @@ NGINX_IMAGE="nginx:1.27.4"
 # squash dir on a path that's also visible to compute nodes. Falls
 # back to the legacy sa-shared path so other configs are untouched.
 SQUASH_DIR="/mnt/lustre01/users-public/sa-shared"
-if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then
+if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "minimaxm3" ]]; then
     echo "=== cluster diagnostic (minimax sweep) ==="
     echo "USER=$(id -un) UID=$(id -u) GID=$(id -g) GROUPS=$(id -Gn)"
     echo "HOME=$HOME"
@@ -128,8 +131,32 @@ fi
 SQUASH_FILE="${SQUASH_DIR}/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
 NGINX_SQUASH_FILE="${SQUASH_DIR}/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
 
-enroot import -o $SQUASH_FILE docker://$IMAGE
-enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE
+# Concurrent matrix jobs (three gb200-nv runners) all import to the same
+# shared-FS squash path. An unsynchronized `enroot import -o` onto an
+# existing file APPENDS to it (mksquashfs default), corrupting the image
+# while other jobs' pyxis extractions are reading it — observed on the
+# minimaxm3 day-zero sweep (R1: an eval job appended to the live squash
+# mid-run). Serialize with a lock, skip when the existing file is valid,
+# and build to a temp path + atomic mv so readers never see a half-written
+# file. Mirrors the import_squash pattern in launch_gb300-nv.sh.
+import_squash() {  # usage: import_squash <squash-path> <image>; exits the script on failure
+    local squash="$1" image="$2"
+    local lock="${squash}.lock"
+    (
+        exec 9>"$lock"  # fd 9 stays open (and the lock held) until this subshell exits
+        flock -w 1800 9 || { echo "Failed to acquire lock for $squash" >&2; exit 1; }
+        if unsquashfs -l "$squash" > /dev/null 2>&1; then
+            echo "Squash file already exists and is valid, skipping import: $squash"
+        else
+            rm -f "$squash" "$squash".tmp.*
+            enroot import -o "${squash}.tmp.$$" "docker://$image" || { echo "enroot import failed for $image" >&2; rm -f "${squash}.tmp.$$"; exit 1; }  # never publish a partial image
+            mv -f "${squash}.tmp.$$" "$squash"
+        fi
+    ) || exit 1
+}
+
+import_squash "$SQUASH_FILE" "$IMAGE"
+import_squash "$NGINX_SQUASH_FILE" "$NGINX_IMAGE"
 
 export EVAL_ONLY="${EVAL_ONLY:-false}"
 
@@ -202,7 +229,7 @@ SRT_REPO_DIR="srt-slurm"
 # cross-mounted to compute nodes. Put the srt-slurm workspace and staged
 # InferenceX checkout on a writable shared-FS path that compute can see.
 # Per-run-unique paths avoid races between parallel sweep jobs.
-if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then
+if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "minimaxm3" ]]; then
     SHARED_BASE=""
     for cand in \
         /mnt/lustre01/users-public/sa-shared/gha-runs \
@@ -269,6 +296,12 @@ elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" ]]; then
         echo "Unsupported minimaxm2.5 precision for GB200 dynamo-vllm: $PRECISION" >&2
         exit 1
     fi
+elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm3" ]]; then
+    git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" || exit 1
+    cd "$SRT_REPO_DIR" || exit 1
+    git checkout main || exit 1
+    mkdir -p recipes/vllm/minimax-m3-gb200-fp8 || exit 1
+    cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8" recipes/vllm/minimax-m3-gb200-fp8 || exit 1
 elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then
     git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
     cd "$SRT_REPO_DIR"
@@ -292,7 +325,7 @@ source $HOME/.local/bin/env
 # under a head-node-only path, .venv/bin/python3 becomes a broken
 # symlink on compute. Pin the venv to /usr/bin/python3 — a system
 # path that exists at the same location on both head and compute.
-if [[ $MODEL_PREFIX == "minimaxm2.5" && -x /usr/bin/python3 ]]; then
+if [[ ( $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "minimaxm3" ) && -x /usr/bin/python3 ]]; then
     uv venv --seed --python /usr/bin/python3
 else
     uv venv --seed
@@ -312,7 +345,7 @@ SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm"
 # Minimax on watchtower: SRT_REPO_DIR was moved to a shared-FS path
 # above so srtctl's outputs/ directory (which lives under
 # SRTCTL_ROOT) is visible to compute nodes.
-if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then
+if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "minimaxm3" ]]; then
     SRTCTL_ROOT="$SRT_REPO_DIR"
 fi
 echo "Creating srtslurm.yaml configuration..."
@@ -354,7 +387,7 @@ export INFMAX_WORKSPACE="$GITHUB_WORKSPACE"
 # can't see. Stage the relevant subset to shared FS and repoint
 # INFMAX_WORKSPACE there. rsync excludes the srt-slurm clone (already
 # on shared FS) and .git (not needed in container) for speed.
-if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then
+if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "minimaxm3" ]]; then
     SHARED_INFMAX_WORKSPACE="${SHARED_BASE}/infmax-workspace-${RUN_KEY}"
     mkdir -p "$SHARED_INFMAX_WORKSPACE" || exit 1
     rsync -a --delete \
@@ -379,6 +412,7 @@ if [[ ! -f "$CONFIG_PATH" ]]; then
 fi
 sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_PATH"
 
+
 if [[ "$FRAMEWORK" == "dynamo-sglang" ]]; then
     SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_PATH" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" --setup-script install-torchao.sh 2>&1)
 else

From dbf5135c0299f26b19ff814519651f17efdc68e8 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Fri, 12 Jun 2026 18:06:32 -0700
Subject: [PATCH 2/8] chore: update perf-changelog pr-link to #1734

---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 647121c12..e1d38dd9f 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3657,4 +3657,4 @@
     - "6 topologies: TP4 (1n), TP8 (2n), TP4+EP4 (1n), 1P+1D disagg TP4+EP4 (2n), DEP4 (1n), DEP8 (2n)"
     - "Concurrency sweep: TP 4-64, TEP 128-512, disagg 64-512, DEP4 256-1024, DEP8 512-2048"
     - "--block-size 128 mandatory everywhere (MSA sparse/index cache alignment); --language-model-only for text-only benchmarks"
-  pr-link: TBD
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734

From ed63c1e042078379d6f555d573528c82e7559623 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Fri, 12 Jun 2026 20:52:36 -0700
Subject: [PATCH 3/8] feat: switch GB200 M3 to ai-dynamo vllm-runtime 1.3.0
 image
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adopt the NVIDIA Dynamo vLLM runtime image
(nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1), the
canonical M3 runtime from ai-dynamo/dynamo
release/1.3.0-minimax-m3-dev.1.

Changes mirrored from that release's
recipes/minimax-m3/vllm/disagg/MXFP8/deploy.yaml:
- dynamo.install: false — the runtime image bundles dynamo 1.3.0, so
  the prior 1.2.0 wheel install is dropped (srtctl defaults install=true)
- attention-backend: FLASH_ATTN on every prefill/decode/agg engine

Benchmark-specific knobs kept over the reference's serving defaults:
language-model-only (text-only), no-enable-prefix-caching (random data),
scenario-trimmed max-model-len.
---
 .github/configs/nvidia-master.yaml                     | 10 ++++++----
 .../minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml   |  6 +++---
 .../minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml   |  6 +++---
 .../minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml    |  6 +++---
 .../minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml |  6 +++---
 .../minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml    |  6 +++---
 .../1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml              |  7 ++++---
 .../minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml   |  6 +++---
 .../minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml    |  6 +++---
 .../minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml |  6 +++---
 .../8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml              |  7 ++++---
 11 files changed, 38 insertions(+), 34 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index c11f6505b..d1926f30f 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -11682,12 +11682,14 @@ minimaxm2.5-fp8-gb300-dynamo-vllm:
 # MiniMax-M3 (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
 # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
 # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX
-# tensor cores on Blackwell. M3 has not shipped in a stable vLLM release;
-# vllm/vllm-openai:minimax-m3 is the dedicated multi-arch (arm64+amd64)
-# image built from the m3_release branch (vllm-project/vllm#45381).
+# tensor cores on Blackwell. Image is the NVIDIA Dynamo vLLM runtime
+# (nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1) with
+# dynamo 1.3.0 pre-installed, so recipes set dynamo.install=false.
+# Engine args mirror the canonical recipe (ai-dynamo/dynamo
+# recipes/minimax-m3/vllm/disagg/MXFP8): block-size 128 + FLASH_ATTN.
 # GB200 full sweep: TP/TP8 (low conc), TEP (mid), disagg (mid), DEP (high).
 minimaxm3-fp8-gb200-dynamo-vllm:
-  image: vllm/vllm-openai:minimax-m3
+  image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1
   model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: gb200
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml
index a95d2df41..3b328ea28 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml
@@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-dep4-1n-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
+  install: false
 
 slurm:
   time_limit: "8:00:00"
@@ -60,6 +59,7 @@ backend:
       max-num-batched-tokens: 2048
       max-cudagraph-capture-size: 512
       block-size: 128
+      attention-backend: FLASH_ATTN
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml
index ab231e733..81b000039 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml
@@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-dep8-2n-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
+  install: false
 
 slurm:
   time_limit: "8:00:00"
@@ -60,6 +59,7 @@ backend:
       max-num-batched-tokens: 2048
       max-cudagraph-capture-size: 512
       block-size: 128
+      attention-backend: FLASH_ATTN
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml
index ce431c3c0..f7684fe8d 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml
@@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-tp4-1n-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
+  install: false
 
 slurm:
   time_limit: "8:00:00"
@@ -57,6 +56,7 @@ backend:
       max-num-batched-tokens: 2048
       max-cudagraph-capture-size: 64
       block-size: 128
+      attention-backend: FLASH_ATTN
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml
index 29efa7ecc..1fc4a3d98 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml
@@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-tp4ep4-1n-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
+  install: false
 
 slurm:
   time_limit: "8:00:00"
@@ -58,6 +57,7 @@ backend:
       max-num-batched-tokens: 2048
       max-cudagraph-capture-size: 512
       block-size: 128
+      attention-backend: FLASH_ATTN
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml
index 29a5934bd..65e85f441 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml
@@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-tp8-2n-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
+  install: false
 
 slurm:
   time_limit: "8:00:00"
@@ -57,6 +56,7 @@ backend:
       max-num-batched-tokens: 2048
       max-cudagraph-capture-size: 64
       block-size: 128
+      attention-backend: FLASH_ATTN
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
index 17769abf3..90ec1d007 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
@@ -6,12 +6,11 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
+  install: false
 
 slurm:
   time_limit: "8:00:00"
@@ -58,6 +57,7 @@ backend:
       max-num-seqs: 16
       max-num-batched-tokens: 16384
       block-size: 128
+      attention-backend: FLASH_ATTN
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
@@ -75,6 +75,7 @@ backend:
       max-num-batched-tokens: 256
       max-cudagraph-capture-size: 512
       block-size: 128
+      attention-backend: FLASH_ATTN
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml
index db729764a..c3f50da69 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml
@@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-dep8-2n-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
+  install: false
 
 slurm:
   time_limit: "8:00:00"
@@ -60,6 +59,7 @@ backend:
       max-num-batched-tokens: 16384
       max-cudagraph-capture-size: 128
       block-size: 128
+      attention-backend: FLASH_ATTN
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml
index 8c7ecbe17..444f1e1df 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml
@@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-tp4-1n-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
+  install: false
 
 slurm:
   time_limit: "8:00:00"
@@ -57,6 +56,7 @@ backend:
       max-num-batched-tokens: 16384
       max-cudagraph-capture-size: 64
       block-size: 128
+      attention-backend: FLASH_ATTN
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml
index 3e146af8b..ca8ea7e48 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml
@@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-tp4ep4-1n-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
+  install: false
 
 slurm:
   time_limit: "8:00:00"
@@ -58,6 +57,7 @@ backend:
       max-num-batched-tokens: 16384
       max-cudagraph-capture-size: 256
       block-size: 128
+      attention-backend: FLASH_ATTN
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
index 54980f7d3..6a13b50d1 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
@@ -6,12 +6,11 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
+  install: false
 
 slurm:
   time_limit: "8:00:00"
@@ -58,6 +57,7 @@ backend:
       max-num-seqs: 16
       max-num-batched-tokens: 16384
       block-size: 128
+      attention-backend: FLASH_ATTN
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
@@ -75,6 +75,7 @@ backend:
       max-num-batched-tokens: 256
       max-cudagraph-capture-size: 512
       block-size: 128
+      attention-backend: FLASH_ATTN
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"

From 8738f42aabdbffb4235c5b5bc89c359c59ff26c4 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Fri, 12 Jun 2026 21:11:07 -0700
Subject: [PATCH 4/8] fix: use enroot registry syntax (nvcr.io#) for GB200 M3
 image

enroot's docker:// URI needs `#` to separate the registry host from
the image path; `nvcr.io/...` was parsed as a Docker Hub repo and 401'd
against registry-1.docker.io. Matches the existing nvcr.io# convention
in nvidia-master.yaml. Recipe `container` fields are kept byte-identical to
the master `image:` field, because srtslurm.yaml maps "${IMAGE}" to the
squash file.
---
 .github/configs/nvidia-master.yaml                            | 4 ++--
 .../vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml     | 2 +-
 .../vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml     | 2 +-
 .../vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml      | 2 +-
 .../vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml   | 2 +-
 .../vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml      | 2 +-
 .../1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml                     | 2 +-
 .../vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml     | 2 +-
 .../vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml      | 2 +-
 .../vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml   | 2 +-
 .../8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml                     | 2 +-
 11 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 1d24e2857..9e3977232 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -11777,13 +11777,13 @@ minimaxm2.5-fp8-gb300-dynamo-vllm:
 # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
 # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX
 # tensor cores on Blackwell. Image is the NVIDIA Dynamo vLLM runtime
-# (nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1) with
+# (nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1) with
 # dynamo 1.3.0 pre-installed, so recipes set dynamo.install=false.
 # Engine args mirror the canonical recipe (ai-dynamo/dynamo
 # recipes/minimax-m3/vllm/disagg/MXFP8): block-size 128 + FLASH_ATTN.
 # GB200 full sweep: TP/TP8 (low conc), TEP (mid), disagg (mid), DEP (high).
 minimaxm3-fp8-gb200-dynamo-vllm:
-  image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1
+  image: nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1
   model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: gb200
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml
index 3b328ea28..921f99b8e 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml
@@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-dep4-1n-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml
index 81b000039..50eb3ff64 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml
@@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-dep8-2n-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml
index f7684fe8d..6115d210c 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml
@@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-tp4-1n-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml
index 1fc4a3d98..94df4c8ec 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml
@@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-tp4ep4-1n-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml
index 65e85f441..1ac2612bd 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml
@@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-tp8-2n-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
index 90ec1d007..4f9c01c6b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
@@ -6,7 +6,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml
index c3f50da69..adb36f646 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml
@@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-dep8-2n-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml
index 444f1e1df..8cfbcb616 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml
@@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-tp4-1n-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml
index ca8ea7e48..1567ca57c 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml
@@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-tp4ep4-1n-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
index 6a13b50d1..86d48468a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
@@ -6,7 +6,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:

From 3415fb4e6a815393fd6c8ba12210bc9cd2f5074d Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Sat, 13 Jun 2026 13:44:48 -0700
Subject: [PATCH 5/8] feat: convert MiniMax-M3 GB200 sweep to fully
 disaggregated inference

Replace the mostly-aggregated GB200 sweep (5 agg + 1 disagg) with a fully
disaggregated sweep that splits prefill/decode over NixlConnector, mirroring
the minimaxm2.5-fp8-gb200 reference. Every worker = one 4-GPU node since the
444 GB MXFP8 checkpoint can't fit in fewer GPUs.

Topologies (1k1k): 1P1D TP4 (low-lat), 1P1D TP4+EP4 (mid), 1P2D TP4+EP4
(decode-scaled), 2P1D TP4+EP4 (prefill-scaled), 1P1D DEP4 (max-tput),
spanning conc 4-2048.

- add 4 disagg recipes; remove 8 orphaned agg recipes (1k1k + 8k1k)
- rewire nvidia-master.yaml search-space to the 5 disagg entries
- perf-changelog: describe disagg sweep; fix stale Image line
  (vllm/vllm-openai:minimax-m3 -> nvcr.io#.../vllm-runtime:1.3.0-minimax-m3-dev.1)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml            | 66 ++++++-------
 .../1k1k/agg-gb200-dep4-1n.yaml               | 74 --------------
 .../1k1k/agg-gb200-dep8-2n.yaml               | 74 --------------
 .../1k1k/agg-gb200-tp4-1n.yaml                | 71 --------------
 .../1k1k/agg-gb200-tp4ep4-1n.yaml             | 72 --------------
 .../1k1k/agg-gb200-tp8-2n.yaml                | 71 --------------
 .../1k1k/disagg-gb200-1p1d-dep4-2n.yaml       | 96 +++++++++++++++++++
 .../1k1k/disagg-gb200-1p1d-tp4-2n.yaml        | 89 +++++++++++++++++
 .../1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml     | 92 ++++++++++++++++++
 .../1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml     | 92 ++++++++++++++++++
 .../8k1k/agg-gb200-dep8-2n.yaml               | 74 --------------
 .../8k1k/agg-gb200-tp4-1n.yaml                | 71 --------------
 .../8k1k/agg-gb200-tp4ep4-1n.yaml             | 72 --------------
 perf-changelog.yaml                           | 10 +-
 14 files changed, 401 insertions(+), 623 deletions(-)
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 9e3977232..15aee30c5 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -11781,7 +11781,10 @@ minimaxm2.5-fp8-gb300-dynamo-vllm:
 # dynamo 1.3.0 pre-installed, so recipes set dynamo.install=false.
 # Engine args mirror the canonical recipe (ai-dynamo/dynamo
 # recipes/minimax-m3/vllm/disagg/MXFP8): block-size 128 + FLASH_ATTN.
-# GB200 full sweep: TP/TP8 (low conc), TEP (mid), disagg (mid), DEP (high).
+# Fully disaggregated GB200 sweep (NixlConnector prefill/decode split,
+# every worker = one 4-GPU node since the 444 GB checkpoint can't fit in
+# fewer GPUs): 1P1D TP4 (low conc), 1P1D TP4+EP4 (mid), 1P2D / 2P1D
+# TP4+EP4 (decode- and prefill-scaled), 1P1D DEP4 (max throughput).
 minimaxm3-fp8-gb200-dynamo-vllm:
   image: nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1
   model: MiniMaxAI/MiniMax-M3-MXFP8
@@ -11796,7 +11799,7 @@ minimaxm3-fp8-gb200-dynamo-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-      # Low latency: TP=4 aggregated, 1 node (4 GPU).
+      # Low latency: 1P+1D disagg TP4 (pure TP, no EP), 2 nodes (4 GPU each).
       - conc-list: [4, 8, 16, 32, 64]
         prefill:
           num-worker: 1
@@ -11804,86 +11807,71 @@ minimaxm3-fp8-gb200-dynamo-vllm:
           ep: 1
           dp-attn: false
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml"
         decode:
-          num-worker: 0
+          num-worker: 1
           tp: 4
           ep: 1
           dp-attn: false
 
-      # Low latency: TP=8 aggregated, 2 nodes (8 GPU).
-      - conc-list: [4, 8, 16, 32, 64]
+      # Mid curve: 1P+1D disagg TP4+EP4, 2 nodes (4 GPU each).
+      - conc-list: [64, 128, 256, 512]
         prefill:
           num-worker: 1
-          tp: 8
-          ep: 1
+          tp: 4
+          ep: 4
           dp-attn: false
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml"
         decode:
-          num-worker: 0
-          tp: 8
-          ep: 1
+          num-worker: 1
+          tp: 4
+          ep: 4
           dp-attn: false
 
-      # Mid curve: TP4+EP4 aggregated, 1 node (4 GPU).
-      - conc-list: [128, 256, 512]
+      # Decode-scaled: 1P+2D disagg TP4+EP4, 3 nodes (4 GPU each).
+      - conc-list: [256, 512, 1024]
         prefill:
           num-worker: 1
           tp: 4
           ep: 4
           dp-attn: false
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml"
         decode:
-          num-worker: 0
+          num-worker: 2
           tp: 4
           ep: 4
           dp-attn: false
 
-      # Mid curve: 1P+1D disagg TP4+EP4, 2 nodes (4 GPU each).
-      - conc-list: [64, 128, 256, 512]
+      # Prefill-scaled: 2P+1D disagg TP4+EP4, 3 nodes (4 GPU each).
+      - conc-list: [256, 512, 1024]
         prefill:
-          num-worker: 1
+          num-worker: 2
           tp: 4
           ep: 4
           dp-attn: false
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml"
         decode:
           num-worker: 1
           tp: 4
           ep: 4
           dp-attn: false
 
-      # High throughput: DEP=4 aggregated, 1 node (4 GPU).
-      - conc-list: [256, 512, 1024]
+      # Max throughput: 1P+1D disagg DEP4 (DP4+EP), 2 nodes (4 GPU each).
+      - conc-list: [512, 1024, 2048]
         prefill:
           num-worker: 1
           tp: 1
           ep: 4
           dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml"
         decode:
-          num-worker: 0
-          tp: 1
-          ep: 4
-          dp-attn: true
-
-      # Max throughput: DEP=8 aggregated, 2 nodes (8 GPU).
-      - conc-list: [512, 1024, 2048]
-        prefill:
           num-worker: 1
           tp: 1
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml"
-        decode:
-          num-worker: 0
-          tp: 1
-          ep: 8
+          ep: 4
           dp-attn: true
 
 # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml
deleted file mode 100644
index 921f99b8e..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml
+++ /dev/null
@@ -1,74 +0,0 @@
-name: "minimax-m3-vllm-agg-gb200-dep4-1n-1k1k"
-
-# MiniMax-M3 day-zero aggregated recipe
-# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on
-# day zero: NixlConnector KV transfer has never been exercised against
-# M3's MSA index cache, so disagg shapes are deferred until agg
-# baselines exist. model.path uses a staged-model alias — srtctl resolves it via
-# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
-# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
-# benchmark is text-only, so language-model-only skips the vision
-# encoder. max-model-len / cudagraph capture / batched tokens are
-# trimmed to the fixed-seq-len scenario instead of the 1M default.
-
-model:
-  path: "minimax-m3-mxfp8"
-  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
-  precision: "fp8"
-
-dynamo:
-  install: false
-
-slurm:
-  time_limit: "8:00:00"
-
-# 444 GB of weights off shared FS (cold HF cache on the first run):
-# allow up to 2 h for engine readiness.
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  agg_nodes: 1
-  agg_workers: 1
-  gpus_per_agg: 4
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  aggregated_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-
-  vllm_config:
-    aggregated:
-      tensor-parallel-size: 1
-      pipeline-parallel-size: 1
-      data-parallel-size: 4
-      data-parallel-rpc-port: 13345
-      enable-expert-parallel: true
-      max-model-len: 2304
-      max-num-batched-tokens: 2048
-      max-cudagraph-capture-size: 512
-      block-size: 128
-      attention-backend: FLASH_ATTN
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 128
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "256x512x1024"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml
deleted file mode 100644
index 50eb3ff64..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml
+++ /dev/null
@@ -1,74 +0,0 @@
-name: "minimax-m3-vllm-agg-gb200-dep8-2n-1k1k"
-
-# MiniMax-M3 day-zero aggregated recipe
-# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on
-# day zero: NixlConnector KV transfer has never been exercised against
-# M3's MSA index cache, so disagg shapes are deferred until agg
-# baselines exist. model.path uses a staged-model alias — srtctl resolves it via
-# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
-# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
-# benchmark is text-only, so language-model-only skips the vision
-# encoder. max-model-len / cudagraph capture / batched tokens are
-# trimmed to the fixed-seq-len scenario instead of the 1M default.
-
-model:
-  path: "minimax-m3-mxfp8"
-  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
-  precision: "fp8"
-
-dynamo:
-  install: false
-
-slurm:
-  time_limit: "8:00:00"
-
-# 444 GB of weights off shared FS (cold HF cache on the first run):
-# allow up to 2 h for engine readiness.
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  agg_nodes: 2
-  agg_workers: 1
-  gpus_per_agg: 8
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  aggregated_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-
-  vllm_config:
-    aggregated:
-      tensor-parallel-size: 1
-      pipeline-parallel-size: 1
-      data-parallel-size: 8
-      data-parallel-rpc-port: 13345
-      enable-expert-parallel: true
-      max-model-len: 2304
-      max-num-batched-tokens: 2048
-      max-cudagraph-capture-size: 512
-      block-size: 128
-      attention-backend: FLASH_ATTN
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 128
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "512x1024x2048"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml
deleted file mode 100644
index 6115d210c..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml
+++ /dev/null
@@ -1,71 +0,0 @@
-name: "minimax-m3-vllm-agg-gb200-tp4-1n-1k1k"
-
-# MiniMax-M3 day-zero aggregated recipe
-# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on
-# day zero: NixlConnector KV transfer has never been exercised against
-# M3's MSA index cache, so disagg shapes are deferred until agg
-# baselines exist. model.path uses a staged-model alias — srtctl resolves it via
-# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
-# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
-# benchmark is text-only, so language-model-only skips the vision
-# encoder. max-model-len / cudagraph capture / batched tokens are
-# trimmed to the fixed-seq-len scenario instead of the 1M default.
-
-model:
-  path: "minimax-m3-mxfp8"
-  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
-  precision: "fp8"
-
-dynamo:
-  install: false
-
-slurm:
-  time_limit: "8:00:00"
-
-# 444 GB of weights off shared FS (cold HF cache on the first run):
-# allow up to 2 h for engine readiness.
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  agg_nodes: 1
-  agg_workers: 1
-  gpus_per_agg: 4
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  aggregated_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-
-  vllm_config:
-    aggregated:
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
-      max-model-len: 2304
-      max-num-batched-tokens: 2048
-      max-cudagraph-capture-size: 64
-      block-size: 128
-      attention-backend: FLASH_ATTN
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 32
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4x8x16x32x64"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml
deleted file mode 100644
index 94df4c8ec..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml
+++ /dev/null
@@ -1,72 +0,0 @@
-name: "minimax-m3-vllm-agg-gb200-tp4ep4-1n-1k1k"
-
-# MiniMax-M3 day-zero aggregated recipe
-# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on
-# day zero: NixlConnector KV transfer has never been exercised against
-# M3's MSA index cache, so disagg shapes are deferred until agg
-# baselines exist. model.path uses a staged-model alias — srtctl resolves it via
-# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
-# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
-# benchmark is text-only, so language-model-only skips the vision
-# encoder. max-model-len / cudagraph capture / batched tokens are
-# trimmed to the fixed-seq-len scenario instead of the 1M default.
-
-model:
-  path: "minimax-m3-mxfp8"
-  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
-  precision: "fp8"
-
-dynamo:
-  install: false
-
-slurm:
-  time_limit: "8:00:00"
-
-# 444 GB of weights off shared FS (cold HF cache on the first run):
-# allow up to 2 h for engine readiness.
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  agg_nodes: 1
-  agg_workers: 1
-  gpus_per_agg: 4
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  aggregated_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-
-  vllm_config:
-    aggregated:
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
-      enable-expert-parallel: true
-      max-model-len: 2304
-      max-num-batched-tokens: 2048
-      max-cudagraph-capture-size: 512
-      block-size: 128
-      attention-backend: FLASH_ATTN
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 32
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "128x256x512"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml
deleted file mode 100644
index 1ac2612bd..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml
+++ /dev/null
@@ -1,71 +0,0 @@
-name: "minimax-m3-vllm-agg-gb200-tp8-2n-1k1k"
-
-# MiniMax-M3 day-zero aggregated recipe
-# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on
-# day zero: NixlConnector KV transfer has never been exercised against
-# M3's MSA index cache, so disagg shapes are deferred until agg
-# baselines exist. model.path uses a staged-model alias — srtctl resolves it via
-# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
-# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
-# benchmark is text-only, so language-model-only skips the vision
-# encoder. max-model-len / cudagraph capture / batched tokens are
-# trimmed to the fixed-seq-len scenario instead of the 1M default.
-
-model:
-  path: "minimax-m3-mxfp8"
-  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
-  precision: "fp8"
-
-dynamo:
-  install: false
-
-slurm:
-  time_limit: "8:00:00"
-
-# 444 GB of weights off shared FS (cold HF cache on the first run):
-# allow up to 2 h for engine readiness.
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  agg_nodes: 2
-  agg_workers: 1
-  gpus_per_agg: 8
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  aggregated_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-
-  vllm_config:
-    aggregated:
-      tensor-parallel-size: 8
-      pipeline-parallel-size: 1
-      max-model-len: 2304
-      max-num-batched-tokens: 2048
-      max-cudagraph-capture-size: 64
-      block-size: 128
-      attention-backend: FLASH_ATTN
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 32
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4x8x16x32x64"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml
new file mode 100644
index 000000000..0749dbc86
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml
@@ -0,0 +1,96 @@
+name: "minimax-m3-vllm-disagg-gb200-1p1d-dep4-1k1k"
+
+# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (max-throughput curve).
+# Prefill (DEP4 = DP4 + expert-parallel, 1 node) → NixlConnector → Decode
+# (DEP4, 1 node) = 2 nodes. Data-parallel attention + EP maximizes decode
+# token throughput at high concurrency; the decode engine shape mirrors the
+# former agg-gb200-dep4-1n recipe (since removed). --block-size 128 is
+# mandatory (MSA sparse/index cache alignment).
+
+model:
+  path: "minimax-m3-mxfp8"  # staged-model alias, not a filesystem path
+  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  precision: "fp8"
+
+dynamo:
+  install: false  # container already ships dynamo 1.3.0
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720  # 720 attempts x 10 s interval = up to 2 h of polling
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1  # 1 prefill node + 1 decode node = 2 nodes total
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 4  # each worker spans one full 4-GPU node
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null  # NOTE(review): NixlConnector is set per role below - confirm null is intended
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"  # 1 h; NOTE(review): health_check polls for up to 2 h
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"  # same 1 h limit as prefill
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 4  # DP4 + expert parallel = the "DEP4" shape
+      data-parallel-rpc-port: 13346  # decode uses 13345
+      enable-expert-parallel: true
+      enforce-eager: true  # prefill only; decode sets a cudagraph capture size instead
+      max-model-len: 2304  # 256 tokens above isl + osl (1024 + 1024)
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128  # mandatory: MSA sparse/index cache alignment
+      attention-backend: FLASH_ATTN
+      language-model-only: true  # text-only benchmark; skips the vision encoder
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 4  # DP4 + expert parallel, same shape as prefill
+      data-parallel-rpc-port: 13345  # prefill uses 13346
+      enable-expert-parallel: true
+      max-model-len: 2304  # 256 tokens above isl + osl (1024 + 1024)
+      max-num-batched-tokens: 2048
+      max-cudagraph-capture-size: 512
+      block-size: 128  # mandatory: MSA sparse/index cache alignment
+      attention-backend: FLASH_ATTN
+      language-model-only: true  # text-only benchmark; skips the vision encoder
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 128  # decode-only; prefill uses 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "512x1024x2048"  # same points as the DEP4 conc-list in nvidia-master.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
new file mode 100644
index 000000000..927066e42
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
@@ -0,0 +1,89 @@
+name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-1k1k"
+
+# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency curve).
+# Prefill (TP4, 1 node) → NixlConnector → Decode (TP4, 1 node) = 2 nodes.
+# Pure TP, no expert parallel: lowest TTFT/ITL for small concurrencies.
+# --block-size 128 is mandatory (MSA sparse/index cache alignment).
+
+model:
+  path: "minimax-m3-mxfp8"  # staged-model alias, not a filesystem path
+  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  precision: "fp8"
+
+dynamo:
+  install: false  # container already ships dynamo 1.3.0
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720  # 720 attempts x 10 s interval = up to 2 h of polling
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1  # 1 prefill node + 1 decode node = 2 nodes total
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 4  # each worker spans one full 4-GPU node
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null  # NOTE(review): NixlConnector is set per role below - confirm null is intended
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"  # 1 h; NOTE(review): health_check polls for up to 2 h
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"  # same 1 h limit as prefill
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4  # pure TP4: no enable-expert-parallel in this recipe
+      pipeline-parallel-size: 1
+      enforce-eager: true  # prefill only; decode sets a cudagraph capture size instead
+      max-model-len: 2304  # 256 tokens above isl + osl (1024 + 1024)
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128  # mandatory: MSA sparse/index cache alignment
+      attention-backend: FLASH_ATTN
+      language-model-only: true  # text-only benchmark; skips the vision encoder
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4  # pure TP4, same shape as prefill
+      pipeline-parallel-size: 1
+      max-model-len: 2304  # 256 tokens above isl + osl (1024 + 1024)
+      max-num-seqs: 256  # above the highest benchmarked concurrency (64)
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 512  # NOTE(review): larger than max-num-seqs (256) - confirm intended
+      block-size: 128  # mandatory: MSA sparse/index cache alignment
+      attention-backend: FLASH_ATTN
+      language-model-only: true  # text-only benchmark; skips the vision encoder
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4x8x16x32x64"  # same points as the TP4 conc-list in nvidia-master.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml
new file mode 100644
index 000000000..fbb99a3dd
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml
@@ -0,0 +1,92 @@
+name: "minimax-m3-vllm-disagg-gb200-1p2d-tp4ep4-1k1k"
+
+# MiniMax-M3 disaggregated 1P+2D recipe for GB200 (decode-scaled).
+# Prefill (TP4+EP4, 1 node) → NixlConnector → 2× Decode (TP4+EP4, 1 node
+# each) = 3 nodes. Two decode workers absorb more in-flight sequences for
+# mid/high concurrencies while a single prefill keeps TTFT low.
+# --block-size 128 is mandatory (MSA sparse/index cache alignment).
+
+model:
+  path: "minimax-m3-mxfp8"  # staged-model alias, not a filesystem path
+  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  precision: "fp8"
+
+dynamo:
+  install: false  # container already ships dynamo 1.3.0
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720  # 720 attempts x 10 s interval = up to 2 h of polling
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1  # 1 prefill node + 2 decode nodes = 3 nodes total
+  decode_nodes: 2
+  prefill_workers: 1
+  decode_workers: 2  # one decode worker per decode node
+  gpus_per_prefill: 4  # each worker spans one full 4-GPU node
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null  # NOTE(review): NixlConnector is set per role below - confirm null is intended
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"  # 1 h; NOTE(review): health_check polls for up to 2 h
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"  # same 1 h limit as prefill
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true  # TP4 + EP = the "TP4+EP4" shape
+      enforce-eager: true  # prefill only; decode sets a cudagraph capture size instead
+      max-model-len: 2304  # 256 tokens above isl + osl (1024 + 1024)
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128  # mandatory: MSA sparse/index cache alignment
+      attention-backend: FLASH_ATTN
+      language-model-only: true  # text-only benchmark; skips the vision encoder
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true  # TP4 + EP, same shape as prefill
+      max-model-len: 2304  # 256 tokens above isl + osl (1024 + 1024)
+      max-num-seqs: 256  # NOTE(review): 2 workers x 256 = 512, below the 1024 conc point - confirm
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 512  # NOTE(review): larger than max-num-seqs (256) - confirm intended
+      block-size: 128  # mandatory: MSA sparse/index cache alignment
+      attention-backend: FLASH_ATTN
+      language-model-only: true  # text-only benchmark; skips the vision encoder
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "256x512x1024"  # same points as the 1P+2D conc-list in nvidia-master.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml
new file mode 100644
index 000000000..fb27934cb
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml
@@ -0,0 +1,92 @@
+name: "minimax-m3-vllm-disagg-gb200-2p1d-tp4ep4-1k1k"
+
+# MiniMax-M3 disaggregated 2P+1D recipe for GB200 (prefill-scaled).
+# 2× Prefill (TP4+EP4, 1 node each) → NixlConnector → Decode (TP4+EP4,
+# 1 node) = 3 nodes. Two prefill workers sustain the prompt-ingest rate at
+# mid/high concurrencies without starving a single decode worker.
+# --block-size 128 is mandatory (MSA sparse/index cache alignment).
+
+model:
+  path: "minimax-m3-mxfp8"  # staged-model alias, not a filesystem path
+  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  precision: "fp8"
+
+dynamo:
+  install: false  # container already ships dynamo 1.3.0
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720  # 720 attempts x 10 s interval = up to 2 h of polling
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 2  # 2 prefill nodes + 1 decode node = 3 nodes total
+  decode_nodes: 1
+  prefill_workers: 2  # one prefill worker per prefill node
+  decode_workers: 1
+  gpus_per_prefill: 4  # each worker spans one full 4-GPU node
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null  # NOTE(review): NixlConnector is set per role below - confirm null is intended
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"  # 1 h; NOTE(review): health_check polls for up to 2 h
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"  # same 1 h limit as prefill
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true  # TP4 + EP = the "TP4+EP4" shape
+      enforce-eager: true  # prefill only; decode sets a cudagraph capture size instead
+      max-model-len: 2304  # 256 tokens above isl + osl (1024 + 1024)
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128  # mandatory: MSA sparse/index cache alignment
+      attention-backend: FLASH_ATTN
+      language-model-only: true  # text-only benchmark; skips the vision encoder
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true  # TP4 + EP, same shape as prefill
+      max-model-len: 2304  # 256 tokens above isl + osl (1024 + 1024)
+      max-num-seqs: 256  # NOTE(review): single decode worker vs conc up to 1024 - confirm cap
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 512  # NOTE(review): larger than max-num-seqs (256) - confirm intended
+      block-size: 128  # mandatory: MSA sparse/index cache alignment
+      attention-backend: FLASH_ATTN
+      language-model-only: true  # text-only benchmark; skips the vision encoder
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "256x512x1024"  # same points as the 2P+1D conc-list in nvidia-master.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml
deleted file mode 100644
index adb36f646..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml
+++ /dev/null
@@ -1,74 +0,0 @@
-name: "minimax-m3-vllm-agg-gb200-dep8-2n-8k1k"
-
-# MiniMax-M3 day-zero aggregated recipe
-# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on
-# day zero: NixlConnector KV transfer has never been exercised against
-# M3's MSA index cache, so disagg shapes are deferred until agg
-# baselines exist. model.path uses a staged-model alias — srtctl resolves it via
-# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
-# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
-# benchmark is text-only, so language-model-only skips the vision
-# encoder. max-model-len / cudagraph capture / batched tokens are
-# trimmed to the fixed-seq-len scenario instead of the 1M default.
-
-model:
-  path: "minimax-m3-mxfp8"
-  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
-  precision: "fp8"
-
-dynamo:
-  install: false
-
-slurm:
-  time_limit: "8:00:00"
-
-# 444 GB of weights off shared FS (cold HF cache on the first run):
-# allow up to 2 h for engine readiness.
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  agg_nodes: 2
-  agg_workers: 1
-  gpus_per_agg: 8
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  aggregated_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-
-  vllm_config:
-    aggregated:
-      tensor-parallel-size: 1
-      pipeline-parallel-size: 1
-      data-parallel-size: 8
-      data-parallel-rpc-port: 13345
-      enable-expert-parallel: true
-      max-model-len: 9472
-      max-num-batched-tokens: 16384
-      max-cudagraph-capture-size: 128
-      block-size: 128
-      attention-backend: FLASH_ATTN
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 32
-
-benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "256x512"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml
deleted file mode 100644
index 8cfbcb616..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml
+++ /dev/null
@@ -1,71 +0,0 @@
-name: "minimax-m3-vllm-agg-gb200-tp4-1n-8k1k"
-
-# MiniMax-M3 day-zero aggregated recipe
-# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on
-# day zero: NixlConnector KV transfer has never been exercised against
-# M3's MSA index cache, so disagg shapes are deferred until agg
-# baselines exist. model.path uses a staged-model alias — srtctl resolves it via
-# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
-# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
-# benchmark is text-only, so language-model-only skips the vision
-# encoder. max-model-len / cudagraph capture / batched tokens are
-# trimmed to the fixed-seq-len scenario instead of the 1M default.
-
-model:
-  path: "minimax-m3-mxfp8"
-  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
-  precision: "fp8"
-
-dynamo:
-  install: false
-
-slurm:
-  time_limit: "8:00:00"
-
-# 444 GB of weights off shared FS (cold HF cache on the first run):
-# allow up to 2 h for engine readiness.
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  agg_nodes: 1
-  agg_workers: 1
-  gpus_per_agg: 4
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  aggregated_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-
-  vllm_config:
-    aggregated:
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
-      max-model-len: 9472
-      max-num-batched-tokens: 16384
-      max-cudagraph-capture-size: 64
-      block-size: 128
-      attention-backend: FLASH_ATTN
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 32
-
-benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "4x8x16x32x64"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml
deleted file mode 100644
index 1567ca57c..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml
+++ /dev/null
@@ -1,72 +0,0 @@
-name: "minimax-m3-vllm-agg-gb200-tp4ep4-1n-8k1k"
-
-# MiniMax-M3 day-zero aggregated recipe
-# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on
-# day zero: NixlConnector KV transfer has never been exercised against
-# M3's MSA index cache, so disagg shapes are deferred until agg
-# baselines exist. model.path uses a staged-model alias — srtctl resolves it via
-# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
-# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
-# benchmark is text-only, so language-model-only skips the vision
-# encoder. max-model-len / cudagraph capture / batched tokens are
-# trimmed to the fixed-seq-len scenario instead of the 1M default.
-
-model:
-  path: "minimax-m3-mxfp8"
-  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
-  precision: "fp8"
-
-dynamo:
-  install: false
-
-slurm:
-  time_limit: "8:00:00"
-
-# 444 GB of weights off shared FS (cold HF cache on the first run):
-# allow up to 2 h for engine readiness.
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  agg_nodes: 1
-  agg_workers: 1
-  gpus_per_agg: 4
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  aggregated_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-
-  vllm_config:
-    aggregated:
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
-      enable-expert-parallel: true
-      max-model-len: 9472
-      max-num-batched-tokens: 16384
-      max-cudagraph-capture-size: 256
-      block-size: 128
-      attention-backend: FLASH_ATTN
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 32
-
-benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "128x256"
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 8ab05189e..5327dbd02 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3652,12 +3652,12 @@
 - config-keys:
     - minimaxm3-fp8-gb200-dynamo-vllm
   description:
-    - "Initial submission: MiniMax-M3 MXFP8 day-zero vLLM sweep for GB200 via Dynamo"
+    - "Initial submission: MiniMax-M3 MXFP8 fully-disaggregated vLLM sweep for GB200 via Dynamo"
     - "Model: MiniMaxAI/MiniMax-M3-MXFP8 (427B total / 26B active MoE, MSA sparse attention, ~444 GB MXFP8 checkpoint)"
-    - "Image: vllm/vllm-openai:minimax-m3 (multi-arch arm64+amd64 from m3_release branch, vllm-project/vllm#45381)"
-    - "Dynamo orchestration with NixlConnector for disaggregated prefill/decode"
-    - "6 topologies: TP4 (1n), TP8 (2n), TP4+EP4 (1n), 1P+1D disagg TP4+EP4 (2n), DEP4 (1n), DEP8 (2n)"
-    - "Concurrency sweep: TP 4-64, TEP 128-512, disagg 64-512, DEP4 256-1024, DEP8 512-2048"
+    - "Image: nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1 (Dynamo 1.3.0 pre-installed; dynamo.install=false)"
+    - "Disaggregated prefill/decode over NixlConnector; every worker = one 4-GPU node (444 GB checkpoint can't fit in fewer)"
+    - "5 disagg topologies: 1P1D TP4 (2n), 1P1D TP4+EP4 (2n), 1P2D TP4+EP4 (3n), 2P1D TP4+EP4 (3n), 1P1D DEP4 (2n)"
+    - "Concurrency sweep: TP4 4-64, TP4EP4 64-512, 1P2D/2P1D 256-1024, DEP4 512-2048"
     - "--block-size 128 mandatory everywhere (MSA sparse/index cache alignment); --language-model-only for text-only benchmarks"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734
 

From 803cd20f243bb841b2013364af932e6aa9690850 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Sat, 13 Jun 2026 14:19:36 -0700
Subject: [PATCH 6/8] fix: restore NIXL-bearing image for M3 GB200 disagg +
 enable MNNVL KV transfer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Run 27478698552 failed: every disagg worker crashed at NixlConnector init
with "NIXL is not available" (RuntimeError, vllm .../nixl/worker.py:248).
The ai-dynamo vllm-runtime:1.3.0-minimax-m3-dev.1 image ships dynamo but
NOT the nixl bindings (cupy missing too), so kv_connector=NixlConnector
cannot initialize and the engine core never becomes healthy.

Revert to the pre-ed63c1e0 runtime path that pulls NIXL in via the dynamo
wheel (same as the working minimaxm2.5-gb200 disagg recipes):
- image/container: vllm/vllm-openai:minimax-m3 (the m3_release build all
  other m3 entries already use)
- dynamo.install=true + wheel 1.2.0.dev20260526 (nixl is a dynamo dep)
- keep attention-backend FLASH_ATTN (added in the image-switch commit)

Also enable NVLink (MNNVL) KV transfer so NIXL doesn't fall back to TCP,
mirroring the deepseek-v4 gb200 disagg recipes. The following are set on
every prefill/decode env block:
  UCX_TLS=cuda_copy,cuda_ipc,tcp
  UCX_CUDA_IPC_ENABLE_MNNVL=y
  UCX_MEMTYPE_CACHE=n / UCX_MEMTYPE_REG_WHOLE=n
  NCCL_CUMEM_ENABLE=1   (cuMem-allocate buffers so they are IPC-exportable)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml            | 18 +++++++++-------
 .../1k1k/disagg-gb200-1p1d-dep4-2n.yaml       | 21 +++++++++++++++++--
 .../1k1k/disagg-gb200-1p1d-tp4-2n.yaml        | 21 +++++++++++++++++--
 .../1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml     | 21 +++++++++++++++++--
 .../1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml     | 21 +++++++++++++++++--
 .../1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml     | 21 +++++++++++++++++--
 .../8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml     | 21 +++++++++++++++++--
 perf-changelog.yaml                           |  4 ++--
 8 files changed, 126 insertions(+), 22 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index b0b99d53f..f246f518a 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -11776,17 +11776,19 @@ minimaxm2.5-fp8-gb300-dynamo-vllm:
 # MiniMax-M3 (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
 # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
 # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX
-# tensor cores on Blackwell. Image is the NVIDIA Dynamo vLLM runtime
-# (nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1) with
-# dynamo 1.3.0 pre-installed, so recipes set dynamo.install=false.
+# tensor cores on Blackwell. Image is the multi-arch m3_release vLLM build
+# (vllm/vllm-openai:minimax-m3, vllm-project/vllm#45381); recipes set
+# dynamo.install=true + wheel 1.2.0.dev20260526 so the dynamo runtime AND
+# NIXL are layered in at job start (the ai-dynamo vllm-runtime dev image
+# shipped without NIXL, so disagg workers crashed at NixlConnector init).
 # Engine args mirror the canonical recipe (ai-dynamo/dynamo
 # recipes/minimax-m3/vllm/disagg/MXFP8): block-size 128 + FLASH_ATTN.
-# Fully disaggregated GB200 sweep (NixlConnector prefill/decode split,
-# every worker = one 4-GPU node since the 444 GB checkpoint can't fit in
-# fewer): 1P1D TP4 (low conc), 1P1D TP4+EP4 (mid), 1P2D / 2P1D TP4+EP4
-# (decode- and prefill-scaled), 1P1D DEP4 (max throughput).
+# Fully disaggregated GB200 sweep (NixlConnector prefill/decode split over
+# the NVL72 NVLink fabric; every worker = one 4-GPU node since the 444 GB
+# checkpoint can't fit in fewer): 1P1D TP4 (low conc), 1P1D TP4+EP4 (mid),
+# 1P2D / 2P1D TP4+EP4 (decode- and prefill-scaled), 1P1D DEP4 (max tput).
 minimaxm3-fp8-gb200-dynamo-vllm:
-  image: nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1
+  image: vllm/vllm-openai:minimax-m3
   model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: gb200
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml
index 0749dbc86..4b56e9e6f 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml
@@ -9,11 +9,12 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-dep4-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "vllm/vllm-openai:minimax-m3"
   precision: "fp8"
 
 dynamo:
-  install: false
+  install: true
+  wheel: "1.2.0.dev20260526"
 
 slurm:
   time_limit: "8:00:00"
@@ -44,10 +45,26 @@ backend:
   prefill_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
 
   decode_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
 
   vllm_config:
     prefill:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
index 927066e42..558c5d894 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
@@ -7,11 +7,12 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "vllm/vllm-openai:minimax-m3"
   precision: "fp8"
 
 dynamo:
-  install: false
+  install: true
+  wheel: "1.2.0.dev20260526"
 
 slurm:
   time_limit: "8:00:00"
@@ -42,10 +43,26 @@ backend:
   prefill_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
 
   decode_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
 
   vllm_config:
     prefill:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
index 4f9c01c6b..eeefc68c1 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
@@ -6,11 +6,12 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "vllm/vllm-openai:minimax-m3"
   precision: "fp8"
 
 dynamo:
-  install: false
+  install: true
+  wheel: "1.2.0.dev20260526"
 
 slurm:
   time_limit: "8:00:00"
@@ -41,10 +42,26 @@ backend:
   prefill_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
 
   decode_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
 
   vllm_config:
     prefill:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml
index fbb99a3dd..02d9bd98e 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml
@@ -8,11 +8,12 @@ name: "minimax-m3-vllm-disagg-gb200-1p2d-tp4ep4-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "vllm/vllm-openai:minimax-m3"
   precision: "fp8"
 
 dynamo:
-  install: false
+  install: true
+  wheel: "1.2.0.dev20260526"
 
 slurm:
   time_limit: "8:00:00"
@@ -43,10 +44,26 @@ backend:
   prefill_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
 
   decode_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
 
   vllm_config:
     prefill:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml
index fb27934cb..4a440766a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml
@@ -8,11 +8,12 @@ name: "minimax-m3-vllm-disagg-gb200-2p1d-tp4ep4-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "vllm/vllm-openai:minimax-m3"
   precision: "fp8"
 
 dynamo:
-  install: false
+  install: true
+  wheel: "1.2.0.dev20260526"
 
 slurm:
   time_limit: "8:00:00"
@@ -43,10 +44,26 @@ backend:
   prefill_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
 
   decode_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
 
   vllm_config:
     prefill:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
index 86d48468a..c14b9fb3b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
@@ -6,11 +6,12 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "vllm/vllm-openai:minimax-m3"
   precision: "fp8"
 
 dynamo:
-  install: false
+  install: true
+  wheel: "1.2.0.dev20260526"
 
 slurm:
   time_limit: "8:00:00"
@@ -41,10 +42,26 @@ backend:
   prefill_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
 
   decode_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
 
   vllm_config:
     prefill:
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index be638f5f1..627ed5bb1 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3654,8 +3654,8 @@
   description:
     - "Initial submission: MiniMax-M3 MXFP8 fully-disaggregated vLLM sweep for GB200 via Dynamo"
     - "Model: MiniMaxAI/MiniMax-M3-MXFP8 (427B total / 26B active MoE, MSA sparse attention, ~444 GB MXFP8 checkpoint)"
-    - "Image: nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1 (Dynamo 1.3.0 pre-installed; dynamo.install=false)"
-    - "Disaggregated prefill/decode over NixlConnector; every worker = one 4-GPU node (444 GB checkpoint can't fit in fewer)"
+    - "Image: vllm/vllm-openai:minimax-m3 (multi-arch m3_release build, vllm-project/vllm#45381); dynamo.install=true + wheel 1.2.0.dev20260526 layers in the dynamo runtime + NIXL"
+    - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y + NCCL_CUMEM_ENABLE=1, cuda_ipc UCX_TLS); every worker = one 4-GPU node (444 GB checkpoint can't fit in fewer)"
     - "5 disagg topologies: 1P1D TP4 (2n), 1P1D TP4+EP4 (2n), 1P2D TP4+EP4 (3n), 2P1D TP4+EP4 (3n), 1P1D DEP4 (2n)"
     - "Concurrency sweep: TP4 4-64, TP4EP4 64-512, 1P2D/2P1D 256-1024, DEP4 512-2048"
     - "--block-size 128 mandatory everywhere (MSA sparse/index cache alignment); --language-model-only for text-only benchmarks"

From 1320056380a6f095211fbbb016a9fcc57fdbfbb6 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Sat, 13 Jun 2026 20:04:13 -0700
Subject: [PATCH 7/8] feat: rack-scale wide-EP (DeepSeek megamoe) M3 GB200
 disagg + FLASHINFER

The narrow sweep (DEP8 at most) showed no GB200 advantage over B200: with EP
groups capped at 8 GPUs, it never grew past the 8-GPU NVLink island that
B200 also offers. Exploit NVL72's rack-scale NVLink with wide
expert parallelism spanning multiple nodes, mirroring the deepseek-v4
"megamoe" ladder (DEP = data-parallel attention + expert-parallel):

- 1P1D TP4 (2n)            low-latency, conc 4-64
- 1P1D DEP8 (4n)           mid, EP8/16-experts-per-rank, conc 128-512
- 1P1D DEP8->DEP16 (6n)    wide decode (EP16), conc 512-2048
- 2P1D DEP8->DEP16 (8n)    prefill-scaled, conc 2048-4096
- 4P1D DEP8->DEP16 (12n)   max throughput, conc 4096-8192

M3 has 128 routed experts (top-4), so EP8/EP16 shard cleanly. EP16 across
16 GPU / 4 nodes is the regime B200 physically can't reach.

Attention: FLASH_ATTN -> FLASHINFER (trtllm-gen) on all GB200 recipes to
exploit Blackwell. Requires the :minimax-m3 image rebuilt from m3_release
HEAD 022448dd (vllm-project/vllm#45381), which gates trtllm-gen page>=128.

Also add GB200 perf/NVLink-KV knobs from the deepseek-v4 reference:
numa-bind (Grace) and enable-sleep-mode (cuMem allocator so the KV cache is
IPC-exportable over the MNNVL fabric), alongside the existing UCX MNNVL env.

Replaces the four narrow EP4 recipes; keeps 1P1D TP4 for low latency.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml            |  95 +++++++++-------
 ...3n.yaml => disagg-gb200-1p1d-dep8-4n.yaml} |  45 +++++---
 ...l => disagg-gb200-1p1d-dep8-dep16-6n.yaml} |  36 +++---
 .../1k1k/disagg-gb200-1p1d-tp4-2n.yaml        |  17 ++-
 ...l => disagg-gb200-2p1d-dep8-dep16-8n.yaml} |  42 ++++---
 ... => disagg-gb200-4p1d-dep8-dep16-12n.yaml} |  44 ++++---
 .../8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml     | 107 ------------------
 perf-changelog.yaml                           |  15 +--
 8 files changed, 170 insertions(+), 231 deletions(-)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-1p2d-tp4ep4-3n.yaml => disagg-gb200-1p1d-dep8-4n.yaml} (69%)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-1p1d-dep4-2n.yaml => disagg-gb200-1p1d-dep8-dep16-6n.yaml} (77%)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-2p1d-tp4ep4-3n.yaml => disagg-gb200-2p1d-dep8-dep16-8n.yaml} (74%)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-1p1d-tp4ep4-2n.yaml => disagg-gb200-4p1d-dep8-dep16-12n.yaml} (71%)
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index f246f518a..70ec293af 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -11781,12 +11781,16 @@ minimaxm2.5-fp8-gb300-dynamo-vllm:
 # dynamo.install=true + wheel 1.2.0.dev20260526 so the dynamo runtime AND
 # NIXL are layered in at job start (the ai-dynamo vllm-runtime dev image
 # shipped without NIXL, so disagg workers crashed at NixlConnector init).
-# Engine args mirror the canonical recipe (ai-dynamo/dynamo
-# recipes/minimax-m3/vllm/disagg/MXFP8): block-size 128 + FLASH_ATTN.
-# Fully disaggregated GB200 sweep (NixlConnector prefill/decode split over
-# the NVL72 NVLink fabric; every worker = one 4-GPU node since the 444 GB
-# checkpoint can't fit in fewer): 1P1D TP4 (low conc), 1P1D TP4+EP4 (mid),
-# 1P2D / 2P1D TP4+EP4 (decode- and prefill-scaled), 1P1D DEP4 (max tput).
+# block-size 128 mandatory (MSA index-cache alignment); FLASHINFER
+# (trtllm-gen) attention to exploit Blackwell — needs vllm#45381 @ 022448dd
+# (m3_release HEAD: gates page>=128 on trtllm-gen GQA), so rebuild the image
+# from m3_release before running. Fully disaggregated, rack-scale wide-EP
+# GB200 sweep (NixlConnector P/D split over the NVL72 NVLink fabric). Mirrors
+# the deepseek-v4 "megamoe" ladder: DEP unit = DP-attn + expert-parallel
+# (DEP8 = 8 GPU / 2 nodes, DEP16 = 16 GPU / 4 nodes), with prefill workers
+# scaled 1P->4P. EP8/EP16 vs B200's 8-GPU NVLink island is the GB200 edge.
+# 1P1D TP4 (low conc), 1P1D DEP8 (mid), 1P1D DEP8->DEP16 (wide decode),
+# 2P1D / 4P1D DEP8->DEP16 (prefill-scaled max throughput). M3 = 128 experts.
 minimaxm3-fp8-gb200-dynamo-vllm:
   image: vllm/vllm-openai:minimax-m3
   model: MiniMaxAI/MiniMax-M3-MXFP8
@@ -11801,7 +11805,8 @@ minimaxm3-fp8-gb200-dynamo-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-      # Low latency: 1P+1D disagg TP4 (pure TP, no EP), 2 nodes (4 GPU each).
+      # Low latency: 1P+1D TP4 (pure TP, no EP), 2 nodes (4 GPU each). Wide EP
+      # would idle DP ranks at small concurrencies, so stay narrow here.
       - conc-list: [4, 8, 16, 32, 64]
         prefill:
           num-worker: 1
@@ -11816,64 +11821,68 @@ minimaxm3-fp8-gb200-dynamo-vllm:
           ep: 1
           dp-attn: false
 
-      # Mid curve: 1P+1D disagg TP4+EP4, 2 nodes (4 GPU each).
-      - conc-list: [64, 128, 256, 512]
+      # Mid curve: 1P+1D DEP8 (DP-attn + EP8), 2 nodes prefill + 2 nodes
+      # decode = 4 nodes. First rung of rack-scale EP (16 experts/rank).
+      - conc-list: [128, 256, 512]
         prefill:
           num-worker: 1
-          tp: 4
-          ep: 4
-          dp-attn: false
+          tp: 8
+          ep: 8
+          dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml"
         decode:
           num-worker: 1
-          tp: 4
-          ep: 4
-          dp-attn: false
+          tp: 8
+          ep: 8
+          dp-attn: true
 
-      # Decode-scaled: 1P+2D disagg TP4+EP4, 3 nodes (4 GPU each).
-      - conc-list: [256, 512, 1024]
+      # Wide decode: 1P+1D DEP8 prefill -> DEP16 decode (EP16 across 16 GPU /
+      # 4 nodes) = 6 nodes. Rack-scale decode throughput over NVL72 NVLink.
+      - conc-list: [512, 1024, 2048]
         prefill:
           num-worker: 1
-          tp: 4
-          ep: 4
-          dp-attn: false
+          tp: 8
+          ep: 8
+          dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml"
         decode:
-          num-worker: 2
-          tp: 4
-          ep: 4
-          dp-attn: false
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
 
-      # Prefill-scaled: 2P+1D disagg TP4+EP4, 3 nodes (4 GPU each).
-      - conc-list: [256, 512, 1024]
+      # Prefill-scaled: 2P+1D, 2x DEP8 prefill (4 nodes) -> DEP16 decode
+      # (4 nodes) = 8 nodes.
+      - conc-list: [2048, 4096]
         prefill:
           num-worker: 2
-          tp: 4
-          ep: 4
-          dp-attn: false
+          tp: 8
+          ep: 8
+          dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml"
         decode:
           num-worker: 1
-          tp: 4
-          ep: 4
-          dp-attn: false
+          tp: 16
+          ep: 16
+          dp-attn: true
 
-      # Max throughput: 1P+1D disagg DEP4 (DP4+EP), 2 nodes (4 GPU each).
-      - conc-list: [512, 1024, 2048]
+      # Max throughput: 4P+1D, 4x DEP8 prefill (8 nodes) -> DEP16 decode
+      # (4 nodes) = 12 nodes within one NVL72 rack.
+      - conc-list: [4096, 8192]
         prefill:
-          num-worker: 1
-          tp: 1
-          ep: 4
+          num-worker: 4
+          tp: 8
+          ep: 8
           dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml"
         decode:
           num-worker: 1
-          tp: 1
-          ep: 4
+          tp: 16
+          ep: 16
           dp-attn: true
 
 # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml
similarity index 69%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml
index 02d9bd98e..efc5d5740 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml
@@ -1,10 +1,11 @@
-name: "minimax-m3-vllm-disagg-gb200-1p2d-tp4ep4-1k1k"
+name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-1k1k"
 
-# MiniMax-M3 disaggregated 1P+2D recipe for GB200 (decode-scaled).
-# Prefill (TP4+EP4, 1 node) → NixlConnector → 2× Decode (TP4+EP4, 1 node
-# each) = 3 nodes. Two decode workers absorb more in-flight sequences for
-# mid/high concurrencies while a single prefill keeps TTFT low.
-# --block-size 128 is mandatory (MSA sparse/index cache alignment).
+# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (mid curve, wide EP).
+# Prefill DEP8 (DP-attn + EP across 8 GPU / 2 nodes) -> NixlConnector ->
+# Decode DEP8 (8 GPU / 2 nodes) = 4 nodes. Rack-scale expert parallel
+# over the NVL72 NVLink fabric -- the regime where GB200 pulls ahead of
+# B200 (capped at an 8-GPU NVLink island). M3 has 128 routed experts so
+# EP8 shards 16 experts/rank. FLASHINFER attention, block-size 128.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -26,12 +27,12 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 1
+  prefill_nodes: 2
   decode_nodes: 2
   prefill_workers: 1
-  decode_workers: 2
-  gpus_per_prefill: 4
-  gpus_per_decode: 4
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 8
 
 frontend:
   type: dynamo
@@ -68,42 +69,50 @@ backend:
   vllm_config:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
+      tensor-parallel-size: 1
       pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13346
       enable-expert-parallel: true
       enforce-eager: true
       max-model-len: 2304
       max-num-seqs: 16
       max-num-batched-tokens: 16384
       block-size: 128
-      attention-backend: FLASH_ATTN
+      attention-backend: FLASHINFER
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
       stream-interval: 32
 
     decode:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
+      tensor-parallel-size: 1
       pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
       enable-expert-parallel: true
       max-model-len: 2304
-      max-num-seqs: 256
-      max-num-batched-tokens: 256
+      max-num-seqs: 512
+      max-num-batched-tokens: 512
       max-cudagraph-capture-size: 512
       block-size: 128
-      attention-backend: FLASH_ATTN
+      attention-backend: FLASHINFER
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
-      stream-interval: 32
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 128
 
 benchmark:
   type: "sa-bench"
   isl: 1024
   osl: 1024
-  concurrencies: "256x512x1024"
+  concurrencies: "128x256x512"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
similarity index 77%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
index 4b56e9e6f..5ca08a06d 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
@@ -1,11 +1,10 @@
-name: "minimax-m3-vllm-disagg-gb200-1p1d-dep4-1k1k"
+name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-dep16-1k1k"
 
-# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (max-throughput curve).
-# Prefill (DEP4 = DP4 + expert-parallel, 1 node) → NixlConnector → Decode
-# (DEP4, 1 node) = 2 nodes. Data-parallel attention + EP maximizes decode
-# token throughput at high concurrency; engine shape mirrors the proven
-# agg-gb200-dep4-1n recipe. --block-size 128 is mandatory (MSA sparse/index
-# cache alignment).
+# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (wide-decode curve).
+# Prefill DEP8 (8 GPU / 2 nodes) -> NixlConnector -> Decode DEP16 (DP-attn
+# + EP across 16 GPU / 4 nodes) = 6 nodes. EP16 (8 experts/rank of 128)
+# spans the NVL72 fabric to maximize decode token throughput. FLASHINFER
+# attention, block-size 128.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -27,12 +26,12 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 1
-  decode_nodes: 1
+  prefill_nodes: 2
+  decode_nodes: 4
   prefill_workers: 1
   decode_workers: 1
-  gpus_per_prefill: 4
-  gpus_per_decode: 4
+  gpus_per_prefill: 8
+  gpus_per_decode: 16
 
 frontend:
   type: dynamo
@@ -71,7 +70,7 @@ backend:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 1
       pipeline-parallel-size: 1
-      data-parallel-size: 4
+      data-parallel-size: 8
       data-parallel-rpc-port: 13346
       enable-expert-parallel: true
       enforce-eager: true
@@ -79,31 +78,36 @@ backend:
       max-num-seqs: 16
       max-num-batched-tokens: 16384
       block-size: 128
-      attention-backend: FLASH_ATTN
+      attention-backend: FLASHINFER
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
       stream-interval: 32
 
     decode:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 1
       pipeline-parallel-size: 1
-      data-parallel-size: 4
+      data-parallel-size: 16
       data-parallel-rpc-port: 13345
       enable-expert-parallel: true
       max-model-len: 2304
-      max-num-batched-tokens: 2048
+      max-num-seqs: 512
+      max-num-batched-tokens: 512
       max-cudagraph-capture-size: 512
       block-size: 128
-      attention-backend: FLASH_ATTN
+      attention-backend: FLASHINFER
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
       stream-interval: 128
 
 benchmark:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
index 558c5d894..b60b17515 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
@@ -1,9 +1,10 @@
 name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-1k1k"
 
 # MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency curve).
-# Prefill (TP4, 1 node) → NixlConnector → Decode (TP4, 1 node). Pure TP,
-# no expert parallel: lowest TTFT/ITL for small concurrencies.
-# --block-size 128 is mandatory (MSA sparse/index cache alignment).
+# Prefill (TP4, 1 node) -> NixlConnector -> Decode (TP4, 1 node). Pure
+# TP, no expert parallel: lowest TTFT/ITL at small concurrencies where
+# wide EP would leave DP ranks idle. FLASHINFER (trtllm-gen) attention
+# (needs vllm#45381 @ 022448dd); block-size 128 for MSA index-cache alignment.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -74,12 +75,14 @@ backend:
       max-num-seqs: 16
       max-num-batched-tokens: 16384
       block-size: 128
-      attention-backend: FLASH_ATTN
+      attention-backend: FLASHINFER
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
       stream-interval: 32
 
     decode:
@@ -91,13 +94,15 @@ backend:
       max-num-batched-tokens: 256
       max-cudagraph-capture-size: 512
       block-size: 128
-      attention-backend: FLASH_ATTN
+      attention-backend: FLASHINFER
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
-      stream-interval: 32
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 128
 
 benchmark:
   type: "sa-bench"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
similarity index 74%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
index 4a440766a..853095727 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
@@ -1,10 +1,10 @@
-name: "minimax-m3-vllm-disagg-gb200-2p1d-tp4ep4-1k1k"
+name: "minimax-m3-vllm-disagg-gb200-2p1d-dep8-dep16-1k1k"
 
 # MiniMax-M3 disaggregated 2P+1D recipe for GB200 (prefill-scaled).
-# 2× Prefill (TP4+EP4, 1 node each) → NixlConnector → Decode (TP4+EP4,
-# 1 node) = 3 nodes. Two prefill workers sustain the prompt-ingest rate at
-# mid/high concurrencies without starving a single decode worker.
-# --block-size 128 is mandatory (MSA sparse/index cache alignment).
+# 2x Prefill DEP8 (8 GPU / 2 nodes each) -> NixlConnector -> Decode DEP16
+# (16 GPU / 4 nodes) = 8 nodes. Two wide prefill workers sustain prompt
+# ingest into a single wide decode at high concurrency. FLASHINFER
+# attention, block-size 128.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -26,12 +26,12 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 2
-  decode_nodes: 1
+  prefill_nodes: 4
+  decode_nodes: 4
   prefill_workers: 2
   decode_workers: 1
-  gpus_per_prefill: 4
-  gpus_per_decode: 4
+  gpus_per_prefill: 8
+  gpus_per_decode: 16
 
 frontend:
   type: dynamo
@@ -68,42 +68,50 @@ backend:
   vllm_config:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
+      tensor-parallel-size: 1
       pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13346
       enable-expert-parallel: true
       enforce-eager: true
       max-model-len: 2304
       max-num-seqs: 16
       max-num-batched-tokens: 16384
       block-size: 128
-      attention-backend: FLASH_ATTN
+      attention-backend: FLASHINFER
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
       stream-interval: 32
 
     decode:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
+      tensor-parallel-size: 1
       pipeline-parallel-size: 1
+      data-parallel-size: 16
+      data-parallel-rpc-port: 13345
       enable-expert-parallel: true
       max-model-len: 2304
-      max-num-seqs: 256
-      max-num-batched-tokens: 256
+      max-num-seqs: 512
+      max-num-batched-tokens: 512
       max-cudagraph-capture-size: 512
       block-size: 128
-      attention-backend: FLASH_ATTN
+      attention-backend: FLASHINFER
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
-      stream-interval: 32
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 128
 
 benchmark:
   type: "sa-bench"
   isl: 1024
   osl: 1024
-  concurrencies: "256x512x1024"
+  concurrencies: "2048x4096"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml
similarity index 71%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml
index eeefc68c1..4a6aa5d0f 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml
@@ -1,8 +1,10 @@
-name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-1k1k"
+name: "minimax-m3-vllm-disagg-gb200-4p1d-dep8-dep16-1k1k"
 
-# MiniMax-M3 disaggregated 1P+1D recipe for GB200.
-# Prefill (TP4+EP4, 1 node) → NixlConnector → Decode (TP4+EP4, 1 node).
-# --block-size 128 is mandatory (MSA sparse/index cache alignment).
+# MiniMax-M3 disaggregated 4P+1D recipe for GB200 (max throughput).
+# 4x Prefill DEP8 (8 GPU / 2 nodes each = 8 nodes) -> NixlConnector ->
+# Decode DEP16 (16 GPU / 4 nodes) = 12 nodes within one NVL72 rack. Max
+# prefill fan-in for the highest-concurrency points. FLASHINFER attention,
+# block-size 128.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -24,12 +26,12 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 1
-  decode_nodes: 1
-  prefill_workers: 1
+  prefill_nodes: 8
+  decode_nodes: 4
+  prefill_workers: 4
   decode_workers: 1
-  gpus_per_prefill: 4
-  gpus_per_decode: 4
+  gpus_per_prefill: 8
+  gpus_per_decode: 16
 
 frontend:
   type: dynamo
@@ -66,42 +68,50 @@ backend:
   vllm_config:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
+      tensor-parallel-size: 1
       pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13346
       enable-expert-parallel: true
       enforce-eager: true
       max-model-len: 2304
       max-num-seqs: 16
       max-num-batched-tokens: 16384
       block-size: 128
-      attention-backend: FLASH_ATTN
+      attention-backend: FLASHINFER
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
       stream-interval: 32
 
     decode:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
+      tensor-parallel-size: 1
       pipeline-parallel-size: 1
+      data-parallel-size: 16
+      data-parallel-rpc-port: 13345
       enable-expert-parallel: true
       max-model-len: 2304
-      max-num-seqs: 256
-      max-num-batched-tokens: 256
+      max-num-seqs: 512
+      max-num-batched-tokens: 512
       max-cudagraph-capture-size: 512
       block-size: 128
-      attention-backend: FLASH_ATTN
+      attention-backend: FLASHINFER
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
-      stream-interval: 32
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 128
 
 benchmark:
   type: "sa-bench"
   isl: 1024
   osl: 1024
-  concurrencies: "64x128x256x512"
+  concurrencies: "4096x8192"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
deleted file mode 100644
index c14b9fb3b..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
+++ /dev/null
@@ -1,107 +0,0 @@
-name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-8k1k"
-
-# MiniMax-M3 disaggregated 1P+1D recipe for GB200, 8k1k.
-# Prefill (TP4+EP4, 1 node) → NixlConnector → Decode (TP4+EP4, 1 node).
-# --block-size 128 is mandatory (MSA sparse/index cache alignment).
-
-model:
-  path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
-  precision: "fp8"
-
-dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
-
-slurm:
-  time_limit: "8:00:00"
-
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  prefill_nodes: 1
-  decode_nodes: 1
-  prefill_workers: 1
-  decode_workers: 1
-  gpus_per_prefill: 4
-  gpus_per_decode: 4
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  prefill_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
-    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
-    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
-    UCX_MEMTYPE_CACHE: "n"
-    UCX_MEMTYPE_REG_WHOLE: "n"
-    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-    NCCL_CUMEM_ENABLE: "1"
-
-  decode_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
-    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
-    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
-    UCX_MEMTYPE_CACHE: "n"
-    UCX_MEMTYPE_REG_WHOLE: "n"
-    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-    NCCL_CUMEM_ENABLE: "1"
-
-  vllm_config:
-    prefill:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
-      enable-expert-parallel: true
-      enforce-eager: true
-      max-model-len: 9472
-      max-num-seqs: 16
-      max-num-batched-tokens: 16384
-      block-size: 128
-      attention-backend: FLASH_ATTN
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 32
-
-    decode:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
-      enable-expert-parallel: true
-      max-model-len: 9472
-      max-num-seqs: 256
-      max-num-batched-tokens: 256
-      max-cudagraph-capture-size: 512
-      block-size: 128
-      attention-backend: FLASH_ATTN
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 32
-
-benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "64x128x256"
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 627ed5bb1..295a8e694 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3652,13 +3652,14 @@
 - config-keys:
     - minimaxm3-fp8-gb200-dynamo-vllm
   description:
-    - "Initial submission: MiniMax-M3 MXFP8 fully-disaggregated vLLM sweep for GB200 via Dynamo"
-    - "Model: MiniMaxAI/MiniMax-M3-MXFP8 (427B total / 26B active MoE, MSA sparse attention, ~444 GB MXFP8 checkpoint)"
-    - "Image: vllm/vllm-openai:minimax-m3 (multi-arch m3_release build, vllm-project/vllm#45381); dynamo.install=true + wheel 1.2.0.dev20260526 layers in the dynamo runtime + NIXL"
-    - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y + NCCL_CUMEM_ENABLE=1, cuda_ipc UCX_TLS); every worker = one 4-GPU node (444 GB checkpoint can't fit in fewer)"
-    - "5 disagg topologies: 1P1D TP4 (2n), 1P1D TP4+EP4 (2n), 1P2D TP4+EP4 (3n), 2P1D TP4+EP4 (3n), 1P1D DEP4 (2n)"
-    - "Concurrency sweep: TP4 4-64, TP4EP4 64-512, 1P2D/2P1D 256-1024, DEP4 512-2048"
-    - "--block-size 128 mandatory everywhere (MSA sparse/index cache alignment); --language-model-only for text-only benchmarks"
+    - "Initial submission: MiniMax-M3 MXFP8 disaggregated rack-scale wide-EP vLLM sweep for GB200 via Dynamo"
+    - "Model: MiniMaxAI/MiniMax-M3-MXFP8 (427B total / 26B active MoE, 128 routed experts top-4, MSA sparse attention, ~444 GB MXFP8 checkpoint)"
+    - "Image: vllm/vllm-openai:minimax-m3, rebuilt from m3_release HEAD 022448dd (vllm-project/vllm#45381, gates trtllm-gen page>=128); dynamo.install=true + wheel 1.2.0.dev20260526 layers in the dynamo runtime + NIXL"
+    - "FLASHINFER (trtllm-gen) attention on Blackwell + block-size 128 (MSA index-cache alignment); --language-model-only for text-only benchmarks"
+    - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y, cuda_ipc UCX_TLS, NCCL_CUMEM_ENABLE=1 + enable-sleep-mode cuMem allocator, numa-bind)"
+    - "Rack-scale wide expert-parallel (deepseek-v4 megamoe style; DEP = DP-attn + EP): EP8/EP16 spans multiple GB200 nodes vs B200's 8-GPU NVLink island"
+    - "5 topologies: 1P1D TP4 (2n, low-lat), 1P1D DEP8 (4n), 1P1D DEP8->DEP16 (6n, wide decode), 2P1D DEP8->DEP16 (8n), 4P1D DEP8->DEP16 (12n, max tput)"
+    - "Concurrency sweep: TP4 4-64, DEP8 128-512, DEP8->DEP16 512-2048, 2P1D 2048-4096, 4P1D 4096-8192"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734
 
 - config-keys:

From c8cd5670cc1878c9d9109c8b212c2e02adb7eb98 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Sat, 13 Jun 2026 23:00:11 -0700
Subject: [PATCH 8/8] feat: tune 1k1k low-conc latency + add 8k1k sweep for M3
 GB200
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1k1k TP4 low-conc tuning: stream-interval 1 (was 128 decode / 32
prefill), cudagraph cap 128 (was 512), conc range extended to 1-64
(was 4-64) to match B200 coverage.

8k1k sweep: 5 disagg recipes mirroring the 1k1k megamoe ladder
(TP4, DEP8, DEP8→DEP16, 2P1D, 4P1D) with max-model-len 9472
(74 × 128-token blocks = ISL+OSL+256 headroom). Concurrencies shifted ~4x
lower for 8x heavier prefill: TP4 1-16, DEP8 32-128,
DEP8→DEP16 128-512, 2P1D 512-1024, 4P1D 1024-2048.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml            |  81 ++++++++++++-
 .../1k1k/disagg-gb200-1p1d-tp4-2n.yaml        |   8 +-
 .../8k1k/disagg-gb200-1p1d-dep8-4n.yaml       | 111 ++++++++++++++++++
 .../8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml | 111 ++++++++++++++++++
 .../8k1k/disagg-gb200-1p1d-tp4-2n.yaml        | 106 +++++++++++++++++
 .../8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml | 110 +++++++++++++++++
 .../disagg-gb200-4p1d-dep8-dep16-12n.yaml     | 110 +++++++++++++++++
 perf-changelog.yaml                           |   3 +-
 8 files changed, 634 insertions(+), 6 deletions(-)
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 70ec293af..32957e282 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -11807,7 +11807,7 @@ minimaxm3-fp8-gb200-dynamo-vllm:
       search-space:
       # Low latency: 1P+1D TP4 (pure TP, no EP), 2 nodes (4 GPU each). Wide EP
       # would idle DP ranks at small concurrencies, so stay narrow here.
-      - conc-list: [4, 8, 16, 32, 64]
+      - conc-list: [1, 2, 4, 8, 16, 32, 64]
         prefill:
           num-worker: 1
           tp: 4
@@ -11885,6 +11885,85 @@ minimaxm3-fp8-gb200-dynamo-vllm:
           ep: 16
           dp-attn: true
 
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # Low latency 8k1k: 1P+1D TP4, 2 nodes. stream-interval 1 + cudagraph
+      # cap 128 for best interactivity at small concurrencies.
+      - conc-list: [1, 2, 4, 8, 16]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+
+      # Mid curve 8k1k: 1P+1D DEP8, 4 nodes.
+      - conc-list: [32, 64, 128]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+
+      # Wide decode 8k1k: DEP8 prefill -> DEP16 decode, 6 nodes.
+      - conc-list: [128, 256, 512]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+
+      # Prefill-scaled 8k1k: 2P+1D, 8 nodes. Double prefill absorbs 8k ISL.
+      - conc-list: [512, 1024]
+        prefill:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+
+      # Max throughput 8k1k: 4P+1D, 12 nodes.
+      - conc-list: [1024, 2048]
+        prefill:
+          num-worker: 4
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+
 # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
 # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
 # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
index b60b17515..f3e79340a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
@@ -83,7 +83,7 @@ backend:
       no-enable-prefix-caching: true
       numa-bind: true
       enable-sleep-mode: true
-      stream-interval: 32
+      stream-interval: 1
 
     decode:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
@@ -92,7 +92,7 @@ backend:
       max-model-len: 2304
       max-num-seqs: 256
       max-num-batched-tokens: 256
-      max-cudagraph-capture-size: 512
+      max-cudagraph-capture-size: 128
       block-size: 128
       attention-backend: FLASHINFER
       language-model-only: true
@@ -102,10 +102,10 @@ backend:
       no-enable-prefix-caching: true
       numa-bind: true
       enable-sleep-mode: true
-      stream-interval: 128
+      stream-interval: 1
 
 benchmark:
   type: "sa-bench"
   isl: 1024
   osl: 1024
-  concurrencies: "4x8x16x32x64"
+  concurrencies: "1x2x4x8x16x32x64"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml
new file mode 100644
index 000000000..f6f2c7874
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml
@@ -0,0 +1,111 @@
+name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-8k1k"
+
+# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (mid curve, 8k1k).
+# Prefill DEP8 (DP-attn + EP across 8 GPU / 2 nodes) -> NixlConnector ->
+# Decode DEP8 (8 GPU / 2 nodes) = 4 nodes. Rack-scale expert parallel
+# over the NVL72 NVLink fabric. M3 has 128 routed experts so EP8 shards
+# 16 experts/rank. FLASHINFER attention, block-size 128.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720  # x 10 s interval = ~2 h budget (assumes one probe/interval)
+  interval_seconds: 10
+
+# 4 nodes total: 2 prefill + 2 decode, 4 GPUs each; one 8-GPU worker per role.
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 2
+  decode_nodes: 2
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 8  # = prefill_nodes (2) x gpus_per_node (4)
+  gpus_per_decode: 8  # = decode_nodes (2) x gpus_per_node (4)
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null  # NixlConnector is named in kv-transfer-config below
+
+  prefill_environment:  # same values as decode_environment
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"  # 1 h, going by the _S suffix
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"  # cuda_ipc over the NVL72 NVLink fabric
+    NCCL_CUMEM_ENABLE: "1"  # goes with enable-sleep-mode (cuMem allocator)
+
+  decode_environment:  # same values as prefill_environment
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1  # DEP8: no TP; 8 DP ranks with expert parallel
+      pipeline-parallel-size: 1
+      data-parallel-size: 8  # = gpus_per_prefill
+      data-parallel-rpc-port: 13346  # decode uses 13345
+      enable-expert-parallel: true
+      enforce-eager: true  # prefill only; decode sets a cudagraph capture size
+      max-model-len: 9472  # 74 x 128 (block-size); isl 8192 + osl 1024 = 9216 fits
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384  # 2 x isl
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1  # DEP8, same layout as prefill
+      pipeline-parallel-size: 1
+      data-parallel-size: 8  # = gpus_per_decode
+      data-parallel-rpc-port: 13345  # prefill uses 13346
+      enable-expert-parallel: true
+      max-model-len: 9472  # same limit as prefill
+      max-num-seqs: 512  # seqs, batched tokens and cudagraph cap are all 512
+      max-num-batched-tokens: 512
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 128  # prefill uses 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "32x64x128"  # sweep points 32, 64, 128 (presumably 'x'-separated)
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
new file mode 100644
index 000000000..0d7d44843
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
@@ -0,0 +1,111 @@
+name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-dep16-8k1k"
+
+# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (wide decode, 8k1k).
+# Prefill DEP8 (8 GPU / 2 nodes) -> NixlConnector -> Decode DEP16
+# (16 GPU / 4 nodes) = 6 nodes. Rack-scale decode throughput over NVL72
+# NVLink -- EP16 across 4 nodes is the regime B200 can't reach. M3 has
+# 128 routed experts: EP16 = 8 experts/rank. FLASHINFER, block-size 128.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720  # x 10 s interval = ~2 h budget (assumes one probe/interval)
+  interval_seconds: 10
+
+# 6 nodes total: 2 prefill + 4 decode, 4 GPUs each; one worker per role.
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 2
+  decode_nodes: 4
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 8  # = prefill_nodes (2) x gpus_per_node (4)
+  gpus_per_decode: 16  # = decode_nodes (4) x gpus_per_node (4)
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null  # NixlConnector is named in kv-transfer-config below
+
+  prefill_environment:  # same values as decode_environment
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"  # 1 h, going by the _S suffix
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"  # cuda_ipc over the NVL72 NVLink fabric
+    NCCL_CUMEM_ENABLE: "1"  # goes with enable-sleep-mode (cuMem allocator)
+
+  decode_environment:  # same values as prefill_environment
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1  # DEP8: no TP; 8 DP ranks with expert parallel
+      pipeline-parallel-size: 1
+      data-parallel-size: 8  # = gpus_per_prefill
+      data-parallel-rpc-port: 13346  # decode uses 13345
+      enable-expert-parallel: true
+      enforce-eager: true  # prefill only; decode sets a cudagraph capture size
+      max-model-len: 9472  # 74 x 128 (block-size); isl 8192 + osl 1024 = 9216 fits
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384  # 2 x isl
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1  # DEP16: no TP; 16 DP ranks with expert parallel
+      pipeline-parallel-size: 1
+      data-parallel-size: 16  # = gpus_per_decode
+      data-parallel-rpc-port: 13345  # prefill uses 13346
+      enable-expert-parallel: true
+      max-model-len: 9472  # same limit as prefill
+      max-num-seqs: 512  # seqs, batched tokens and cudagraph cap are all 512
+      max-num-batched-tokens: 512
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 128  # prefill uses 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "128x256x512"  # same points as the master conc-list for this recipe
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml
new file mode 100644
index 000000000..b0602354c
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml
@@ -0,0 +1,106 @@
+name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-8k1k"
+
+# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency, 8k1k).
+# Prefill (TP4, 1 node) -> NixlConnector -> Decode (TP4, 1 node). Pure
+# TP, no expert parallel: lowest TTFT/ITL at small concurrencies where
+# wide EP would leave DP ranks idle. FLASHINFER (trtllm-gen) attention,
+# block-size 128 (MSA index-cache alignment, vllm#45381 @ 022448dd).
+# Low-conc tuned: stream-interval 1, cudagraph cap 128.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720  # x 10 s interval = ~2 h budget (assumes one probe/interval)
+  interval_seconds: 10
+
+# 2 nodes total: 1 prefill + 1 decode, 4 GPUs each; one TP4 worker per role.
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 4  # = prefill_nodes (1) x gpus_per_node (4)
+  gpus_per_decode: 4  # = decode_nodes (1) x gpus_per_node (4)
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null  # NixlConnector is named in kv-transfer-config below
+
+  prefill_environment:  # same values as decode_environment
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"  # 1 h, going by the _S suffix
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"  # cuda_ipc over the NVL72 NVLink fabric
+    NCCL_CUMEM_ENABLE: "1"  # goes with enable-sleep-mode (cuMem allocator)
+
+  decode_environment:  # same values as prefill_environment
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4  # = gpus_per_prefill; no DP/EP keys in this recipe
+      pipeline-parallel-size: 1
+      enforce-eager: true  # prefill only; decode sets a cudagraph capture size
+      max-model-len: 9472  # 74 x 128 (block-size); isl 8192 + osl 1024 = 9216 fits
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384  # 2 x isl
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 1  # low-conc tuning noted in the header
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4  # = gpus_per_decode
+      pipeline-parallel-size: 1
+      max-model-len: 9472  # same limit as prefill
+      max-num-seqs: 256  # equals max-num-batched-tokens
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 128  # low-conc cap (header); below max-num-seqs
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 1  # low-conc tuning noted in the header
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1x2x4x8x16"  # sweep points 1..16 (presumably 'x'-separated)
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
new file mode 100644
index 000000000..6a0765c60
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
@@ -0,0 +1,110 @@
+name: "minimax-m3-vllm-disagg-gb200-2p1d-dep8-dep16-8k1k"
+
+# MiniMax-M3 disaggregated 2P+1D recipe for GB200 (prefill-scaled, 8k1k).
+# 2x Prefill DEP8 (4 nodes) -> NixlConnector -> 1x Decode DEP16
+# (4 nodes) = 8 nodes. Double prefill workers absorb 8k ISL compute;
+# rack-scale DEP16 decode across NVL72. FLASHINFER, block-size 128.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720  # x 10 s interval = ~2 h budget (assumes one probe/interval)
+  interval_seconds: 10
+
+# 8 nodes total: 4 prefill (2 workers x 8 GPUs) + 4 decode (1 worker x 16 GPUs).
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 4
+  decode_nodes: 4
+  prefill_workers: 2
+  decode_workers: 1
+  gpus_per_prefill: 8  # per worker; 2 workers x 8 = 16 GPUs = 4 nodes
+  gpus_per_decode: 16  # = decode_nodes (4) x gpus_per_node (4)
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null  # NixlConnector is named in kv-transfer-config below
+
+  prefill_environment:  # same values as decode_environment
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"  # 1 h, going by the _S suffix
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"  # cuda_ipc over the NVL72 NVLink fabric
+    NCCL_CUMEM_ENABLE: "1"  # goes with enable-sleep-mode (cuMem allocator)
+
+  decode_environment:  # same values as prefill_environment
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:  # presumably applied to each of the 2 prefill workers; confirm
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1  # DEP8: no TP; 8 DP ranks with expert parallel
+      pipeline-parallel-size: 1
+      data-parallel-size: 8  # = gpus_per_prefill
+      data-parallel-rpc-port: 13346  # decode uses 13345
+      enable-expert-parallel: true
+      enforce-eager: true  # prefill only; decode sets a cudagraph capture size
+      max-model-len: 9472  # 74 x 128 (block-size); isl 8192 + osl 1024 = 9216 fits
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384  # 2 x isl
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1  # DEP16: no TP; 16 DP ranks with expert parallel
+      pipeline-parallel-size: 1
+      data-parallel-size: 16  # = gpus_per_decode
+      data-parallel-rpc-port: 13345  # prefill uses 13346
+      enable-expert-parallel: true
+      max-model-len: 9472  # same limit as prefill
+      max-num-seqs: 512  # seqs, batched tokens and cudagraph cap are all 512
+      max-num-batched-tokens: 512
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 128  # prefill uses 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "512x1024"  # same points as the master conc-list for this recipe
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml
new file mode 100644
index 000000000..9e4ff3c2b
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml
@@ -0,0 +1,110 @@
+name: "minimax-m3-vllm-disagg-gb200-4p1d-dep8-dep16-8k1k"
+
+# MiniMax-M3 disaggregated 4P+1D recipe for GB200 (max throughput, 8k1k).
+# 4x Prefill DEP8 (8 nodes) -> NixlConnector -> 1x Decode DEP16
+# (4 nodes) = 12 nodes within one NVL72 rack. Maximises prefill
+# bandwidth for 8k ISL; rack-scale DEP16 decode. FLASHINFER, block-size 128.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720  # x 10 s interval = ~2 h budget (assumes one probe/interval)
+  interval_seconds: 10
+
+# 12 nodes total: 8 prefill (4 workers x 8 GPUs) + 4 decode (1 worker x 16 GPUs).
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 8
+  decode_nodes: 4
+  prefill_workers: 4
+  decode_workers: 1
+  gpus_per_prefill: 8  # per worker; 4 workers x 8 = 32 GPUs = 8 nodes
+  gpus_per_decode: 16  # = decode_nodes (4) x gpus_per_node (4)
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null  # NixlConnector is named in kv-transfer-config below
+
+  prefill_environment:  # same values as decode_environment
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"  # 1 h, going by the _S suffix
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"  # cuda_ipc over the NVL72 NVLink fabric
+    NCCL_CUMEM_ENABLE: "1"  # goes with enable-sleep-mode (cuMem allocator)
+
+  decode_environment:  # same values as prefill_environment
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:  # presumably applied to each of the 4 prefill workers; confirm
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1  # DEP8: no TP; 8 DP ranks with expert parallel
+      pipeline-parallel-size: 1
+      data-parallel-size: 8  # = gpus_per_prefill
+      data-parallel-rpc-port: 13346  # decode uses 13345
+      enable-expert-parallel: true
+      enforce-eager: true  # prefill only; decode sets a cudagraph capture size
+      max-model-len: 9472  # 74 x 128 (block-size); isl 8192 + osl 1024 = 9216 fits
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384  # 2 x isl
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1  # DEP16: no TP; 16 DP ranks with expert parallel
+      pipeline-parallel-size: 1
+      data-parallel-size: 16  # = gpus_per_decode
+      data-parallel-rpc-port: 13345  # prefill uses 13346
+      enable-expert-parallel: true
+      max-model-len: 9472  # same limit as prefill
+      max-num-seqs: 512  # seqs, batched tokens and cudagraph cap are all 512
+      max-num-batched-tokens: 512
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 128  # prefill uses 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1024x2048"  # same points as the master conc-list for this recipe
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 8329ac1da..46ac06a08 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3659,7 +3659,8 @@
     - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y, cuda_ipc UCX_TLS, NCCL_CUMEM_ENABLE=1 + enable-sleep-mode cuMem allocator, numa-bind)"
     - "Rack-scale wide expert-parallel (deepseek-v4 megamoe style; DEP = DP-attn + EP): EP8/EP16 spans multiple GB200 nodes vs B200's 8-GPU NVLink island"
     - "5 topologies: 1P1D TP4 (2n, low-lat), 1P1D DEP8 (4n), 1P1D DEP8->DEP16 (6n, wide decode), 2P1D DEP8->DEP16 (8n), 4P1D DEP8->DEP16 (12n, max tput)"
-    - "Concurrency sweep: TP4 4-64, DEP8 128-512, DEP8->DEP16 512-2048, 2P1D 2048-4096, 4P1D 4096-8192"
+    - "1k1k concurrency sweep: TP4 1-64 (low-conc latency tuned: stream-interval 1, cudagraph cap 128), DEP8 128-512, DEP8->DEP16 512-2048, 2P1D 2048-4096, 4P1D 4096-8192"
+    - "8k1k concurrency sweep (same 5 topologies, shifted ~4x lower for 8x heavier prefill): TP4 1-16, DEP8 32-128, DEP8->DEP16 128-512, 2P1D 512-1024, 4P1D 1024-2048; max-model-len 9472 (74*128)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734
 
 - config-keys: