SemiAnalysisAI · Oseltamivir · Jun 13, 2026 · Jun 13, 2026 · Jun 13, 2026 · Jun 13, 2026
@@ -2871,3 +2871,33 @@ minimaxm3-fp8-mi300x-vllm:
       search-space:
       - { tp: 8, conc-start: 4, conc-end: 64 }
       - { tp: 8, ep: 8, conc-start: 128, conc-end: 256 }
+
+# EAGLE3 speculative-decoding (spec-decoding: mtp) variant of
+# minimaxm3-fp8-mi300x-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the
+# Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens). Same TP8-only
+# search space as the non-MTP MI300X entry (gfx942 192 GB is memory-tight, like
+# H100), with the TP8 latency rows started at conc 1 to capture single-request
+# latency — matching the H100/MI355X MTP recipes. The shipped ROCm image lacks
+# SupportsEagle3 on the AMD MiniMax-M3 model, so the recipe applies that fix
+# in-place at runtime (functionstackx/vllm#1, upstream vllm-project/vllm#45546;
+# validated green on MI355X) before serving.
+minimaxm3-fp8-mi300x-vllm-mtp:
+  image: vllm/vllm-openai-rocm:minimax-m3
+  model: MiniMaxAI/MiniMax-M3-MXFP8
+  model-prefix: minimaxm3
+  runner: mi300x
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 1, conc-end: 128, spec-decoding: mtp }
+      - { tp: 8, ep: 8, conc-start: 256, conc-end: 256, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
+      - { tp: 8, ep: 8, conc-start: 128, conc-end: 256, spec-decoding: mtp }
@@ -11773,6 +11773,101 @@ minimaxm2.5-fp8-gb300-dynamo-vllm:
           ep: 4
           dp-attn: true
 
+# MiniMax-M3 GB300 full sweep — safetensors-load-strategy removed from all
+# GB300 recipes (host-memory OOM with prefetch on CW Grace Blackwell nodes).
+# sbatch + srun mem=0 both required (DefMemPerCPU=4096 cgroup limit).
+minimaxm3-fp8-gb300-dynamo-vllm:
+  image: vllm/vllm-openai:minimax-m3
+  model: MiniMaxAI/MiniMax-M3-MXFP8
+  model-prefix: minimaxm3
+  runner: gb300-cw
+  precision: fp8
+  framework: dynamo-vllm
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # Disagg-only. Every recipe enables NixlConnector KV transfer over
+      # multi-node NVLink (UCX_CUDA_IPC_ENABLE_MNNVL=y + --enable-cumem-allocator),
+      # which moved the cross-node prefill->decode KV handoff off the RDMA/TCP
+      # fallback (~268 MB/s) onto the NVL fabric (~1.4-1.7 GB/s): +17/+23/+49%
+      # out tok/s/gpu @ conc 64/128/256 (1P1D). GB300-only win — B300's 8-GPU
+      # IB islands cannot move KV over multi-node NVLink. GB300-cw is 4 GPU/node,
+      # so prefill and decode each take whole 4-GPU nodes (no 8-GPU collocation).
+      # 1P+1D TP4+EP4 split, 2 nodes (8 GPU). Best per-GPU efficiency:
+      # peaks at ~1646 out/s/gpu @ conc 256 (1k1k). Covers latency-to-mid.
+      - conc-list: [8, 16, 32, 64, 128, 256, 512]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: false
+
+      # Rack-saturating max throughput: 5 prefill + 12 decode, TP4 ep1 (17 nodes).
+      # Balanced prefill:decode ratio (single prefill starved the decode pool)
+      # and TP-only decode (Qwen3.5-397B-A17B pattern; M3 wide-EP all-to-all was slower).
+      - conc-list: [2048, 4096, 8192]
+        prefill:
+          num-worker: 5
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml"
+        decode:
+          num-worker: 12
+          tp: 4
+          ep: 1
+          dp-attn: false
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # 8k1k long context: prefill is heavier and KV is larger, so concurrency
+      # is lower than 1k1k. Same disagg shapes + multi-node-NVLink KV transfer.
+      # 1P+1D TP4+EP4 split, 2 nodes (8 GPU). Best per-GPU efficiency:
+      # peaks at ~1209 out/s/gpu @ conc 256 (8k1k). Covers latency-to-mid.
+      - conc-list: [8, 16, 32, 64, 128, 256]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: false
+
+      # Prefill-heavy rack-saturating: 10P + 7D, TP4 ep1 (17 nodes).
+      # At 8k context, prefill is 8x heavier — 5P:12D showed 616-req prefill
+      # backlog. DSR1/DSV4 GB300 patterns use 6-10 prefill workers for 8k1k.
+      - conc-list: [1024, 2048, 4096]
+        prefill:
+          num-worker: 10
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-10p7d-tp4ep1-17n.yaml"
+        decode:
+          num-worker: 7
+          tp: 4
+          ep: 1
+          dp-attn: false
+
 # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
 # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
 # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor

diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml
@@ -123,6 +123,11 @@ on:
 
 env:
   RANDOM_RANGE_RATIO: 0.8
+  # Day-zero models resolved via hf: ids download from the Hub inside the
+  # slurm job (srtctl pre-download + dynamo hub fetch). Anonymous requests
+  # get 429-rate-limited when several workers pull a 444 GB snapshot at
+  # once; sbatch/srun inherit this env so the token reaches the workers.
+  HF_TOKEN: ${{ secrets.INFERENCEX_OFFICIAL_RO_HF_TOKEN }}
   EXP_NAME: ${{ inputs.exp-name }}
   IMAGE: ${{ inputs.image }}
   MODEL_PREFIX: ${{ inputs.model-prefix }}

diff --git a/...ti_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml b/...ti_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-tp4ep4-2n.yaml
@@ -0,0 +1,126 @@
+name: "minimax-m3-vllm-disagg-gb300-1p1d-tp4ep4-2n-1k1k"
+
+# MiniMax-M3 disaggregated 1P+1D recipe for GB300, 2 nodes.
+# 1 node prefill (4 GPUs, TP4+EP4) → NixlConnector → 1 node decode (4 GPUs, TP4+EP4).
+# --block-size 128 is mandatory (MSA sparse/index cache alignment).
+# safetensors-load-strategy omitted — prefetch OOMs on CW GB300 host memory.
+
+model:
+  path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+# CW gb300 has no per-GPU Slurm defaults: without sbatch-level mem=0 the
+# job allocation is ntasks x DefMemPerCPU = 4 GB/node and worker cgroups
+# OOM-kill during engine init; srun_options.mem=0 alone only grants a
+# step what the job already holds. cpus-per-task=72 (one NUMA socket)
+# keeps the gpu-less infra step (etcd/nats) off the 1-CPU default.
+# Full rationale: vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml.
+sbatch_directives:
+  mem: "0"
+  cpus-per-task: "72"
+srun_options:
+  mem: "0"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+extra_mount:
+  - "__M3_HF_HOME__:__M3_HF_HOME__"
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    HF_HOME: "__M3_HF_HOME__"
+    # Cache-only at runtime: the launcher pre-stages the full snapshot and
+    # verifies it offline before submit, so workers must NOT re-fetch. Without
+    # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache;
+    # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
+    # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
+    HF_HUB_OFFLINE: "1"
+    # Multi-node-NVLink KV transfer: keep cross-node prefill->decode KV on the
+    # NVL fabric (5-6x vs RDMA/TCP fallback, +17-49% tok/s/gpu). Needs VMM-
+    # registered KV (enable-cumem-allocator below).
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    HF_HOME: "__M3_HF_HOME__"
+    # Cache-only at runtime: the launcher pre-stages the full snapshot and
+    # verifies it offline before submit, so workers must NOT re-fetch. Without
+    # this, dynamo fetch_model takes a per-blob .lock on the shared NFS cache;
+    # co-fetching workers (e.g. both nodes of a TP8-2n job) collide and the
+    # loser dies with "Lock acquisition failed" (no retry like Python hf_hub).
+    HF_HUB_OFFLINE: "1"
+    # Multi-node-NVLink KV transfer: keep cross-node prefill->decode KV on the
+    # NVL fabric (5-6x vs RDMA/TCP fallback, +17-49% tok/s/gpu). Needs VMM-
+    # registered KV (enable-cumem-allocator below).
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      enable-cumem-allocator: true
+      enforce-eager: true
+      max-model-len: 2304
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      enable-cumem-allocator: true
+      max-model-len: 2304
+      max-num-seqs: 256
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "64x128x256x512"
diff --git a/..._node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml b/..._node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-5p12d-tp4ep1-17n.yaml
@@ -0,0 +1,102 @@
+name: "minimax-m3-vllm-disagg-gb300-5p12d-tp4ep1-17n-1k1k"
+
+# MiniMax-M3 GB300 max-throughput disagg, rack-saturating (17 nodes / 68 GPU).
+# Lessons baked in (validated on gb300-cw): (1) TP-only decode (ep1, NO expert
+# parallelism) per Qwen3.5-397B-A17B recipes — M3's MoE all-to-all overhead
+# made wide-EP/DP-attention slower; (2) PREFILL is the bottleneck, not decode
+# or KV transfer — a single prefill starved 64 decode GPUs (3967 reqs queued),
+# so this balances 5 prefill : 12 decode TP4 workers (prefill backlog -> 0,
+# +57% tok/s/gpu vs the 1-prefill ratio at conc 2048). NixlConnector KV
+# transfer stays on multi-node NVLink (MNNVL + cumem). --block-size 128
+# mandatory (MSA cache); text-only -> language-model-only.
+
+model:
+  path: "hf:MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+sbatch_directives:
+  mem: "0"
+  cpus-per-task: "72"
+srun_options:
+  mem: "0"
+slurm:
+  time_limit: "8:00:00"
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+extra_mount:
+  - "__M3_HF_HOME__:__M3_HF_HOME__"
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 5
+  decode_nodes: 12
+  prefill_workers: 5
+  decode_workers: 12
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    HF_HOME: "__M3_HF_HOME__"
+    HF_HUB_OFFLINE: "1"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    HF_HOME: "__M3_HF_HOME__"
+    HF_HUB_OFFLINE: "1"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-cumem-allocator: true
+      enforce-eager: true
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      max-model-len: 2304
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-cumem-allocator: true
+      max-num-seqs: 256
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 512
+      max-model-len: 2304
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "2048x4096x8192"