SemiAnalysisAI · Oseltamivir · Jun 13, 2026 · Jun 13, 2026 · Jun 13, 2026 · Jun 13, 2026
@@ -2780,3 +2780,39 @@ dsv4-fp4-mi355x-atom-disagg:
           dp-attn: false
           additional-settings:
           - "DECODE_NODES=1"
+
+# MiniMax-M3 day-zero on AMD MI325X (Vultr fleet)
+# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3?hardware=mi325x). ROCm sibling of the
+# B300 minimaxm3-fp8-b300-vllm entry: 427B total / 26B active MoE with MSA sparse
+# attention, MXFP8 checkpoint (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) on the dedicated
+# vllm/vllm-openai-rocm:minimax-m3 image. --block-size 128 is mandatory (MSA sparse/
+# index cache alignment; default 16 crashes "No common block size for 16" on AMD);
+# --attention-backend TRITON_ATTN is the MI325X recipe backend; --language-model-only
+# skips the vision encoder for text-only throughput. New mi325x-vultr runner-type ->
+# launch_mi325x-vultr.sh. tp2 is dropped vs B300: ~444 GB MXFP8 at TP2 = 222 GB/GPU
+# would OOM on the 256 GB MI325X.
+minimaxm3-fp8-mi325x-vllm:
+  image: vllm/vllm-openai-rocm:minimax-m3
+  model: MiniMaxAI/MiniMax-M3-MXFP8
+  model-prefix: minimaxm3
+  runner: mi325x-vultr  # Vultr fleet runner-type (see runners.yaml) -> launch_mi325x-vultr.sh
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 1, conc-end: 64 }
+      - { tp: 4, conc-start: 1, conc-end: 64 }
+      - { tp: 8, ep: 8, conc-start: 4, conc-end: 512 }
+      - { tp: 4, ep: 4, conc-start: 64, conc-end: 256 }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 1, conc-end: 64 }
+      - { tp: 4, conc-start: 1, conc-end: 128 }
+      - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 512 }
diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml
@@ -126,6 +126,13 @@ mi325x-disagg:
 - 'mi325x-amds_06'
 - 'mi325x-amds_07'
 - 'mi325x-amds_08'
+mi325x-vultr:  # Vultr MI325X fleet; jobs on these runners go through runners/launch_mi325x-vultr.sh
+- 'mi325x-vultr_00'
+- 'mi325x-vultr_01'
+- 'mi325x-vultr_02'
+- 'mi325x-vultr_03'
+- 'mi325x-vultr_04'
+- 'mi325x-vultr_05'
 mi355x:
 - 'mi355x-amds_00'
 - 'mi355x-amds_01'

diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi325x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi325x.sh
@@ -0,0 +1,109 @@
+#!/usr/bin/env bash
+
+# MiniMax-M3 MXFP8 MI325X (Vultr) single-node vLLM recipe
+# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3?hardware=mi325x). ROCm sibling of
+# minimaxm3_fp8_b300.sh: same M3 essentials (--block-size 128 for MSA sparse/index
+# cache alignment, --language-model-only for text-only throughput, conc-scaled
+# cudagraph capture) with the MI325X recipe's --attention-backend TRITON_ATTN and the
+# gfx942 ROCm idioms from minimaxm2.5_fp8_mi325x.sh (HIP_VISIBLE_DEVICES for vLLM 0.14+
+# Ray, VLLM_ROCM_USE_AITER). The vultr launcher bind-mounts the staged HF cache over
+# HF_HUB_CACHE, so `hf download` reuses staged weights (or pulls ~444 GB MXFP8 on first
+# run); the server is launched with MODEL directly, no MODEL_PATH split.
+
+source "$(dirname "$0")/../../benchmark_lib.sh"
+
+check_env_vars \
+    MODEL \
+    TP \
+    EP_SIZE \
+    DP_ATTENTION \
+    CONC \
+    ISL \
+    OSL \
+    MAX_MODEL_LEN \
+    RANDOM_RANGE_RATIO \
+    RESULT_FILENAME
+
+if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi  # skipped when MODEL is an absolute path
+
+# Set HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES for Ray compatibility in vLLM 0.14+
+if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
+    export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
+fi
+
+export VLLM_ROCM_USE_AITER=1  # NOTE(review): presumably enables AITER kernels in vLLM's ROCm path - confirm
+
+if [[ -n "${SLURM_JOB_ID:-}" ]]; then
+  echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
+fi
+
+rocm-smi || true  # diagnostics only; a failure here is ignored
+amd-smi || true  # diagnostics only; a failure here is ignored
+
+SERVER_LOG=/workspace/server.log  # vllm serve stdout/stderr is redirected to this file
+
+# ~444 GB of MXFP8 weights off shared FS; engine startup can exceed the
+# default 600s readiness window.
+export VLLM_ENGINE_READY_TIMEOUT_S=3600
+
+# Plain TP by default; DP-attention swaps to TP=1 x DP=$TP with EP, and EP_SIZE > 1 adds EP on top of TP.
+PARALLEL_ARGS="--tensor-parallel-size=$TP"
+if [ "${DP_ATTENTION}" = "true" ]; then
+  PARALLEL_ARGS="--tensor-parallel-size=1 --data-parallel-size=$TP --enable-expert-parallel"
+elif [ "$EP_SIZE" -gt 1 ]; then
+  PARALLEL_ARGS+=" --enable-expert-parallel"
+fi
+
+# Graph capture beyond the request concurrency is wasted on fixed-seq-len runs:
+# start at 4, double until CONC is covered, then clamp to vLLM's 2048 ceiling.
+CAPTURE_SIZE=4
+until (( CAPTURE_SIZE >= CONC )); do CAPTURE_SIZE=$((CAPTURE_SIZE << 1)); done
+if (( CAPTURE_SIZE > 2048 )); then CAPTURE_SIZE=2048; fi
+
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context  # NOTE(review): presumably from benchmark_lib.sh and provides EVAL_MAX_MODEL_LEN - confirm
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"  # eval-only runs serve with the eval context length
+fi
+# Start GPU monitoring (power, temperature, clocks every second)
+start_gpu_monitor
+
+set -x  # trace commands (including the full vllm serve line) until the set +x at the end
+vllm serve "$MODEL" --host 0.0.0.0 --port $PORT \
+$PARALLEL_ARGS \
+--gpu-memory-utilization 0.90 \
+--max-model-len $MAX_MODEL_LEN \
+--block-size 128 \
+--attention-backend TRITON_ATTN \
+--language-model-only \
+--max-cudagraph-capture-size $CAPTURE_SIZE \
+--max-num-batched-tokens "$((ISL * 2 ))" \
+--no-enable-prefix-caching \
+--trust-remote-code > $SERVER_LOG 2>&1 &
+
+SERVER_PID=$!  # PID of the backgrounded vllm serve
+
+# Wait for the server to be ready (the helper receives the port, the log file and the server PID)
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts "$((CONC * 10))" \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/ \
+    --trust-remote-code
+
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+    run_eval --framework lm-eval --port "$PORT"
+    append_lm_eval_summary
+fi
+
+# Stop GPU monitoring
+stop_gpu_monitor
+set +x
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3649,6 +3649,15 @@
     - "Serves from the launch_b300-nv.sh MODEL/MODEL_PATH split (model not in the SRE-staged /scratch/models list -> writable /data/models)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1724
 
+- config-keys:
+    - minimaxm3-fp8-mi325x-vllm
+  description:
+    - "Initial submission: MiniMax-M3 MXFP8 single-node vLLM benchmark on MI325X (Vultr fleet) (model: MiniMaxAI/MiniMax-M3-MXFP8, 427B total / 26B active MoE with MSA sparse attention)"
+    - "Image: vllm/vllm-openai-rocm:minimax-m3"
+    - "New mi325x-vultr runner-type (launch_mi325x-vultr.sh); --block-size 128 mandatory (MSA sparse/index cache alignment), --attention-backend TRITON_ATTN, --language-model-only for text-only throughput, conc-scaled --max-cudagraph-capture-size"
+    - "Layouts: TP8 / TP4 (latency), TP8+EP8 (TEP throughput), TP8+EP8 dp-attn (DEP) across 1k1k and 8k1k; TP4+EP4 (TEP) on 1k1k only"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1738
+
 - config-keys:
     - minimaxm3-fp8-b200-vllm
   description:

diff --git a/runners/launch_mi325x-vultr.sh b/runners/launch_mi325x-vultr.sh
@@ -0,0 +1,72 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Pre-staged model weights / HF hub cache for the vultr mi325x fleet. Bind-mounted
+# over the container-side HF_HUB_CACHE (/mnt/hf_hub_cache/); the bench scripts'
+# `hf download "$MODEL"` resolves against the models--org--name caches already
+# staged here (e.g. DeepSeek-R1-0528, Qwen3.5-397B-A17B-FP8, GLM-5-FP8) so weights
+# are not re-downloaded from HF in CI.
+export HF_HUB_CACHE_MOUNT="/nfsdata/sa/models/"
+
+# enroot cache (import layer cache + the imported .sqsh images) for this fleet.
+# Node-local ext4 present at the same path on every compute node; import and run
+# happen in the same Slurm job on a single node, so node-local storage suffices.
+export ENROOT_CACHE_PATH="/enroot/sa"
+mkdir -p "$ENROOT_CACHE_PATH"
+
+PARTITION="compute"  # Slurm partition requested by salloc
+SQUASH_FILE="$ENROOT_CACHE_PATH/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"  # one .sqsh per image ref; / : @ # become _
+LOCK_FILE="${SQUASH_FILE}.lock"  # flock target that serializes the enroot import
+
+cleanup_stale_benchmark_logs() {
+    # Best-effort removal of leftover benchmark_logs: passwordless sudo first, then plain rm.
+    [[ -n "${GITHUB_WORKSPACE:-}" ]] || return 0
+    sudo -n rm -rf "$GITHUB_WORKSPACE/benchmark_logs" 2>/dev/null || \
+        rm -rf "$GITHUB_WORKSPACE/benchmark_logs" 2>/dev/null || true
+}
+cleanup_stale_benchmark_logs
+
+set -x
+
+# Exclude known-broken mi325x nodes:
+#   chi-mi325x-pod1-121: has a history of failing enroot container image import
+#                        (root-caused via #1467/#1468/#1469 sweep failures);
+#                        excluded for the same reason as the amds fleet.
+#   chi-mi325x-pod1-027: fails SLURM resume/boot — salloc grants an allocation then
+#                        relinquishes it with "Something is wrong with the boot of the
+#                        nodes" (run 27454108525), which gated the whole sweep at the
+#                        canary; excluded until the node is repaired.
+# `|| true`: a grep miss must not trip `set -e` on the assignment, or the -z check below never runs.
+JOB_ID=$(set +o pipefail; salloc --partition=$PARTITION --exclude=chi-mi325x-pod1-121.ord.vultr.cpe.ice.amd.com,chi-mi325x-pod1-027.ord.vultr.cpe.ice.amd.com --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+' || true)
+if [ -z "$JOB_ID" ]; then
+    echo "ERROR: salloc failed to allocate a job" >&2
+    exit 1
+fi
+
+export PORT=$(( 40000 + (JOB_ID % 10000) ))  # port in 40000-49999 derived from the Slurm job id
+
+trap 'rc=$?; scancel "$JOB_ID" 2>/dev/null || true; cleanup_stale_benchmark_logs; exit "$rc"' EXIT  # on any exit: release the allocation, clear logs, keep the exit code
+
+# Serialize concurrent imports of the same squash file with flock (waits up to 600s); a file unsquashfs can list is reused as-is.
+srun --jobid="$JOB_ID" --job-name="$RUNNER_NAME" bash -c "
+    set -euo pipefail
+    exec 9>\"$LOCK_FILE\"
+    flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE' >&2; exit 1; }
+    if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then
+        echo 'Squash file already exists and is valid, skipping import'
+    else
+        rm -f \"$SQUASH_FILE\"
+        enroot import -o \"$SQUASH_FILE\" docker://$IMAGE
+    fi
+"
+srun --jobid="$JOB_ID" \
+--container-image="$SQUASH_FILE" \
+--container-mounts="$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE" \
+--container-mount-home \
+--container-writable \
+--container-remap-root \
+--container-workdir=/workspace/ \
+--no-container-entrypoint --export=ALL \
+bash benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_mi325x.sh
+
+scancel $JOB_ID  # success path; the EXIT trap repeats the cancel with errors ignored