From c4756c7fc5265e771f4c807ff6b63280fd87d582 Mon Sep 17 00:00:00 2001 From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 13 Jun 2026 01:18:44 +0000 Subject: [PATCH 1/4] feat(runners): add mi325x-vultr launch script Add runners/launch_mi325x-vultr.sh for the vultr mi325x fleet. Modeled on launch_mi325x-amds.sh (same SKU, same compute partition, same single-node salloc/import/srun flow and *_mi325x.sh bench invocation), with the two cluster-specific paths: - enroot cache (import layer cache + imported .sqsh) at /enroot/sa - pre-staged model weights / HF hub cache at /nfsdata/sa/models/, bind-mounted over the container HF_HUB_CACHE so `hf download "$MODEL"` reuses the staged models--org--name caches instead of re-downloading from HF. Both paths are node-local ext4 at the same path on every compute node; import and run share one Slurm job on a single node, so node-local storage suffices. Co-Authored-By: Claude Opus 4.8 (1M context) --- runners/launch_mi325x-vultr.sh | 68 ++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 runners/launch_mi325x-vultr.sh diff --git a/runners/launch_mi325x-vultr.sh b/runners/launch_mi325x-vultr.sh new file mode 100644 index 000000000..24786d708 --- /dev/null +++ b/runners/launch_mi325x-vultr.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Pre-staged model weights / HF hub cache for the vultr mi325x fleet. Bind-mounted +# over the container-side HF_HUB_CACHE (/mnt/hf_hub_cache/); the bench scripts' +# `hf download "$MODEL"` resolves against the models--org--name caches already +# staged here (e.g. DeepSeek-R1-0528, Qwen3.5-397B-A17B-FP8, GLM-5-FP8) so weights +# are not re-downloaded from HF in CI. +export HF_HUB_CACHE_MOUNT="/nfsdata/sa/models/" + +# enroot cache (import layer cache + the imported .sqsh images) for this fleet. +# Node-local ext4 present at the same path on every compute node; import and run +# happen in the same Slurm job on a single node, so node-local storage suffices. +export ENROOT_CACHE_PATH="/enroot/sa" +mkdir -p "$ENROOT_CACHE_PATH" + +PARTITION="compute" +SQUASH_FILE="$ENROOT_CACHE_PATH/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +LOCK_FILE="${SQUASH_FILE}.lock" + +cleanup_stale_benchmark_logs() { + if [[ -n "${GITHUB_WORKSPACE:-}" ]]; then + sudo -n rm -rf "$GITHUB_WORKSPACE/benchmark_logs" 2>/dev/null || \ + rm -rf "$GITHUB_WORKSPACE/benchmark_logs" 2>/dev/null || true + fi +} +cleanup_stale_benchmark_logs + +set -x + +# Exclude known-broken mi325x node: +# chi-mi325x-pod1-121: has a history of failing enroot container image import +# (root-caused via #1467/#1468/#1469 sweep failures); +# excluded for the same reason as the amds fleet. +JOB_ID=$(set +o pipefail; salloc --partition=$PARTITION --exclude=chi-mi325x-pod1-121.ord.vultr.cpe.ice.amd.com --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') + +if [ -z "$JOB_ID" ]; then + echo "ERROR: salloc failed to allocate a job" >&2 + exit 1 +fi + +export PORT=$(( 40000 + (JOB_ID % 10000) )) + +trap 'rc=$?; scancel "$JOB_ID" 2>/dev/null || true; cleanup_stale_benchmark_logs; exit "$rc"' EXIT + +# Use flock to serialize concurrent imports to the same squash file +srun --jobid="$JOB_ID" --job-name="$RUNNER_NAME" bash -c " + set -euo pipefail + exec 9>\"$LOCK_FILE\" + flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE' >&2; exit 1; } + if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then + echo 'Squash file already exists and is valid, skipping import' + else + rm -f \"$SQUASH_FILE\" + enroot import -o \"$SQUASH_FILE\" docker://$IMAGE + fi +" +srun --jobid="$JOB_ID" \ +--container-image="$SQUASH_FILE" \ +--container-mounts="$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE" \ +--container-mount-home \ +--container-writable \ +--container-remap-root \ +--container-workdir=/workspace/ \ +--no-container-entrypoint --export=ALL \ +bash benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_mi325x.sh + +scancel $JOB_ID From e9a0c41892f3ee5c4f28c1a1384f91969a4249cd Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 12 Jun 2026 19:15:11 -0700 Subject: [PATCH 2/4] M3 --- .github/configs/amd-master.yaml | 36 ++++++ .github/configs/runners.yaml | 7 ++ .../fixed_seq_len/minimaxm3_fp8_mi325x.sh | 109 ++++++++++++++++++ perf-changelog.yaml | 9 ++ 4 files changed, 161 insertions(+) create mode 100755 benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi325x.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 70a79a273..7c52ff8d4 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2780,3 +2780,39 @@ dsv4-fp4-mi355x-atom-disagg: dp-attn: false additional-settings: - "DECODE_NODES=1" + +# MiniMax-M3 day-zero on AMD MI325X (Vultr fleet) +# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3?hardware=mi325x). ROCm sibling of the +# B300 minimaxm3-fp8-b300-vllm entry: 427B total / 26B active MoE with MSA sparse +# attention, MXFP8 checkpoint (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) on the dedicated +# vllm/vllm-openai-rocm:minimax-m3 image. --block-size 128 is mandatory (MSA sparse/ +# index cache alignment; default 16 crashes "No common block size for 16" on AMD); +# --attention-backend TRITON_ATTN is the MI325X recipe backend; --language-model-only +# skips the vision encoder for text-only throughput. New mi325x-vultr runner-type -> +# launch_mi325x-vultr.sh. tp2 is dropped vs B300: ~444 GB MXFP8 at TP2 = 222 GB/GPU +# would OOM on the 256 GB MI325X. +minimaxm3-fp8-mi325x-vllm: + image: vllm/vllm-openai-rocm:minimax-m3 + model: MiniMaxAI/MiniMax-M3-MXFP8 + model-prefix: minimaxm3 + runner: mi325x-vultr + precision: fp8 + framework: vllm + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 1, conc-end: 64 } + - { tp: 4, conc-start: 1, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 512 } + - { tp: 4, ep: 4, conc-start: 64, conc-end: 256 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 1, conc-end: 64 } + - { tp: 4, conc-start: 1, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 512 } diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index eee8405d0..6b7dcf952 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -115,6 +115,13 @@ mi325x-disagg: - 'mi325x-amds_06' - 'mi325x-amds_07' - 'mi325x-amds_08' +mi325x-vultr: +- 'mi325x-vultr_00' +- 'mi325x-vultr_01' +- 'mi325x-vultr_02' +- 'mi325x-vultr_03' +- 'mi325x-vultr_04' +- 'mi325x-vultr_05' mi355x: - 'mi355x-amds_00' - 'mi355x-amds_01' diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi325x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi325x.sh new file mode 100755 index 000000000..b7386d988 --- /dev/null +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi325x.sh @@ -0,0 +1,109 @@ +#!/usr/bin/env bash + +# MiniMax-M3 MXFP8 MI325X (Vultr) single-node vLLM recipe +# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3?hardware=mi325x). ROCm sibling of +# minimaxm3_fp8_b300.sh: same M3 essentials (--block-size 128 for MSA sparse/index +# cache alignment, --language-model-only for text-only throughput, conc-scaled +# cudagraph capture) with the MI325X recipe's --attention-backend TRITON_ATTN and the +# gfx942 ROCm idioms from minimaxm2.5_fp8_mi325x.sh (HIP_VISIBLE_DEVICES for vLLM 0.14+ +# Ray, VLLM_ROCM_USE_AITER). The vultr launcher bind-mounts the staged HF cache over +# HF_HUB_CACHE, so `hf download` reuses staged weights (or pulls ~444 GB MXFP8 on first +# run); the server is launched with MODEL directly, no MODEL_PATH split. + +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + EP_SIZE \ + DP_ATTENTION \ + CONC \ + ISL \ + OSL \ + MAX_MODEL_LEN \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi + +# Set HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES for Ray compatibility in vLLM 0.14+ +if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + +export VLLM_ROCM_USE_AITER=1 + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +rocm-smi || true +amd-smi || true + +SERVER_LOG=/workspace/server.log + +# ~444 GB of MXFP8 weights off shared FS; engine startup can exceed the +# default 600s readiness window. +export VLLM_ENGINE_READY_TIMEOUT_S=3600 + +if [ "${DP_ATTENTION}" = "true" ]; then + PARALLEL_ARGS="--tensor-parallel-size=1 --data-parallel-size=$TP --enable-expert-parallel" +elif [ "$EP_SIZE" -gt 1 ]; then + PARALLEL_ARGS="--tensor-parallel-size=$TP --enable-expert-parallel" +else + PARALLEL_ARGS="--tensor-parallel-size=$TP" +fi + +# Fixed-seq-len runs don't need graphs past the request concurrency: capture +# up to the next power of two >= CONC, capped at vLLM's 2048 ceiling. +CAPTURE_SIZE=4 +while (( CAPTURE_SIZE < CONC )); do CAPTURE_SIZE=$((CAPTURE_SIZE * 2)); done +(( CAPTURE_SIZE > 2048 )) && CAPTURE_SIZE=2048 + +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor + +set -x +vllm serve "$MODEL" --host 0.0.0.0 --port $PORT \ +$PARALLEL_ARGS \ +--gpu-memory-utilization 0.90 \ +--max-model-len $MAX_MODEL_LEN \ +--block-size 128 \ +--attention-backend TRITON_ATTN \ +--language-model-only \ +--max-cudagraph-capture-size $CAPTURE_SIZE \ +--max-num-batched-tokens "$((ISL * 2 ))" \ +--no-enable-prefix-caching \ +--trust-remote-code > $SERVER_LOG 2>&1 & + +SERVER_PID=$! + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --trust-remote-code + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +# Stop GPU monitoring +stop_gpu_monitor +set +x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index d29c9a5d3..f73c82abc 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3646,3 +3646,12 @@ - "Layouts: TP8 and TP4 (latency), TP4+EP4 / TP8+EP8 (TEP throughput), tp2-ep2, TP8+EP8 dp-attn (DEP) across 1k1k and 8k1k" - "Serves from the launch_b300-nv.sh MODEL/MODEL_PATH split (model not in the SRE-staged /scratch/models list -> writable /data/models)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1724 + +- config-keys: + - minimaxm3-fp8-mi325x-vllm + description: + - "Initial submission: MiniMax-M3 MXFP8 single-node vLLM benchmark on MI325X (Vultr fleet) (model: MiniMaxAI/MiniMax-M3-MXFP8, 427B total / 26B active MoE with MSA sparse attention)" + - "Image: vllm/vllm-openai-rocm:minimax-m3" + - "New mi325x-vultr runner-type (launch_mi325x-vultr.sh); --block-size 128 mandatory (MSA sparse/index cache alignment), --attention-backend TRITON_ATTN, --language-model-only for text-only throughput, conc-scaled --max-cudagraph-capture-size" + - "Layouts: TP8 / TP4 (latency), TP4+EP4 / TP8+EP8 (TEP throughput), TP8+EP8 dp-attn (DEP) across 1k1k and 8k1k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1738 From 441ba6d49249b1c67787d4a3664ba48c25675a50 Mon Sep 17 00:00:00 2001 From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 12 Jun 2026 19:20:59 -0700 Subject: [PATCH 3/4] Update amd-master.yaml --- .github/configs/amd-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 7c52ff8d4..196772cfc 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2795,7 +2795,7 @@ minimaxm3-fp8-mi325x-vllm: image: vllm/vllm-openai-rocm:minimax-m3 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 - runner: mi325x-vultr + runner: mi325x precision: fp8 framework: vllm multinode: false From 0bd8981e3e17b809128860e111128e06bfcf3dbf Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 12 Jun 2026 20:22:55 -0700 Subject: [PATCH 4/4] fix(runners): exclude chi-mi325x-pod1-027 from mi325x-vultr salloc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Node chi-mi325x-pod1-027 fails SLURM resume/boot — salloc grants an allocation then relinquishes it with "Something is wrong with the boot of the nodes" (run 27454108525), gating the minimaxm3-fp8-mi325x canary and thus the whole sweep. Add it to the --exclude list alongside the existing pod1-121 exclusion until the node is repaired. Co-Authored-By: Claude Opus 4.8 (1M context) --- runners/launch_mi325x-vultr.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/runners/launch_mi325x-vultr.sh b/runners/launch_mi325x-vultr.sh index 24786d708..687fd2a25 100644 --- a/runners/launch_mi325x-vultr.sh +++ b/runners/launch_mi325x-vultr.sh @@ -28,11 +28,15 @@ cleanup_stale_benchmark_logs set -x -# Exclude known-broken mi325x node: +# Exclude known-broken mi325x nodes: # chi-mi325x-pod1-121: has a history of failing enroot container image import # (root-caused via #1467/#1468/#1469 sweep failures); # excluded for the same reason as the amds fleet. -JOB_ID=$(set +o pipefail; salloc --partition=$PARTITION --exclude=chi-mi325x-pod1-121.ord.vultr.cpe.ice.amd.com --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') +# chi-mi325x-pod1-027: fails SLURM resume/boot — salloc grants an allocation then +# relinquishes it with "Something is wrong with the boot of the +# nodes" (run 27454108525), which gated the whole sweep at the +# canary; excluded until the node is repaired. +JOB_ID=$(set +o pipefail; salloc --partition=$PARTITION --exclude=chi-mi325x-pod1-121.ord.vultr.cpe.ice.amd.com,chi-mi325x-pod1-027.ord.vultr.cpe.ice.amd.com --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') if [ -z "$JOB_ID" ]; then echo "ERROR: salloc failed to allocate a job" >&2