Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2780,3 +2780,39 @@ dsv4-fp4-mi355x-atom-disagg:
dp-attn: false
additional-settings:
- "DECODE_NODES=1"

# MiniMax-M3 day-zero on AMD MI325X (Vultr fleet)
# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3?hardware=mi325x). ROCm sibling of the
# B300 minimaxm3-fp8-b300-vllm entry: 427B total / 26B active MoE with MSA sparse
# attention, MXFP8 checkpoint (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) on the dedicated
# vllm/vllm-openai-rocm:minimax-m3 image. --block-size 128 is mandatory (MSA sparse/
# index cache alignment; default 16 crashes "No common block size for 16" on AMD);
# --attention-backend TRITON_ATTN is the MI325X recipe backend; --language-model-only
# skips the vision encoder for text-only throughput. New mi325x-vultr runner-type ->
# launch_mi325x-vultr.sh. tp2 is dropped vs B300: ~444 GB MXFP8 at TP2 = 222 GB/GPU
# would OOM on the 256 GB MI325X.
minimaxm3-fp8-mi325x-vllm:
image: vllm/vllm-openai-rocm:minimax-m3
model: MiniMaxAI/MiniMax-M3-MXFP8
model-prefix: minimaxm3
# Must be the Vultr runner type (the mi325x-vultr pool in runners.yaml) so the job
# is dispatched through launch_mi325x-vultr.sh; plain `mi325x` would schedule the
# other MI325X fleet and never use the Vultr-staged weights / enroot cache.
runner: mi325x-vultr

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wrong runner type in config

High Severity

The new Vultr MiniMax-M3 entry sets runner to mi325x, so CI schedules the AMDS fleet and launch_mi325x-amds.sh instead of mi325x-vultr and launch_mi325x-vultr.sh. Staged weights at /nfsdata/sa/models/ and enroot cache at /enroot/sa are never used for this config.

Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 441ba6d. Configure here.

precision: fp8
framework: vllm
multinode: false
scenarios:
fixed-seq-len:
# 1k1k sweep (isl 1024 / osl 1024).
- isl: 1024
osl: 1024
search-space:
# Pure tensor-parallel layouts (latency end of the sweep).
- { tp: 8, conc-start: 1, conc-end: 64 }
- { tp: 4, conc-start: 1, conc-end: 64 }
# Tensor + expert parallel layouts (throughput end of the sweep).
- { tp: 8, ep: 8, conc-start: 4, conc-end: 512 }
- { tp: 4, ep: 4, conc-start: 64, conc-end: 256 }
# Expert parallel with dp-attn enabled, swept at the highest concurrencies.
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
# 8k1k sweep (isl 8192 / osl 1024); note there is no tp: 4 + ep: 4 row here.
- isl: 8192
osl: 1024
search-space:
- { tp: 8, conc-start: 1, conc-end: 64 }
- { tp: 4, conc-start: 1, conc-end: 128 }
- { tp: 8, ep: 8, conc-start: 4, conc-end: 256 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 512 }
7 changes: 7 additions & 0 deletions .github/configs/runners.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,13 @@ mi325x-disagg:
- 'mi325x-amds_06'
- 'mi325x-amds_07'
- 'mi325x-amds_08'
# MI325X runners on the Vultr fleet; configs select this pool with
# `runner: mi325x-vultr`.
mi325x-vultr:
- 'mi325x-vultr_00'
- 'mi325x-vultr_01'
- 'mi325x-vultr_02'
- 'mi325x-vultr_03'
- 'mi325x-vultr_04'
- 'mi325x-vultr_05'
mi355x:
- 'mi355x-amds_00'
- 'mi355x-amds_01'
Expand Down
109 changes: 109 additions & 0 deletions benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi325x.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
#!/usr/bin/env bash

# MiniMax-M3 MXFP8 MI325X (Vultr) single-node vLLM recipe
# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3?hardware=mi325x). ROCm sibling of
# minimaxm3_fp8_b300.sh: same M3 essentials (--block-size 128 for MSA sparse/index
# cache alignment, --language-model-only for text-only throughput, conc-scaled
# cudagraph capture) with the MI325X recipe's --attention-backend TRITON_ATTN and the
# gfx942 ROCm idioms from minimaxm2.5_fp8_mi325x.sh (HIP_VISIBLE_DEVICES for vLLM 0.14+
# Ray, VLLM_ROCM_USE_AITER). The vultr launcher bind-mounts the staged HF cache over
# HF_HUB_CACHE, so `hf download` reuses staged weights (or pulls ~444 GB MXFP8 on first
# run); the server is launched with MODEL directly, no MODEL_PATH split.

# Shared benchmark helpers live two directories above this script.
source "$(dirname "$0")/../../benchmark_lib.sh"

# Environment variables this recipe reads; handed to check_env_vars up front.
check_env_vars \
  MODEL TP EP_SIZE DP_ATTENTION CONC \
  ISL OSL MAX_MODEL_LEN RANDOM_RANGE_RATIO \
  RESULT_FILENAME

# MODEL values that start with "/" skip the download; anything else goes
# through `hf download`.
case "$MODEL" in
  /*) ;;
  *) hf download "$MODEL" ;;
esac

# Mirror ROCR_VISIBLE_DEVICES into HIP_VISIBLE_DEVICES (Ray compatibility in vLLM 0.14+).
[[ -z "${ROCR_VISIBLE_DEVICES:-}" ]] || export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"

VLLM_ROCM_USE_AITER=1
export VLLM_ROCM_USE_AITER

# Under Slurm, record which job/node this run landed on.
if [[ -n "${SLURM_JOB_ID:-}" ]]; then
  printf 'JOB %s running on %s\n' "$SLURM_JOB_ID" "${SLURMD_NODENAME:-unknown}"
fi

# Best-effort GPU inventory dump; a missing or failing tool is ignored.
for smi_tool in rocm-smi amd-smi; do
  "$smi_tool" || true
done

SERVER_LOG=/workspace/server.log

# ~444 GB of MXFP8 weights come off a shared filesystem, so engine startup can
# outlast the default 600s readiness window; allow up to an hour.
VLLM_ENGINE_READY_TIMEOUT_S=3600
export VLLM_ENGINE_READY_TIMEOUT_S

#######################################
# Fill the global PARALLEL_ARGS array with the vLLM parallelism flags.
# Arguments:
#   $1 - "true" to run data-parallel attention (TP fixed at 1, DP = $3)
#   $2 - expert-parallel size; values > 1 enable expert parallelism
#   $3 - tensor-parallel size (reused as the data-parallel size when $1 is "true")
# Globals: PARALLEL_ARGS (written)
#######################################
select_parallel_args() {
  local dp_attention=$1 ep_size=$2 tp=$3
  if [[ "$dp_attention" == "true" ]]; then
    PARALLEL_ARGS=(--tensor-parallel-size=1 "--data-parallel-size=$tp" --enable-expert-parallel)
  elif (( ep_size > 1 )); then
    PARALLEL_ARGS=("--tensor-parallel-size=$tp" --enable-expert-parallel)
  else
    PARALLEL_ARGS=("--tensor-parallel-size=$tp")
  fi
}

#######################################
# Print the cudagraph capture size for a request concurrency: the next power
# of two >= $1, starting from 4 and capped at vLLM's 2048 ceiling.
# Arguments: $1 - request concurrency
# Outputs:   the capture size on stdout
#######################################
capture_size_for_conc() {
  local conc=$1 size=4
  while (( size < conc )); do
    size=$(( size * 2 ))
  done
  if (( size > 2048 )); then
    size=2048
  fi
  printf '%s\n' "$size"
}

# Kept as an array so each flag reaches vllm as exactly one argument.
PARALLEL_ARGS=()
select_parallel_args "${DP_ATTENTION}" "${EP_SIZE}" "${TP}"

# Fixed-seq-len runs don't need graphs past the request concurrency.
CAPTURE_SIZE=$(capture_size_for_conc "$CONC")

# Eval-only runs swap in the context length prepared by setup_eval_context.
if [[ "${EVAL_ONLY:-}" == "true" ]]; then
  setup_eval_context
  MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
fi
# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

set -x
# PORT is not in the check_env_vars list; it is expected to be exported by the
# runner launcher before this script starts.
vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \
  "${PARALLEL_ARGS[@]}" \
  --gpu-memory-utilization 0.90 \
  --max-model-len "$MAX_MODEL_LEN" \
  --block-size 128 \
  --attention-backend TRITON_ATTN \
  --language-model-only \
  --max-cudagraph-capture-size "$CAPTURE_SIZE" \
  --max-num-batched-tokens "$((ISL * 2))" \
  --no-enable-prefix-caching \
  --trust-remote-code > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!

# Hold here until the readiness helper returns for the server started above.
wait_for_server_ready \
  --port "$PORT" \
  --server-log "$SERVER_LOG" \
  --server-pid "$SERVER_PID"

# Throughput run: ten prompts per concurrency slot, results written under /workspace/.
bench_args=(
  --model "$MODEL"
  --port "$PORT"
  --backend vllm
  --input-len "$ISL"
  --output-len "$OSL"
  --random-range-ratio "$RANDOM_RANGE_RATIO"
  --num-prompts "$((CONC * 10))"
  --max-concurrency "$CONC"
  --result-filename "$RESULT_FILENAME"
  --result-dir /workspace/
  --trust-remote-code
)
run_benchmark_serving "${bench_args[@]}"

# The accuracy evaluation is opt-in and runs only after the throughput pass.
case "${RUN_EVAL}" in
  true)
    run_eval --framework lm-eval --port "$PORT"
    append_lm_eval_summary
    ;;
esac

# Tear down GPU monitoring and turn command tracing back off.
stop_gpu_monitor
set +x
9 changes: 9 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3649,6 +3649,15 @@
- "Serves from the launch_b300-nv.sh MODEL/MODEL_PATH split (model not in the SRE-staged /scratch/models list -> writable /data/models)"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1724

- config-keys:
- minimaxm3-fp8-mi325x-vllm
description:
- "Initial submission: MiniMax-M3 MXFP8 single-node vLLM benchmark on MI325X (Vultr fleet) (model: MiniMaxAI/MiniMax-M3-MXFP8, 427B total / 26B active MoE with MSA sparse attention)"
- "Image: vllm/vllm-openai-rocm:minimax-m3"
- "New mi325x-vultr runner-type (launch_mi325x-vultr.sh); --block-size 128 mandatory (MSA sparse/index cache alignment), --attention-backend TRITON_ATTN, --language-model-only for text-only throughput, conc-scaled --max-cudagraph-capture-size"
- "Layouts: TP8 / TP4 (latency), TP4+EP4 / TP8+EP8 (TEP throughput), TP8+EP8 dp-attn (DEP) across 1k1k and 8k1k (TP4+EP4 is 1k1k only)"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1738

- config-keys:
- minimaxm3-fp8-b200-vllm
description:
Expand Down
72 changes: 72 additions & 0 deletions runners/launch_mi325x-vultr.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#!/usr/bin/env bash
set -euo pipefail

# Host directory with the pre-staged model weights / HF hub cache for the vultr
# mi325x fleet. It is bind-mounted over the container-side HF_HUB_CACHE, so the
# bench scripts' `hf download "$MODEL"` resolves against the models--org--name
# caches already staged there (e.g. DeepSeek-R1-0528, Qwen3.5-397B-A17B-FP8,
# GLM-5-FP8) instead of re-downloading weights from HF in CI.
HF_HUB_CACHE_MOUNT="/nfsdata/sa/models/"

# enroot cache for this fleet (import layer cache plus the imported .sqsh images).
# The path is node-local ext4 present on every compute node; import and run share
# one Slurm job on a single node, so node-local storage is sufficient.
ENROOT_CACHE_PATH="/enroot/sa"

export HF_HUB_CACHE_MOUNT ENROOT_CACHE_PATH
mkdir -p "$ENROOT_CACHE_PATH"
Comment thread
Oseltamivir marked this conversation as resolved.

# Slurm partition the allocation is requested from.
PARTITION="compute"

# Squash file name = image reference with every '/', ':', '@' and '#' replaced by
# '_', stored in the enroot cache directory. Done with parameter expansion rather
# than an echo | sed pipeline, so no extra processes are spawned.
image_unsafe_chars='[/:@#]'
SQUASH_FILE="${ENROOT_CACHE_PATH}/${IMAGE//$image_unsafe_chars/_}.sqsh"

# Lock file that sits next to the squash file.
LOCK_FILE="${SQUASH_FILE}.lock"

#######################################
# Remove $GITHUB_WORKSPACE/benchmark_logs left behind by an earlier run.
# Best-effort: does nothing when GITHUB_WORKSPACE is unset or empty, hides
# error output, and always returns 0.
# Globals: GITHUB_WORKSPACE (read)
#######################################
cleanup_stale_benchmark_logs() {
  [[ -n "${GITHUB_WORKSPACE:-}" ]] || return 0

  local stale_logs="$GITHUB_WORKSPACE/benchmark_logs"
  # Non-interactive sudo first, plain rm as the fallback.
  # NOTE(review): presumably the logs can be root-owned after a container run - confirm.
  sudo -n rm -rf "$stale_logs" 2>/dev/null \
    || rm -rf "$stale_logs" 2>/dev/null \
    || true
}

# Start from a clean slate before a new allocation is requested.
cleanup_stale_benchmark_logs

set -x

# Exclude known-broken mi325x nodes:
#   chi-mi325x-pod1-121: has a history of failing enroot container image import
#                        (root-caused via #1467/#1468/#1469 sweep failures);
#                        excluded for the same reason as the amds fleet.
#   chi-mi325x-pod1-027: fails SLURM resume/boot — salloc grants an allocation then
#                        relinquishes it with "Something is wrong with the boot of the
#                        nodes" (run 27454108525), which gated the whole sweep at the
#                        canary; excluded until the node is repaired.
EXCLUDED_NODES="chi-mi325x-pod1-121.ord.vultr.cpe.ice.amd.com,chi-mi325x-pod1-027.ord.vultr.cpe.ice.amd.com"

# salloc reports "Granted job allocation <id>"; tee copies its full output to
# stderr while grep extracts the id. The trailing `|| true` matters: with errexit
# on, a no-match grep would otherwise abort the script right here and the
# explicit error message below could never be printed.
JOB_ID=$(
  set +o pipefail
  salloc --partition="$PARTITION" --exclude="$EXCLUDED_NODES" --gres="gpu:$TP" \
    --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 \
    | tee /dev/stderr \
    | grep -oP 'Granted job allocation \K[0-9]+'
) || true

if [[ -z "$JOB_ID" ]]; then
  echo "ERROR: salloc failed to allocate a job" >&2
  exit 1
fi

# Installed as soon as the allocation exists so every later exit path releases
# it and clears the benchmark logs, while preserving the original exit status.
trap 'rc=$?; scancel "$JOB_ID" 2>/dev/null || true; cleanup_stale_benchmark_logs; exit "$rc"' EXIT

# Port derived from the job id (range 40000-49999).
export PORT=$(( 40000 + (JOB_ID % 10000) ))

# Use flock to serialize concurrent imports to the same squash file. Paths and
# the image reference are passed as positional parameters instead of being
# spliced into the script text, so their contents cannot alter the inner script.
srun --jobid="$JOB_ID" --job-name="$RUNNER_NAME" bash -c '
set -euo pipefail
lock_file=$1
squash_file=$2
image=$3
exec 9>"$lock_file"
flock -w 600 9 || { echo "Failed to acquire lock for $squash_file" >&2; exit 1; }
if unsquashfs -l "$squash_file" > /dev/null 2>&1; then
  echo "Squash file already exists and is valid, skipping import"
else
  rm -f "$squash_file"
  enroot import -o "$squash_file" "docker://$image"
fi
' _ "$LOCK_FILE" "$SQUASH_FILE" "$IMAGE"

# Run the benchmark script inside the imported image. The workspace is mounted at
# /workspace/ and the staged HF cache over HF_HUB_CACHE. The script name is built
# from EXP_NAME up to its first "_", then PRECISION.
srun --jobid="$JOB_ID" \
  --container-image="$SQUASH_FILE" \
  --container-mounts="$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE" \
  --container-mount-home \
  --container-writable \
  --container-remap-root \
  --container-workdir=/workspace/ \
  --no-container-entrypoint --export=ALL \
  bash "benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_mi325x.sh"

scancel "$JOB_ID"
Loading