Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2847,6 +2847,35 @@ minimaxm3-fp8-mi355x-vllm-mtp:
- { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }

# MiniMax-M3 MXFP8 MI325X day-zero recipe. Reuse the dedicated ROCm image and
# MI355X serving shape, but keep the default BF16 KV cache (the serve script
# passes no --kv-cache-dtype). Search space is H200-aligned: TP4 and TP8
# latency, TP4/TP8 expert parallelism, and TP8 data-parallel attention.
minimaxm3-fp8-mi325x-vllm:
image: vllm/vllm-openai-rocm:minimax-m3
model: MiniMaxAI/MiniMax-M3-MXFP8
model-prefix: minimaxm3
runner: mi325x
precision: fp8
framework: vllm
multinode: false
scenarios:
fixed-seq-len:
- isl: 1024
osl: 1024
search-space:
- { tp: 4, conc-start: 4, conc-end: 64 }
- { tp: 4, ep: 4, conc-start: 128, conc-end: 256 }
- { tp: 8, conc-start: 4, conc-end: 128 }
- { tp: 8, ep: 8, conc-start: 256, conc-end: 512 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
- isl: 8192
osl: 1024
search-space:
- { tp: 4, conc-start: 4, conc-end: 32 }
- { tp: 8, conc-start: 4, conc-end: 128 }
- { tp: 8, ep: 8, conc-start: 256, conc-end: 256 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 }

# MiniMax-M3 MXFP8 MI300X day-zero recipe. Reuse the dedicated ROCm image and
# MI355X serving shape, but retain the default BF16 KV cache because this
# checkpoint lacks calibrated ROCm FP8 attention scales. Use the TP8-only H100
Expand Down
85 changes: 85 additions & 0 deletions benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi325x.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#!/usr/bin/env bash

# MiniMax-M3 MXFP8 MI325X (gfx942) single-node vLLM recipe.
# https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3?hardware=mi325x&variant=mxfp8
# MXFP8 runs from TP=4 on gfx942; block size 128 is mandatory for MSA.
#
# Flow: validate the environment, fetch the model if it is not a local path,
# derive the parallelism flags, then launch `vllm serve` in the background
# with its output captured in $SERVER_LOG.

# Shared helpers used below (check_env_vars, setup_eval_context,
# start_gpu_monitor, ...) are not defined in this script; presumably they come
# from benchmark_lib.sh, resolved relative to this script's own directory.
source "$(dirname "$0")/../../benchmark_lib.sh"

# Variables this recipe requires from the caller.
# NOTE(review): PORT is also read below (`--port "$PORT"`) but is not listed
# here; confirm the launcher always exports it.
check_env_vars \
MODEL \
TP \
EP_SIZE \
DP_ATTENTION \
CONC \
ISL \
OSL \
MAX_MODEL_LEN \
RANDOM_RANGE_RATIO \
RESULT_FILENAME

# When running under Slurm, log the job id and node name.
if [[ -n "$SLURM_JOB_ID" ]]; then
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

# A MODEL that does not start with "/" is passed to `hf download`; an absolute
# path skips the download.
if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi

# Mirror ROCR_VISIBLE_DEVICES, when set, into HIP_VISIBLE_DEVICES.
if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
fi

# Server stdout/stderr are redirected to this file by the serve command below.
SERVER_LOG=/workspace/server.log
export VLLM_ENGINE_READY_TIMEOUT_S=3600

# EVAL_ONLY is optional (not in check_env_vars); only the literal "true"
# triggers the eval-context setup.
if [ "${EVAL_ONLY}" = "true" ]; then
setup_eval_context
fi

# Parallelism flags:
#   default              -> tensor parallel across $TP
#   DP_ATTENTION = true  -> tensor-parallel size 1, data-parallel size $TP,
#                           expert parallelism enabled
#   else EP_SIZE > 1     -> tensor parallel across $TP plus expert parallelism
PARALLEL_ARGS=(--tensor-parallel-size "$TP")
if [ "${DP_ATTENTION}" = "true" ]; then
PARALLEL_ARGS=(
--tensor-parallel-size 1
--data-parallel-size "$TP"
--enable-expert-parallel
)
elif [ "$EP_SIZE" -gt 1 ]; then
PARALLEL_ARGS+=(--enable-expert-parallel)
fi

start_gpu_monitor

# Trace commands from here on so the exact serve/benchmark invocations are
# visible in the job output.
set -x
# Launch the server in the background. No --kv-cache-dtype is passed, so
# vLLM's default KV cache dtype applies; the perf-changelog entry for this
# recipe describes that as the default BF16 KV cache.
vllm serve "$MODEL" --port "$PORT" \
"${PARALLEL_ARGS[@]}" \
--block-size 128 \
--language-model-only \
--max-model-len "$MAX_MODEL_LEN" \
--attention-backend TRITON_ATTN \
--no-enable-prefix-caching \
--tool-call-parser minimax_m3 \
--reasoning-parser minimax_m3 \
--enable-auto-tool-choice > "$SERVER_LOG" 2>&1 &

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing FP8 KV cache flag

Medium Severity

The new MI325X vllm serve invocation omits --kv-cache-dtype fp8 even though the PR recipe alignment, changelog, and the existing minimaxm3_fp8_mi355x.sh baseline all specify FP8 KV cache. Without it, vLLM may use a non-FP8 KV default, skewing memory headroom and throughput versus the official MI325X MXFP8 recipe and other MiniMax M3 entries.

Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 5ec3e11. Configure here.


# $! is the PID of the most recently backgrounded command, i.e. the
# `vllm serve` process launched just before this point in the script.
SERVER_PID=$!
# Wait for the server to become ready; the helper receives the port, the
# server log path and the server PID.
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# Drive the fixed-sequence-length benchmark against the local server.
# The number of prompts is ten times the concurrency under test.
run_benchmark_serving \
--model "$MODEL" \
--port "$PORT" \
--backend vllm \
--input-len "$ISL" \
--output-len "$OSL" \
--random-range-ratio "$RANDOM_RANGE_RATIO" \
--num-prompts "$((CONC * 10))" \
--max-concurrency "$CONC" \
--result-filename "$RESULT_FILENAME" \
--result-dir /workspace/ \
--trust-remote-code

# RUN_EVAL is optional; only the literal "true" runs the lm-eval pass and
# appends its summary.
if [ "${RUN_EVAL}" = "true" ]; then
run_eval --framework lm-eval --port "$PORT"
append_lm_eval_summary
fi

# Stop the GPU monitor started before the server launch, then turn command
# tracing back off.
stop_gpu_monitor
set +x
9 changes: 9 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3745,3 +3745,12 @@
- "H100-aligned layouts and concurrency ranges: TP8 and TP8+EP8 across 1k1k and 8k1k"
- "Fix launch_mi300x-amds.sh node exclusion to use the current short Slurm node name"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1746

- config-keys:
- minimaxm3-fp8-mi325x-vllm
description:
- "Initial submission: MiniMax-M3 MXFP8 day-zero single-node vLLM benchmark on MI325X / gfx942"
- "Follows the official MI325X MXFP8 recipe with vllm/vllm-openai-rocm:minimax-m3, block size 128, TRITON_ATTN, and MiniMax-M3 tool/reasoning parsers; fixed-sequence text benchmarking adds language-model-only, the default BF16 KV cache, disabled prefix caching, and a scenario-specific max model length"
- "H200-aligned layouts and concurrency ranges: TP4 and TP8 latency, TP4/TP8 expert parallelism, and TP8 data-parallel attention across 1k1k and 8k1k"
- "Route the MI325X Hugging Face cache and runtime compiler caches to node-local storage, and mount ROCm GPU devices explicitly"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1748
6 changes: 4 additions & 2 deletions runners/launch_mi325x-amds.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env bash
set -euo pipefail

export HF_HUB_CACHE_MOUNT="/nfsdata/sa/gharunner/gharunners/hf-hub-cache/"
export HF_HUB_CACHE_MOUNT="/local-nvme/hf-hub-cache/"

PARTITION="compute"
SQUASH_FILE="/nfsdata/sa/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
Expand All @@ -17,6 +17,8 @@ if [ -z "$JOB_ID" ]; then
fi

export PORT=$(( 40000 + (JOB_ID % 10000) ))
export XDG_CACHE_HOME="/tmp/xdg-cache-$JOB_ID"
export TRITON_CACHE_DIR="/tmp/triton-cache-$JOB_ID"

trap 'rc=$?; scancel "$JOB_ID" 2>/dev/null || true; exit "$rc"' EXIT

Expand All @@ -34,7 +36,7 @@ srun --jobid="$JOB_ID" --job-name="$RUNNER_NAME" bash -c "
"
srun --jobid="$JOB_ID" \
--container-image="$SQUASH_FILE" \
--container-mounts="$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE" \
--container-mounts="$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE,/dev/kfd:/dev/kfd,/dev/dri:/dev/dri" \
--container-mount-home \
--container-writable \
--container-remap-root \
Expand Down
Loading