Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2846,3 +2846,28 @@ minimaxm3-fp8-mi355x-vllm-mtp:
- { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
- { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }

# MiniMax-M3 MXFP8 MI300X day-zero recipe. Reuse the dedicated ROCm image and
# MI355X serving shape, but retain the default BF16 KV cache because this
# checkpoint lacks calibrated ROCm FP8 attention scales. Use the TP8-only H100
# search space: TP8 for latency and TP8+EP8 (TEP) at high concurrency.
minimaxm3-fp8-mi300x-vllm:
image: vllm/vllm-openai-rocm:minimax-m3
model: MiniMaxAI/MiniMax-M3-MXFP8
model-prefix: minimaxm3
runner: mi300x
precision: fp8
framework: vllm
multinode: false
scenarios:
fixed-seq-len:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, conc-start: 4, conc-end: 128 }
- { tp: 8, ep: 8, conc-start: 256, conc-end: 256 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, conc-start: 4, conc-end: 64 }
- { tp: 8, ep: 8, conc-start: 128, conc-end: 256 }
88 changes: 88 additions & 0 deletions benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#!/usr/bin/env bash

# MiniMax-M3 MXFP8 MI300X (gfx942) single-node vLLM recipe.
# Reuses the dedicated ROCm image and the MI355X serving shape. Block size 128
# is mandatory for MSA sparse attention. Keep the default BF16 KV cache on
# gfx942: the checkpoint has no calibrated q/prob scales for ROCm FP8
# attention, and vLLM's fallback scale of 1.0 corrupts model accuracy.

source "$(dirname "$0")/../../benchmark_lib.sh"

check_env_vars \
MODEL \
TP \
EP_SIZE \
DP_ATTENTION \
CONC \
ISL \
OSL \
MAX_MODEL_LEN \
RANDOM_RANGE_RATIO \
RESULT_FILENAME

if [[ -n "$SLURM_JOB_ID" ]]; then
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi

if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
fi

SERVER_LOG=/workspace/server.log
export VLLM_ENGINE_READY_TIMEOUT_S=3600

if [ "${EVAL_ONLY}" = "true" ]; then
setup_eval_context
fi

PARALLEL_ARGS=(--tensor-parallel-size "$TP")
if [ "${DP_ATTENTION}" = "true" ]; then
PARALLEL_ARGS=(
--tensor-parallel-size 1
--data-parallel-size "$TP"
--enable-expert-parallel
)
elif [ "$EP_SIZE" -gt 1 ]; then
PARALLEL_ARGS+=(--enable-expert-parallel)
fi

start_gpu_monitor

set -x
vllm serve "$MODEL" --port "$PORT" \
"${PARALLEL_ARGS[@]}" \
--block-size 128 \
--no-enable-prefix-caching \
--language-model-only \
--max-model-len "$MAX_MODEL_LEN" \
--attention-backend TRITON_ATTN \
--enforce-eager \
--tool-call-parser minimax_m3 \
--reasoning-parser minimax_m3 \
--enable-auto-tool-choice > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

run_benchmark_serving \
--model "$MODEL" \
--port "$PORT" \
--backend vllm \
--input-len "$ISL" \
--output-len "$OSL" \
--random-range-ratio "$RANDOM_RANGE_RATIO" \
--num-prompts "$((CONC * 10))" \
--max-concurrency "$CONC" \
--result-filename "$RESULT_FILENAME" \
--result-dir /workspace/ \
--trust-remote-code

if [ "${RUN_EVAL}" = "true" ]; then
run_eval --framework lm-eval --port "$PORT"
append_lm_eval_summary
fi

stop_gpu_monitor
set +x
10 changes: 10 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3735,3 +3735,13 @@
- "Layouts: TP8 / TP4 (latency), TP8+EP8 / TP4+EP4 (TEP), TP8+EP8 dp-attn (DEP) across 1k1k and 8k1k — non-MTP search space trimmed at the extreme-concurrency end, tp2-ep2 dropped, mirroring the minimaxm3-fp8-b300-vllm-mtp search space"
- "[AI generated draft test] The shipped ROCm image's AMD MiniMax-M3 model lacks SupportsEagle3, so the recipe patches it in-place at runtime (functionstackx/vllm#1, ported from nvidia/model.py) before serving — validates EAGLE3 on MI355X ahead of an image rebuild"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1745

- config-keys:
- minimaxm3-fp8-mi300x-vllm
description:
- "Initial submission: MiniMax-M3 MXFP8 day-zero single-node vLLM benchmark on MI300X / gfx942"
- "Image and serving shape reuse the MI355X recipe: vllm/vllm-openai-rocm:minimax-m3, block size 128, prefix caching disabled, TRITON_ATTN, language-model-only, and eager execution"
- "Use the default BF16 KV cache on MI300X because MiniMax-M3-MXFP8 lacks calibrated q/prob scales for ROCm FP8 attention; vLLM warns that its fallback scale of 1.0 may cause accuracy issues"
- "H100-aligned layouts and concurrency ranges: TP8 and TP8+EP8 across 1k1k and 8k1k"
- "Fix launch_mi300x-amds.sh node exclusion to use the current short Slurm node name"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1746
8 changes: 4 additions & 4 deletions runners/launch_mi300x-amds.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ LOCK_FILE="${SQUASH_FILE}.lock"
set -x

# Exclude known-bad nodes; let Slurm pick from anything else:
# chi-mi300x-049: drained (persistent /nvme_home disk-full)
JOB_ID=$(salloc --partition=$PARTITION --exclude=chi-mi300x-049.ord.vultr.cpe.ice.amd.com --gres=gpu:$TP --cpus-per-task=256 --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')
# chi-mi300x-049: persistent /nvme_home disk-full
JOB_ID=$(salloc --partition=$PARTITION --exclude=chi-mi300x-049 --gres=gpu:$TP --cpus-per-task=256 --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')

if [ -z "$JOB_ID" ]; then
echo "ERROR: salloc failed to allocate a job"
Expand All @@ -31,12 +31,12 @@ srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c "
"
srun --jobid=$JOB_ID \
--container-image=$SQUASH_FILE \
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE,/dev/kfd:/dev/kfd,/dev/dri:/dev/dri \
--container-mount-home \
--container-writable \
--container-remap-root \
--container-workdir=/workspace/ \
--no-container-entrypoint --export=ALL \
bash benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_mi300x.sh

scancel $JOB_ID
scancel $JOB_ID