Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2871,3 +2871,33 @@ minimaxm3-fp8-mi300x-vllm:
search-space:
- { tp: 8, conc-start: 4, conc-end: 64 }
- { tp: 8, ep: 8, conc-start: 128, conc-end: 256 }

# EAGLE3 speculative-decoding (spec-decoding: mtp) variant of
# minimaxm3-fp8-mi300x-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the
# Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens). Same TP8-only
# search space as the non-MTP MI300X entry (gfx942 192 GB is memory-tight, like
# H100), with the TP8 latency rows started at conc 1 to capture single-request
# latency — matching the H100/MI355X MTP recipes. The shipped ROCm image lacks
# SupportsEagle3 on the AMD MiniMax-M3 model, so the recipe applies that fix
# in-place at runtime (functionstackx/vllm#1, upstream vllm-project/vllm#45546;
# validated green on MI355X) before serving.
minimaxm3-fp8-mi300x-vllm-mtp:
image: vllm/vllm-openai-rocm:minimax-m3
model: MiniMaxAI/MiniMax-M3-MXFP8
model-prefix: minimaxm3
runner: mi300x
precision: fp8
framework: vllm
multinode: false
scenarios:
fixed-seq-len:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, conc-start: 1, conc-end: 128, spec-decoding: mtp }
- { tp: 8, ep: 8, conc-start: 256, conc-end: 256, spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 8, ep: 8, conc-start: 128, conc-end: 256, spec-decoding: mtp }
212 changes: 212 additions & 0 deletions benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x_mtp.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
#!/usr/bin/env bash

# MiniMax-M3 MXFP8 MI300X (gfx942) single-node vLLM recipe with EAGLE3
# speculative decoding — the spec-decoding=mtp variant of
# minimaxm3_fp8_mi300x.sh. Adds the Inferact/MiniMax-M3-EAGLE3 draft head via
# --speculative-config with 3 speculative tokens. Everything else mirrors the
# non-MTP MI300X recipe: mandatory --block-size 128, --language-model-only for
# the text-only benchmark, --attention-backend TRITON_ATTN, --enforce-eager,
# and --no-enable-prefix-caching. The default BF16 KV cache is retained (unlike
# the MI355X recipe's FP8 KV cache): gfx942 has no calibrated q/prob scales for
# ROCm FP8 attention and vLLM's fallback scale of 1.0 corrupts accuracy.
#
# Unlike the CUDA recipes, the drafter needs no attention_backend override:
# the FlashInfer "page size 128 requires GQA/MQA" limitation that forced
# FLASH_ATTN for the EAGLE3 MHA head on Blackwell is FlashInfer/CUDA-specific.
# Here the whole server runs on TRITON_ATTN (set globally below), which serves
# the MHA draft fine.
#
# [AI generated draft test] The shipped vllm/vllm-openai-rocm:minimax-m3 image
# does NOT implement SupportsEagle3 on the AMD MiniMax-M3 model, so EAGLE3
# engine init fails with "Model does not support EAGLE3 interface but
# aux_hidden_state_outputs was requested". This recipe applies that fix
# (functionstackx/vllm#1 — ported from nvidia/model.py, upstreamed as
# vllm-project/vllm#45546) in-place to the installed vllm before serving, so we
# can validate EAGLE3 on real MI300X hardware ahead of an image rebuild. The
# same patch is validated green on MI355X. It is idempotent and fails the job
# loudly if the installed amd/model.py has drifted from the expected base.

# Shared benchmark helper library. The helper functions called throughout this
# recipe (check_env_vars, setup_eval_context, ...) are presumably defined
# there; the library is not visible from this script.
source "$(dirname "$0")/../../benchmark_lib.sh"

# Inputs this recipe expects from the launcher.
check_env_vars \
  MODEL TP EP_SIZE DP_ATTENTION \
  CONC ISL OSL \
  MAX_MODEL_LEN RANDOM_RANGE_RATIO RESULT_FILENAME

# EAGLE3 draft head paired with the target model.
DRAFT_MODEL="Inferact/MiniMax-M3-EAGLE3"

# When running under Slurm, log the job id and node name.
if [[ -n "${SLURM_JOB_ID}" ]]; then
  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

# A MODEL that does not start with "/" is treated as a bare HF id: fetch it
# (a fast cache hit when pre-staged) and fetch the EAGLE3 draft, which is not
# staged, into the same cache. An absolute path skips both downloads.
case "$MODEL" in
  /*) ;;
  *)
    hf download "$MODEL"
    hf download "$DRAFT_MODEL"
    ;;
esac

# Mirror ROCR_VISIBLE_DEVICES into HIP_VISIBLE_DEVICES when the former is set.
if [[ -n "$ROCR_VISIBLE_DEVICES" ]]; then
  export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
fi

SERVER_LOG=/workspace/server.log
export VLLM_ENGINE_READY_TIMEOUT_S=3600

if [[ "${EVAL_ONLY}" == "true" ]]; then
  setup_eval_context
fi

# Parallelism flags for `vllm serve`:
#   DP_ATTENTION=true -> TP 1, data-parallel size $TP, expert parallel on
#   otherwise         -> TP $TP, expert parallel only when EP_SIZE > 1
if [[ "${DP_ATTENTION}" == "true" ]]; then
  PARALLEL_ARGS=(
    --tensor-parallel-size 1
    --data-parallel-size "$TP"
    --enable-expert-parallel
  )
else
  PARALLEL_ARGS=(--tensor-parallel-size "$TP")
  if [ "$EP_SIZE" -gt 1 ]; then
    PARALLEL_ARGS+=(--enable-expert-parallel)
  fi
fi

# Every config currently uses 3 speculative tokens.
NUM_SPEC_TOKENS=3

# [AI generated draft test] Patch the installed AMD MiniMax-M3 model to add the
# SupportsEagle3 interface (functionstackx/vllm#1, upstream vllm-project/vllm#45546).
# Mirrors nvidia/model.py: adds EagleModelMixin to the inner model +
# aux-hidden-state emission, and SupportsEagle3 to the two outer classes.
#
# What the embedded Python program does, in order:
#   1. Locates the installed vllm package with importlib.util.find_spec and
#      reads models/minimax_m3/amd/model.py beneath the package root.
#   2. Exits 0 without touching the file when it already contains both the
#      EagleModelMixin name and the patched MiniMaxM3Model class header; this
#      check is what makes a re-run a no-op.
#   3. Otherwise walks a list of six (old, new) exact-substring replacements:
#      the interfaces import, the MiniMaxM3Model base classes, the return-type
#      annotation that follows the inputs_embeds parameter, the layer loop
#      (which gains _maybe_add_hidden_state calls and a tuple return), and the
#      base-class lists of MiniMaxM3SparseForCausalLM and
#      MiniMaxM3SparseForConditionalGeneration.
#   4. Aborts via sys.exit with a message, before anything is written, as soon
#      as an anchor does not occur exactly once in the current text (so we
#      never silently run unpatched and mislabel the result).
#   5. Runs ast.parse on the patched text and only then writes it back to the
#      same path; a syntax error raises before write_text is reached.
#
# Any non-zero exit from Python makes the shell print
# "EAGLE3 in-place patch failed" to stderr and exit 1.
#
# NOTE(review): anchors are matched byte-for-byte, so the whitespace inside the
# quoted strings below must equal the installed file's whitespace exactly —
# confirm against the installed amd/model.py before editing any anchor.
python3 - <<'PYEOF' || { echo "EAGLE3 in-place patch failed" >&2; exit 1; }
import ast, importlib.util, pathlib, sys

spec = importlib.util.find_spec("vllm")
root = pathlib.Path(spec.submodule_search_locations[0])
target = root / "models" / "minimax_m3" / "amd" / "model.py"
src = target.read_text()

if "EagleModelMixin" in src and "class MiniMaxM3Model(nn.Module, EagleModelMixin):" in src:
print(f"[eagle3-patch] already applied: {target}")
sys.exit(0)

edits = [
(
"from vllm.model_executor.models.interfaces import (\n"
" MultiModalEmbeddings,\n"
" SupportsMultiModal,\n"
")",
"from vllm.model_executor.models.interfaces import (\n"
" EagleModelMixin,\n"
" MultiModalEmbeddings,\n"
" SupportsEagle3,\n"
" SupportsMultiModal,\n"
")",
),
(
"class MiniMaxM3Model(nn.Module):",
"class MiniMaxM3Model(nn.Module, EagleModelMixin):",
),
(
" inputs_embeds: torch.Tensor | None = None,\n"
" ) -> torch.Tensor:\n"
" if inputs_embeds is not None:",
" inputs_embeds: torch.Tensor | None = None,\n"
" ) -> torch.Tensor | tuple[torch.Tensor, list[torch.Tensor]]:\n"
" if inputs_embeds is not None:",
),
(
" residual = None\n\n"
" for layer in self.layers[self.start_layer : self.end_layer]:\n"
" hidden_states, residual = layer(positions, hidden_states, residual)\n\n"
" hidden_states, _ = self.norm(hidden_states, residual)\n"
" return hidden_states",
" residual = None\n\n"
" # EAGLE3 is not yet compatible with pipeline parallel\n"
" aux_hidden_states = self._maybe_add_hidden_state([], 0, hidden_states, residual)\n"
" for idx, layer in enumerate(self.layers[self.start_layer : self.end_layer]):\n"
" hidden_states, residual = layer(positions, hidden_states, residual)\n"
" self._maybe_add_hidden_state(\n"
" aux_hidden_states, idx + 1, hidden_states, residual\n"
" )\n\n"
" hidden_states, _ = self.norm(hidden_states, residual)\n\n"
" if len(aux_hidden_states) > 0:\n"
" return hidden_states, aux_hidden_states\n"
" return hidden_states",
),
(
"class MiniMaxM3SparseForCausalLM(nn.Module):",
"class MiniMaxM3SparseForCausalLM(nn.Module, SupportsEagle3):",
),
(
"class MiniMaxM3SparseForConditionalGeneration(nn.Module, SupportsMultiModal):",
"class MiniMaxM3SparseForConditionalGeneration(\n"
" nn.Module, SupportsMultiModal, SupportsEagle3\n"
"):",
),
]

for old, new in edits:
count = src.count(old)
if count != 1:
sys.exit(
f"[eagle3-patch] anchor matched {count} times (expected 1); "
f"installed {target} has drifted from the expected base — aborting"
)
src = src.replace(old, new)

ast.parse(src)
target.write_text(src)
print(f"[eagle3-patch] applied EAGLE3 support to {target}")
PYEOF

# Launch the vLLM server with EAGLE3 speculative decoding, wait until it is
# ready, run the serving benchmark, and optionally run the lm-eval pass.
#
# Reads:  MODEL, PORT, PARALLEL_ARGS, MAX_MODEL_LEN, DRAFT_MODEL,
#         NUM_SPEC_TOKENS, SERVER_LOG, ISL, OSL, RANDOM_RANGE_RATIO, CONC,
#         RESULT_FILENAME, RUN_EVAL
# Writes: SERVER_PID; server output goes to $SERVER_LOG and benchmark results
#         are written under /workspace/.

# PORT is handed to the server, the readiness probe, the benchmark client and
# the eval, yet nothing earlier in this script validates it. Stop here with a
# clear message instead of starting the GPU monitor and launching
# `vllm serve --port ""`, which only fails later and less legibly.
: "${PORT:?PORT must be set before launching the vLLM server}"

start_gpu_monitor

set -x
vllm serve "$MODEL" --port "$PORT" \
  "${PARALLEL_ARGS[@]}" \
  --block-size 128 \
  --no-enable-prefix-caching \
  --language-model-only \
  --max-model-len "$MAX_MODEL_LEN" \
  --attention-backend TRITON_ATTN \
  --enforce-eager \
  --speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \
  --tool-call-parser minimax_m3 \
  --reasoning-parser minimax_m3 \
  --enable-auto-tool-choice > "$SERVER_LOG" 2>&1 &

# The server runs in the background; keep its PID so the readiness helper can
# be told which process to watch.
SERVER_PID=$!
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

pip install -q datasets pandas

# Spec-decode acceptance rate degrades on raw random tokens; route prompts
# through the chat template as the other MTP recipes do.
# The prompt count is ten times the concurrency level.
run_benchmark_serving \
  --model "$MODEL" \
  --port "$PORT" \
  --backend vllm \
  --input-len "$ISL" \
  --output-len "$OSL" \
  --random-range-ratio "$RANDOM_RANGE_RATIO" \
  --num-prompts "$((CONC * 10))" \
  --max-concurrency "$CONC" \
  --result-filename "$RESULT_FILENAME" \
  --result-dir /workspace/ \
  --trust-remote-code \
  --use-chat-template

# Optional accuracy pass against the same running server.
if [ "${RUN_EVAL}" = "true" ]; then
  run_eval --framework lm-eval --port "$PORT"
  append_lm_eval_summary
fi

stop_gpu_monitor
set +x
10 changes: 10 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3745,3 +3745,13 @@
- "H100-aligned layouts and concurrency ranges: TP8 and TP8+EP8 across 1k1k and 8k1k"
- "Fix launch_mi300x-amds.sh node exclusion to use the current short Slurm node name"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1746

- config-keys:
- minimaxm3-fp8-mi300x-vllm-mtp
description:
- "Initial submission: MiniMax-M3 MXFP8 MI300X (gfx942) vLLM benchmark with EAGLE3 speculative decoding (target: MiniMaxAI/MiniMax-M3-MXFP8, draft: Inferact/MiniMax-M3-EAGLE3, 3 speculative tokens) — spec-decoding=mtp variant of the MI300X day-zero recipe"
- "Image: vllm/vllm-openai-rocm:minimax-m3 (same day-zero ROCm build as the non-MTP entry)"
- "Serve shape follows minimaxm3-fp8-mi300x-vllm: --block-size 128, --no-enable-prefix-caching, --language-model-only, --attention-backend TRITON_ATTN, --enforce-eager, minimax_m3 parsers, and the default BF16 KV cache (gfx942 lacks calibrated ROCm FP8 attention scales); prompts routed through the chat template for realistic acceptance"
- "TP8-only search space (gfx942 192 GB is memory-tight, like H100): TP8 latency rows started at conc 1, TP8+EP8 (TEP) at high concurrency, across 1k1k and 8k1k"
- "[AI generated draft test] The shipped ROCm image's AMD MiniMax-M3 model lacks SupportsEagle3, so the recipe patches it in-place at runtime (functionstackx/vllm#1, upstream vllm-project/vllm#45546; validated green on MI355X) before serving — validates EAGLE3 on MI300X ahead of an image rebuild"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1749
6 changes: 5 additions & 1 deletion runners/launch_mi300x-amds.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ PARTITION="compute"
SQUASH_FILE="/home/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
LOCK_FILE="${SQUASH_FILE}.lock"

# Route spec-decoding=mtp configs to the _mtp benchmark script (parity with
# the h200 launchers, which have carried SPEC_SUFFIX since #392).
SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')

set -x

# Exclude known-bad nodes; let Slurm pick from anything else:
Expand Down Expand Up @@ -37,6 +41,6 @@ srun --jobid=$JOB_ID \
--container-remap-root \
--container-workdir=/workspace/ \
--no-container-entrypoint --export=ALL \
bash benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_mi300x.sh
bash benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_mi300x${SPEC_SUFFIX}.sh

scancel $JOB_ID
Loading