diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 7e4918e09..845e79bd7 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2966,3 +2966,31 @@ minimaxm3-fp8-mi325x-vllm-mtp: - { tp: 8, conc-start: 1, conc-end: 128, spec-decoding: mtp } - { tp: 8, ep: 8, conc-start: 256, conc-end: 256, spec-decoding: mtp } - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp } + +# [DO NOT MERGE — experimental] MI325X (gfx942) counterpart of +# minimaxm3arf-fp8-mi355x-vllm: validates vllm-project/vllm#45639 (AITER fused +# all-reduce + Gemma-RMSNorm for MiniMax-M3) on MI325X by applying that PR's diff +# in-place to the shipped minimax-m3 image before serving (recipe +# benchmarks/single_node/fixed_seq_len/minimaxm3arf_fp8_mi325x.sh; BF16 KV on +# gfx942). Smoke test at conc 4 and 8, TP8 (the AR+RMS fusion needs TP>1). +minimaxm3arf-fp8-mi325x-vllm: + image: vllm/vllm-openai-rocm:minimax-m3 + model: MiniMaxAI/MiniMax-M3-MXFP8 + model-prefix: minimaxm3arf + runner: mi325x + precision: fp8 + framework: vllm + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-list: [ 4, 8 ] } + # 8k1k conc-16 row exists solely so the eval policy (8k1k + conc >= 16) marks + # an lm-eval entry — validates #45639 fused-kernel correctness. (Evals can't + # run at the conc 4/8 perf points; MIN_EVAL_CONC=16.) 
+ - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-list: [ 16 ] }
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3arf_fp8_mi325x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3arf_fp8_mi325x.sh
new file mode 100644
index 000000000..e00bd3798
--- /dev/null
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3arf_fp8_mi325x.sh
@@ -0,0 +1,177 @@
+#!/usr/bin/env bash
+
+# [DO NOT MERGE — experimental] MiniMax-M3 MXFP8 MI325X (gfx942) single-node vLLM
+# recipe that validates vllm-project/vllm#45639 ("[ROCm][M3] Enable AITER AR +
+# Gemma-RMS fusion for MiniMax-M3") on real MI325X hardware before an image
+# rebuild. It applies #45639 in-place to the shipped vllm/vllm-openai-rocm:minimax-m3
+# image, then serves with the AITER fused all-reduce + RMSNorm path enabled.
+#
+# Mirrors minimaxm3_fp8_mi325x.sh otherwise (--block-size 128, --language-model-only,
+# TRITON_ATTN, BF16 KV — gfx942 has no calibrated FP8 attention scales). The
+# #45639-specific knobs:
+#   VLLM_ROCM_USE_AITER=1 (AITER kernels)
+#   --compilation-config custom_ops=["-minimax_gemma_rms_norm"] (allow IR lowering)
+#   --compilation-config pass_config.fuse_allreduce_rms=true (the fusion pass)
+# The fusion needs TP>1; this recipe is swept at TP8.
+
+source "$(dirname "$0")/../../benchmark_lib.sh"
+
+# NOTE(review): PORT is used below but is not in this list — presumably it is
+# exported by the runner or by benchmark_lib.sh; confirm.
+check_env_vars \
+    MODEL \
+    TP \
+    EP_SIZE \
+    DP_ATTENTION \
+    CONC \
+    ISL \
+    OSL \
+    MAX_MODEL_LEN \
+    RANDOM_RANGE_RATIO \
+    RESULT_FILENAME
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+    echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+
+if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
+    export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
+fi
+
+# ---- Apply vllm-project/vllm#45639 in-place -------------------------------
+# The shipped minimax-m3 image predates #45639 (base m3_release). Apply the
+# vendored diff to the installed vllm. Idempotent: if it is already applied
+# (reverse-applies cleanly) we proceed; if it neither applies cleanly nor is
+# already applied, the image has drifted from the PR base — hard-fail so we never
+# silently benchmark an unpatched server.
+#
+# Each prerequisite is checked on its own first. Otherwise a missing diff, a
+# failed `patch` install or an unimportable vllm would all fall through to the
+# "drifted" error below and hide the real cause.
+PATCH_DIR="$(cd "$(dirname "$0")/patches" && pwd)" || {
+    echo "FATAL: cannot resolve the patches/ directory next to $0" >&2
+    exit 1
+}
+PATCH_FILE="$PATCH_DIR/vllm-45639-aiter-ar-gemma-rms.diff"
+if [[ ! -r "$PATCH_FILE" ]]; then
+    echo "FATAL: vendored vllm#45639 diff not found or unreadable: $PATCH_FILE" >&2
+    exit 1
+fi
+if ! command -v patch >/dev/null 2>&1; then
+    if ! { apt-get update -q -y && apt-get install -q -y patch; }; then
+        echo "FATAL: 'patch' is missing and could not be installed with apt-get" >&2
+        exit 1
+    fi
+fi
+VLLM_SP="$(python3 -c 'import os, vllm; print(os.path.dirname(os.path.dirname(vllm.__file__)))')" || {
+    echo "FATAL: cannot locate the installed vllm package (import vllm failed)" >&2
+    exit 1
+}
+if ( cd "$VLLM_SP" && patch -p1 -R --dry-run < "$PATCH_FILE" >/dev/null 2>&1 ); then
+    echo "[vllm#45639] already applied to $VLLM_SP/vllm"
+elif ( cd "$VLLM_SP" && patch -p1 --dry-run < "$PATCH_FILE" >/dev/null 2>&1 ); then
+    ( cd "$VLLM_SP" && patch -p1 < "$PATCH_FILE" )
+    echo "[vllm#45639] applied to $VLLM_SP/vllm"
+else
+    echo "FATAL: vllm#45639 patch neither applies cleanly nor is already applied" >&2
+    echo " ($VLLM_SP/vllm has drifted from the PR's m3_release base)" >&2
+    exit 1
+fi
+
+SERVER_LOG=/workspace/server.log
+export VLLM_ENGINE_READY_TIMEOUT_S=3600
+export VLLM_USE_BREAKABLE_CUDAGRAPH=0
+# #45639: AITER fused all-reduce + Gemma-RMSNorm.
+export VLLM_ROCM_USE_AITER=1
+# DEBUG so the server log carries the fusion-pass match/replace counts
+# ("RocmAiterAllReduceFusionPass Replaced N patterns", "fusion pass matches: {}")
+# in addition to the (default-level) registration bail warnings.
+export VLLM_LOGGING_LEVEL=DEBUG
+
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+fi
+
+PARALLEL_ARGS=(--tensor-parallel-size "$TP")
+if [ "${DP_ATTENTION}" = "true" ]; then
+    PARALLEL_ARGS=(
+        --tensor-parallel-size 1
+        --data-parallel-size "$TP"
+        --enable-expert-parallel
+    )
+elif [ "$EP_SIZE" -gt 1 ]; then
+    PARALLEL_ARGS+=(--enable-expert-parallel)
+fi
+
+start_gpu_monitor
+
+# When PROFILE=1 (profile.yml), arm vLLM's torch profiler via --profiler-config.
+# This minimax-m3 image's vLLM does NOT honour the VLLM_TORCH_PROFILER_DIR env
+# var, so the serve flag is what makes /start_profile emit a trace. Write to the
+# dir benchmark_lib's relay scans (VLLM_TORCH_PROFILER_DIR, default /workspace/).
+PROFILE_ARGS=()
+if [ "${PROFILE:-}" = "1" ]; then
+    PROFILE_ARGS=(--profiler-config "{\"profiler\": \"torch\", \"torch_profiler_dir\": \"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\"}")
+fi
+
+set -x
+vllm serve "$MODEL" --port "$PORT" \
+    "${PARALLEL_ARGS[@]}" \
+    "${PROFILE_ARGS[@]}" \
+    --block-size 128 \
+    --language-model-only \
+    --max-model-len "$MAX_MODEL_LEN" \
+    --attention-backend TRITON_ATTN \
+    --no-enable-prefix-caching \
+    --compilation-config '{"custom_ops": ["-minimax_gemma_rms_norm"], "pass_config": {"fuse_allreduce_rms": true}}' \
+    --tool-call-parser minimax_m3 \
+    --reasoning-parser minimax_m3 \
+    --enable-auto-tool-choice > "$SERVER_LOG" 2>&1 &
+
+SERVER_PID=$!
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+# ---- #45639 AITER AR + Gemma-RMS fusion diagnostics (definitive) ----------
+# Engine init (incl. torch.compile fusion passes) has finished by now, so the
+# fusion-pass logging is in the server log. Two questions, answered from the log:
+#   1) Did the pass REGISTER? Any of these warning_once strings => it registered
+#      ZERO patterns (match count is 0 by construction):
+#        "AllReduce fusion pass is disabled", "AITER allreduce fusion must be
+#        initialized", "AITER allreduce-rmsnorm fusion disabled: aiter<0.1.12"
+#        (the M3/6144 one), "Custom Allreduce is required".
+#   2) Did it MATCH+REPLACE? "RocmAiterAllReduceFusionPass Replaced N patterns"
+#      (N>0 => matched & replaced; N==0 => matched nothing) and the per-pass
+#      "fusion pass matches: {...}" table.
+set +x
+echo "================ #45639 fusion-pass verdict ================"
+echo "--- [1] registration bail warnings (presence => registered 0 patterns) ---"
+grep -nE "AllReduce fusion pass is disabled|AITER allreduce fusion must be initialized|AITER allreduce-rmsnorm fusion disabled|Custom Allreduce is required" "$SERVER_LOG" \
+    || echo " (none — no registration bail)"
+echo "--- [2] match / replace counts ---"
+grep -nE "RocmAiterAllReduceFusionPass Replaced [0-9]+ patterns|fusion pass matches:" "$SERVER_LOG" \
+    || echo " (no 'Replaced N patterns' / 'fusion pass matches' line found)"
+echo "==========================================================="
+set -x
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts "$((CONC * 10))" \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/ \
+    --trust-remote-code
+
+if [ "${RUN_EVAL}" = "true" ]; then
+    run_eval --framework lm-eval --port "$PORT"
+    append_lm_eval_summary
+fi
+
+stop_gpu_monitor
+set +x
diff --git a/benchmarks/single_node/fixed_seq_len/patches/vllm-45639-aiter-ar-gemma-rms.diff b/benchmarks/single_node/fixed_seq_len/patches/vllm-45639-aiter-ar-gemma-rms.diff new file mode 100644 index 000000000..a946949ed --- /dev/null +++ b/benchmarks/single_node/fixed_seq_len/patches/vllm-45639-aiter-ar-gemma-rms.diff @@ -0,0 +1,203 @@ +diff --git a/vllm/compilation/passes/fusion/allreduce_rms_fusion.py b/vllm/compilation/passes/fusion/allreduce_rms_fusion.py +index 4de5c6cf7ae5..0fd5cb830f5e 100644 +--- a/vllm/compilation/passes/fusion/allreduce_rms_fusion.py ++++ b/vllm/compilation/passes/fusion/allreduce_rms_fusion.py +@@ -1439,6 +1439,51 @@ def _replacement( + return _replacement + + ++class AiterAllreduceFusedAddRMSNormWithCopyPattern(BasePattern, VllmPatternReplacement): ++ """Non-quant AR+RMS fusion for all_reduce with 2 
users (copy_). ++ ++ In GemmaRMSNorm models, the post-attention all_reduce has a copy_ ++ node for cross-chunk residual state, giving it 2 users and preventing ++ the standard pattern from matching. This pattern returns ar_out as ++ an explicit output so the pattern matcher rewires external users ++ (the copy_) to the fused kernel's residual output. ++ """ ++ ++ def __init__(self, epsilon, dtype, device): ++ super().__init__(dtype, device) ++ self.epsilon = epsilon ++ self.FUSED_OP = rocm_aiter_ops.get_fused_allreduce_rmsnorm_op() ++ ++ def get_inputs(self): ++ return [self.empty(5, 16), self.empty(5, 16), self.empty(16)] ++ ++ @property ++ def pattern(self): ++ eps = self.epsilon ++ ++ def _pattern(residual, input_, weight): ++ ar_out = tensor_model_parallel_all_reduce(input_) ++ rms, res_out = vllm.ir.ops.fused_add_rms_norm(ar_out, residual, weight, eps) ++ return rms, res_out, ar_out ++ ++ return _pattern ++ ++ @property ++ def replacement(self): ++ eps = self.epsilon ++ ++ def _replacement(residual, input_, weight): ++ fused = self.FUSED_OP( ++ input_=input_, ++ residual=residual, ++ weight=weight.to(input_.dtype), ++ epsilon=eps, ++ ) ++ return fused[0], fused[1], fused[1] ++ ++ return _replacement ++ ++ + class RocmAiterAllReduceFusionPass(VllmFusionPatternMatcherPass): + def __init__(self, config: VllmConfig) -> None: + super().__init__(config, "rocm_aiter_allreduce_fusion_pass") +@@ -1503,9 +1548,13 @@ def __init__(self, config: VllmConfig) -> None: + return + + max_token_num = max_size // (hidden_dim * element_size) ++ # Cap at max_cudagraph_capture_size so fusion only fires ++ # for decode. Prefill uses quickreduce + triton rmsnorm. 
++ max_cg = config.compilation_config.max_cudagraph_capture_size or 512 + self.max_token_num = min( + max_token_num, + config.scheduler_config.max_num_batched_tokens, ++ max_cg, + ) + + # Only register the AR+RMS+per-group-FP8-quant patterns when the +@@ -1524,6 +1573,15 @@ def __init__(self, config: VllmConfig) -> None: + "FP8 quant fusion." + ) + ++ # Non-quant copy-aware pattern for post-attention allreduce ++ for epsilon in [1e-5, 1e-6]: ++ self.register( ++ AiterAllreduceFusedAddRMSNormWithCopyPattern( ++ epsilon, self.model_dtype, self.device ++ ) ++ ) ++ torch._inductor.pattern_matcher._seen_patterns.clear() ++ + for epsilon in [1e-5, 1e-6]: + # Quant-fused variants must register first so the pattern matcher + # tries them before the AR+RMS-only variants. Otherwise the +diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py +index 13b0ae781314..8d0336622249 100644 +--- a/vllm/model_executor/layers/layernorm.py ++++ b/vllm/model_executor/layers/layernorm.py +@@ -159,7 +159,7 @@ def forward_native( + residual: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + """PyTorch-native implementation equivalent to forward().""" +- weight = self.weight.float() + 1.0 ++ weight = self.weight.data.to(x.dtype) + 1.0 + if residual is None: + return ir.ops.rms_norm(x, weight, self.variance_epsilon) + return ir.ops.fused_add_rms_norm(x, residual, weight, self.variance_epsilon) +diff --git a/vllm/models/minimax_m3/amd/model.py b/vllm/models/minimax_m3/amd/model.py +index b80d3b8b3b8c..eecd6d335fdd 100644 +--- a/vllm/models/minimax_m3/amd/model.py ++++ b/vllm/models/minimax_m3/amd/model.py +@@ -23,7 +23,7 @@ + from torch import nn + from transformers import PretrainedConfig + +-from vllm import _custom_ops as ops ++from vllm import _custom_ops as ops, ir + from vllm.compilation.breakable_cudagraph import eager_break_during_capture + from vllm.config import ( + CacheConfig, +@@ -32,6 +32,7 @@ + ) + from 
vllm.distributed import get_tensor_model_parallel_world_size + from vllm.forward_context import get_forward_context ++from vllm.model_executor.custom_op import CustomOp + from vllm.model_executor.layers.attention import Attention + from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase + from vllm.model_executor.layers.fused_allreduce_gemma_rms_norm import ( +@@ -160,17 +161,24 @@ def _build_rotary_emb(config: PretrainedConfig, head_dim: int): + ) + + +-class MiniMAXGemmaRMSNorm(nn.Module): +- """Gemma-style RMS normalization (native ROCm implementation). +- +- Normalizes in fp32 and scales by ``(1 + weight)`` — numerically equivalent +- to the FlashInfer ``gemma_rmsnorm`` / ``gemma_fused_add_rmsnorm`` kernels +- used in the NVIDIA path, which are unavailable on ROCm. When ``residual`` is +- given, the fused add + norm returns the updated ``(normed, residual)`` pair. +- +- The fp32 normalize + scale + (optional) residual-add run in a single fused +- Triton pass (``amd.ops.gemma_rmsnorm`` / ``gemma_fused_add_rmsnorm``) instead +- of a chain of elementwise PyTorch kernels. ++@CustomOp.register("minimax_gemma_rms_norm") ++class MiniMAXGemmaRMSNorm(CustomOp): ++ """Gemma-style RMS normalization for the M3 ROCm path. ++ ++ Default (custom op enabled, ``forward_hip``): an fp32 Triton pass ++ (``amd.ops.gemma_rmsnorm`` / ``gemma_fused_add_rmsnorm``) — numerically ++ equivalent to the FlashInfer ``gemma_rmsnorm`` kernels used in the NVIDIA ++ path, which are unavailable on ROCm. This is the unchanged M3 default. ++ ++ When the custom op is disabled (``--compilation-config ++ '{"custom_ops":["-minimax_gemma_rms_norm"]}'``), ``forward_native`` instead ++ emits the plain ``ir.ops.rms_norm`` / ``ir.ops.fused_add_rms_norm`` IR ops, ++ with the Gemma ``1 + weight`` offset folded into the weight. 
That exposes the ++ post-attention ``all_reduce -> fused_add_rms_norm`` sequence to the AITER ++ AR+RMS fusion pass (``RocmAiterAllReduceFusionPass``), letting it fuse the ++ decode-time allreduce + residual-add + rmsnorm into a single AITER kernel at ++ TP>1. Opt-in because it swaps M3's accuracy-tuned fp32 norm path for the IR ++ lowering: validate gsm8k parity before enabling by default. + """ + + def __init__( +@@ -182,11 +190,24 @@ def __init__( + self.weight = nn.Parameter(torch.zeros(hidden_size)) + self.variance_epsilon = eps + +- def forward( ++ def forward_native( ++ self, ++ x: torch.Tensor, ++ residual: torch.Tensor | None = None, ++ ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: ++ # Fusable IR path, matched by the AITER AR+RMS fusion pass. Fold the ++ # Gemma ``1 + weight`` offset into the weight in x's dtype. ++ weight = self.weight.data.to(x.dtype) + 1.0 ++ if residual is None: ++ return ir.ops.rms_norm(x, weight, self.variance_epsilon) ++ return ir.ops.fused_add_rms_norm(x, residual, weight, self.variance_epsilon) ++ ++ def forward_hip( + self, + x: torch.Tensor, + residual: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: ++ # Default ROCm path: fp32 Triton gemma kernels (unchanged M3 behavior). + if residual is None: + return gemma_rmsnorm(x, self.weight, self.variance_epsilon) + return gemma_fused_add_rmsnorm(x, residual, self.weight, self.variance_epsilon) +diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py +index aaf1fdce36bb..b8d49b6efc68 100644 +--- a/vllm/platforms/rocm.py ++++ b/vllm/platforms/rocm.py +@@ -976,9 +976,13 @@ def get_default_ir_op_priority( + using_inductor = cc.backend == "inductor" and cc.mode != CompilationMode.NONE + default = ["native"] if using_inductor else ["vllm_c", "native"] + +- # Aiter rms norm perform best when CUDA Graph capture is enabled. 
+- # TODO(luka/TJ) remove env vars completely +- if ( ++ # When allreduce+rmsnorm fusion is enabled (default on ROCm TP>1), ++ # use native priority so triton can fuse rmsnorm with adjacent ops. ++ # The aiter CK rmsnorm is opaque to triton and blocks fusion. ++ fuse_ar_rms = cc.pass_config.fuse_allreduce_rms ++ if using_inductor and fuse_ar_rms is not False: ++ rms_norm = default # ["native"] ++ elif ( + cc.cudagraph_mode != CUDAGraphMode.NONE + and envs.VLLM_ROCM_USE_AITER + and envs.VLLM_ROCM_USE_AITER_RMSNORM diff --git a/perf-changelog.yaml b/perf-changelog.yaml index bee038a7a..ce26a1e9e 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3842,3 +3842,9 @@ - "Recipes sourced from NVIDIA/srt-slurm branch sa-submission-q2-2026" - "Runner script updated to support dsv4 model prefix with dynamo-trt framework on GB300" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1689 + +- config-keys: + - minimaxm3arf-fp8-mi325x-vllm + description: + - "[DO NOT MERGE — experimental] MiniMax-M3 MXFP8 MI325X (gfx942) smoke test (perf at conc 4, 8 TP8 1k1k; plus an 8k1k conc-16 row so the eval policy marks an lm-eval, and VLLM_LOGGING_LEVEL=DEBUG + a fusion-pass verdict grep) validating vllm-project/vllm#45639 (AITER fused all-reduce + Gemma-RMSNorm for M3): applies the PR diff in-place to the shipped minimax-m3 image before serving (BF16 KV on gfx942), then enables it via VLLM_ROCM_USE_AITER=1 + --compilation-config (custom_ops -minimax_gemma_rms_norm, pass_config.fuse_allreduce_rms). Hard-fails if the patch neither applies cleanly nor is already applied (image drifted)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1772