Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2966,3 +2966,24 @@ minimaxm3-fp8-mi325x-vllm-mtp:
- { tp: 8, conc-start: 1, conc-end: 128, spec-decoding: mtp }
- { tp: 8, ep: 8, conc-start: 256, conc-end: 256, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp }

# [DO NOT MERGE — experimental] MI325X (gfx942) counterpart of
# minimaxm3arf-fp8-mi355x-vllm: validates vllm-project/vllm#45639 (AITER fused
# all-reduce + Gemma-RMSNorm for MiniMax-M3) on MI325X by applying that PR's diff
# in-place to the shipped minimax-m3 image before serving (recipe
# benchmarks/single_node/fixed_seq_len/minimaxm3arf_fp8_mi325x.sh; BF16 KV on
# gfx942). Smoke test at conc 4 and 8, TP8 (the AR+RMS fusion needs TP>1).
minimaxm3arf-fp8-mi325x-vllm:
  image: vllm/vllm-openai-rocm:minimax-m3
  model: MiniMaxAI/MiniMax-M3-MXFP8
  model-prefix: minimaxm3arf
  runner: mi325x
  precision: fp8
  framework: vllm
  multinode: false
  # NOTE(review): nesting below was reconstructed from flattened text —
  # confirm it matches the sibling entries in this file.
  scenarios:
    fixed-seq-len:
      - isl: 1024
        osl: 1024
        # Smoke test only: a single TP8 row at concurrency 4 and 8.
        search-space:
          - { tp: 8, conc-list: [ 4, 8 ] }
11 changes: 11 additions & 0 deletions .github/profile-target.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Target for the label-triggered Profile workflow (.github/workflows/profile.yml).
# When a PR carries the 'profile-enabled' label, that workflow profiles this
# config-key and emits a Perfetto trace per (concurrency, tp) (artifact + relay
# link in the run summary). CONC is space-separated; the workflow runs one job
# per (conc, tp) pair that the config-key's search space yields (TP8 only here).
#
# Experiment: MiniMax-M3 MXFP8 on MI325X (gfx942) with vllm-project/vllm#45639
# (AITER AR + Gemma-RMS fusion) applied in-place; single-node vLLM, TP8, conc 4 and 8.
CONFIG_KEY=minimaxm3arf-fp8-mi325x-vllm
CONFIG_FILE=.github/configs/amd-master.yaml
CONC=4 8
87 changes: 80 additions & 7 deletions .github/workflows/profile.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,12 @@ on:
description: "Ref (branch/sha) to checkout"
required: false
type: string
# Label-triggered profiling: add the 'profile-enabled' label to a PR and this
# workflow profiles the target declared in .github/profile-target.env
# (CONFIG_KEY / CONFIG_FILE / CONC), emitting a Perfetto trace as an artifact
# plus a relay link. Gated in the get-jobs `if` below so only labelled PRs run.
pull_request:
types: [labeled, synchronize, reopened]

permissions:
contents: read
Expand All @@ -40,26 +46,68 @@ env:

jobs:
get-jobs:
# Run for manual dispatch, or for a PR carrying the 'profile-enabled' label.
if: >-
${{ github.event_name == 'workflow_dispatch' ||
(github.event_name == 'pull_request' &&
contains(github.event.pull_request.labels.*.name, 'profile-enabled')) }}
runs-on: ubuntu-latest
outputs:
filtered-matrix: ${{ steps.filter.outputs.filtered }}
count: ${{ steps.filter.outputs.count }}
ref: ${{ steps.preref.outputs.ref }}
moe-debug: ${{ steps.target.outputs.moe_debug }}
steps:
- name: Resolve checkout ref
id: preref
run: |
if [ "${{ github.event_name }}" = "pull_request" ]; then
echo "ref=${{ github.event.pull_request.head.sha }}" >> "$GITHUB_OUTPUT"
else
echo "ref=${{ inputs.ref || github.sha }}" >> "$GITHUB_OUTPUT"
fi

- name: Checkout code
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
ref: ${{ inputs.ref || github.sha }}
ref: ${{ steps.preref.outputs.ref }}

- name: Resolve profile target (dispatch inputs or PR target file)
  id: target
  # Resolves config_key / config_file / conc / moe_debug step outputs, either
  # from .github/profile-target.env (pull_request) or from the dispatch inputs.
  # The event name and inputs are passed through env rather than interpolated
  # into the script text, so their content is never parsed as shell.
  env:
    EVENT_NAME: ${{ github.event_name }}
    INPUT_CONFIG_KEY: ${{ inputs.config-key }}
    INPUT_CONFIG_FILE: ${{ inputs.config-file }}
    INPUT_CONC: ${{ inputs.conc }}
    INPUT_MOE_DEBUG: ${{ inputs.moe-debug }}
  run: |
    set -euo pipefail

    # Print the value of the first "NAME=value" line of file $2 (empty if none).
    # The trailing `|| true` is required: with errexit + pipefail a grep that
    # matches nothing would otherwise abort the step before the explicit
    # CONFIG_KEY check and the defaults below ever run.
    read_target_var() {
      grep -E "^$1=" "$2" | head -1 | cut -d= -f2- || true
    }

    if [ "$EVENT_NAME" = "pull_request" ]; then
      f=.github/profile-target.env
      if [ ! -f "$f" ]; then
        echo "::error::$f is required for label-triggered profiling" >&2
        exit 1
      fi
      ck=$(read_target_var CONFIG_KEY "$f")
      cf=$(read_target_var CONFIG_FILE "$f")
      cc=$(read_target_var CONC "$f")
      md=$(read_target_var MOE_DEBUG "$f")
      if [ -z "$ck" ]; then echo "::error::CONFIG_KEY missing in $f" >&2; exit 1; fi
      {
        echo "config_key=${ck}"
        echo "config_file=${cf:-.github/configs/nvidia-master.yaml}"
        echo "conc=${cc:-64}"
        echo "moe_debug=${md:-false}"
      } >> "$GITHUB_OUTPUT"
    else
      {
        echo "config_key=${INPUT_CONFIG_KEY}"
        echo "config_file=${INPUT_CONFIG_FILE}"
        echo "conc=${INPUT_CONC}"
        echo "moe_debug=${INPUT_MOE_DEBUG}"
      } >> "$GITHUB_OUTPUT"
    fi

- id: gen
  name: Generate matrix via script
  # On pull_request runs these values are read from a file in the PR, so they
  # are handed to the script via env instead of being interpolated into the
  # shell source (which would let file content run as shell).
  env:
    TARGET_CONFIG_FILE: ${{ steps.target.outputs.config_file }}
    TARGET_CONFIG_KEY: ${{ steps.target.outputs.config_key }}
    TARGET_CONC: ${{ steps.target.outputs.conc }}
  run: |
    pip install pydantic
    # CLI_ARGS is expanded unquoted on purpose: conc is a space-separated list
    # and has to split into one argument per concurrency.
    CLI_ARGS="test-config --config-files ${TARGET_CONFIG_FILE} --config-keys ${TARGET_CONFIG_KEY} --conc ${TARGET_CONC}"
    CONFIG_JSON=$(python3 "${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py" $CLI_ARGS)
    echo "raw=$CONFIG_JSON" >> "$GITHUB_OUTPUT"

- id: filter
name: Take first generated job
name: Select one job per concurrency
shell: python
run: |
import json, os, sys
Expand All @@ -78,7 +126,16 @@ jobs:
f.write("filtered=[]\ncount=0\n")
raise SystemExit(1)

filt = data[:1]
# One job per (concurrency, tp): the first config generated for each
# (conc, tp) pair — i.e. the leading 1k1k search-space row of each TP —
# ordered by (conc, tp). This profiles both TP4 and TP8 at every conc.
# A single conc/tp still yields one job (backward compatible).
by_key = {}
for job in data:
k = (job.get("conc"), job.get("tp"))
if k not in by_key:
by_key[k] = job
filt = [by_key[k] for k in sorted(by_key)]

out = json.dumps(filt)
print(out)
Expand Down Expand Up @@ -115,8 +172,14 @@ jobs:
CONC: ${{ matrix.config.conc }}
SPEC_DECODING: ${{ matrix.config.spec-decoding }}
DISAGG: ${{ matrix.config.disagg }}
# The single-node launchers resolve the recipe path as
# benchmarks/single_node/${SCENARIO_SUBDIR}<exp>_<prec>_<hw>.sh and run
# under `set -u`; the sweep's benchmark-tmpl.yml sets this, so profile.yml
# must too. Profiling is fixed-seq-len (mi325x/mi300x launchers don't
# default it the way mi355x does).
SCENARIO_SUBDIR: fixed_seq_len/
MOE_DEBUG: '0'
MOE_DEBUG_LOG: ${{ (inputs.moe-debug) && '/workspace/moe_debug.tp0.log' || '' }}
MOE_DEBUG_LOG: ${{ needs.get-jobs.outputs.moe-debug == 'true' && '/workspace/moe_debug.tp0.log' || '' }}
steps:
- name: Resource cleanup
run: |
Expand Down Expand Up @@ -145,7 +208,7 @@ jobs:
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
fetch-depth: 0
ref: ${{ inputs.ref || github.sha }}
ref: ${{ needs.get-jobs.outputs.ref }}
clean: false

- name: Launch + Profile (single-node sglang/vllm)
Expand Down Expand Up @@ -261,7 +324,17 @@ jobs:
git config user.email "github-actions@github.com"
git add -A
git commit -m "Add profile: ${GITHUB_SHA} ${{ matrix.config['exp-name'] }} tp${{ matrix.config.tp }} ep${{ matrix.config.ep || 1 }} conc${{ matrix.config.conc }}" || echo "Nothing to commit"
git push
# Parallel matrix jobs (one per conc) all push to this same repo, so a
# plain push races and is rejected non-fast-forward. Rebase onto the
# latest remote and retry with jitter until it lands.
for attempt in 1 2 3 4 5 6 7 8; do
if git push; then break; fi
echo "push rejected (attempt ${attempt}); rebasing on origin/master"
git fetch origin master --quiet
git rebase origin/master
if [ "$attempt" = 8 ]; then echo "push failed after ${attempt} attempts" >&2; exit 1; fi
sleep $(( (RANDOM % 6) + 2 ))
done

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PR traces use merge SHA

Medium Severity

Label-triggered profiling checks out the PR head commit via needs.get-jobs.outputs.ref, but trace storage paths, commit messages, and Perfetto raw URLs still use GITHUB_SHA. On pull_request runs that SHA is GitHub’s synthetic merge commit, not the profiled head, so artifacts and relay links won’t align with the PR revision that was actually run.

Additional Locations (1)
Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit edfa135. Configure here.

STORAGE_SHA="$(git rev-parse HEAD)"
popd >/dev/null

Expand Down
129 changes: 129 additions & 0 deletions benchmarks/single_node/fixed_seq_len/minimaxm3arf_fp8_mi325x.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
#!/usr/bin/env bash

# [DO NOT MERGE — experimental] MiniMax-M3 MXFP8 MI325X (gfx942) single-node vLLM
# recipe that validates vllm-project/vllm#45639 ("[ROCm][M3] Enable AITER AR +
# Gemma-RMS fusion for MiniMax-M3") on real MI325X hardware before an image
# rebuild. It applies #45639 in-place to the shipped vllm/vllm-openai-rocm:minimax-m3
# image, then serves with the AITER fused all-reduce + RMSNorm path enabled.
#
# Mirrors minimaxm3_fp8_mi325x.sh otherwise (--block-size 128, --language-model-only,
# TRITON_ATTN, BF16 KV — gfx942 has no calibrated FP8 attention scales). The
# #45639-specific knobs:
# VLLM_ROCM_USE_AITER=1 (AITER kernels)
# --compilation-config custom_ops=["-minimax_gemma_rms_norm"] (allow IR lowering)
# --compilation-config pass_config.fuse_allreduce_rms=true (the fusion pass)
# The fusion needs TP>1; this recipe is swept at TP8.

source "$(dirname "$0")/../../benchmark_lib.sh"

# Variables this recipe reads. PORT is included because `vllm serve`,
# wait_for_server_ready, run_benchmark_serving and run_eval below all take it;
# validating it here surfaces a missing PORT before the model download and the
# in-place patch rather than at server launch.
check_env_vars \
    MODEL \
    TP \
    EP_SIZE \
    DP_ATTENTION \
    CONC \
    ISL \
    OSL \
    MAX_MODEL_LEN \
    RANDOM_RANGE_RATIO \
    RESULT_FILENAME \
    PORT

if [[ -n "$SLURM_JOB_ID" ]]; then
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi

if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
fi

# ---- Apply vllm-project/vllm#45639 in-place -------------------------------
# The shipped minimax-m3 image predates #45639 (base m3_release). Apply the
# vendored diff to the installed vllm. Idempotent: if it is already applied
# (reverse-applies cleanly) we proceed; if it neither applies cleanly nor is
# already applied, the image has drifted from the PR base — hard-fail so we never
# silently benchmark an unpatched server.
#
# Each prerequisite (patch file, `patch` binary, vllm install location) is
# checked on its own first, so a missing prerequisite is reported as such and
# not misdiagnosed as image drift by the dry-runs below.
PATCH_DIR="$(cd "$(dirname "$0")/patches" 2>/dev/null && pwd)" || PATCH_DIR=""
PATCH_FILE="${PATCH_DIR}/vllm-45639-aiter-ar-gemma-rms.diff"
if [ -z "$PATCH_DIR" ] || [ ! -r "$PATCH_FILE" ]; then
    echo "FATAL: vllm#45639 patch file not found or unreadable" >&2
    echo "       (expected patches/vllm-45639-aiter-ar-gemma-rms.diff next to $0)" >&2
    exit 1
fi

if ! command -v patch >/dev/null 2>&1; then
    if ! { apt-get update -q -y && apt-get install -q -y patch; }; then
        echo "FATAL: 'patch' is not installed and could not be installed via apt-get" >&2
        exit 1
    fi
fi

# Directory that contains the installed `vllm` package (the -p1 root for the diff).
if ! VLLM_SP="$(python3 -c 'import os, vllm; print(os.path.dirname(os.path.dirname(vllm.__file__)))')" \
        || [ -z "$VLLM_SP" ]; then
    echo "FATAL: could not locate the installed vllm package" >&2
    exit 1
fi

if ( cd "$VLLM_SP" && patch -p1 -R --dry-run < "$PATCH_FILE" >/dev/null 2>&1 ); then
    # The reverse patch applies cleanly, so the forward patch is already in place.
    echo "[vllm#45639] already applied to $VLLM_SP/vllm"
elif ( cd "$VLLM_SP" && patch -p1 --dry-run < "$PATCH_FILE" >/dev/null 2>&1 ); then
    ( cd "$VLLM_SP" && patch -p1 < "$PATCH_FILE" )
    echo "[vllm#45639] applied to $VLLM_SP/vllm"
else
    echo "FATAL: vllm#45639 patch neither applies cleanly nor is already applied" >&2
    echo "       ($VLLM_SP/vllm has drifted from the PR's m3_release base)" >&2
    exit 1
fi

SERVER_LOG=/workspace/server.log
export VLLM_ENGINE_READY_TIMEOUT_S=3600
export VLLM_USE_BREAKABLE_CUDAGRAPH=0
# #45639: AITER fused all-reduce + Gemma-RMSNorm.
export VLLM_ROCM_USE_AITER=1

if [ "${EVAL_ONLY}" = "true" ]; then
setup_eval_context
fi

# Build the parallelism flags passed to `vllm serve`.
#   DP_ATTENTION=true -> TP 1, data-parallel size $TP, expert parallel on
#   otherwise         -> TP $TP, expert parallel only when EP_SIZE > 1
if [ "${DP_ATTENTION}" = "true" ]; then
    PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP" --enable-expert-parallel)
else
    PARALLEL_ARGS=(--tensor-parallel-size "$TP")
    if [ "$EP_SIZE" -gt 1 ]; then
        PARALLEL_ARGS+=(--enable-expert-parallel)
    fi
fi

start_gpu_monitor

# When PROFILE=1 (profile.yml), arm vLLM's torch profiler via --profiler-config.
# This minimax-m3 image's vLLM does NOT honour the VLLM_TORCH_PROFILER_DIR env
# var, so the serve flag is what makes /start_profile emit a trace. Write to the
# dir benchmark_lib's relay scans (VLLM_TORCH_PROFILER_DIR, default /workspace/).
PROFILE_ARGS=()
if [ "${PROFILE:-}" = "1" ]; then
PROFILE_ARGS=(--profiler-config "{\"profiler\": \"torch\", \"torch_profiler_dir\": \"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\"}")
fi

set -x
vllm serve "$MODEL" --port "$PORT" \
"${PARALLEL_ARGS[@]}" \
"${PROFILE_ARGS[@]}" \
--block-size 128 \
--language-model-only \
--max-model-len "$MAX_MODEL_LEN" \
--attention-backend TRITON_ATTN \
--no-enable-prefix-caching \
--compilation-config '{"custom_ops": ["-minimax_gemma_rms_norm"], "pass_config": {"fuse_allreduce_rms": true}}' \
--tool-call-parser minimax_m3 \
--reasoning-parser minimax_m3 \
--enable-auto-tool-choice > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

run_benchmark_serving \
--model "$MODEL" \
--port "$PORT" \
--backend vllm \
--input-len "$ISL" \
--output-len "$OSL" \
--random-range-ratio "$RANDOM_RANGE_RATIO" \
--num-prompts "$((CONC * 10))" \
--max-concurrency "$CONC" \
--result-filename "$RESULT_FILENAME" \
--result-dir /workspace/ \
--trust-remote-code

if [ "${RUN_EVAL}" = "true" ]; then
run_eval --framework lm-eval --port "$PORT"
append_lm_eval_summary
fi

stop_gpu_monitor
set +x
Loading
Loading