From 31b4fbe4ff6f60642106f3fab63df1e050487712 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 12 Jun 2026 21:38:15 +0900 Subject: [PATCH 1/7] [AMD] dsv4-fp4-mi355x-atom: enable DPA TBO at high concurrency, update image to atom0.1.4 - Enable --enable-tbo for ISL=1024/OSL=1024 at CONC>=1024 and ISL=8192/OSL=1024 at CONC>=256 - Update image to atom0.1.4_20260612 - Update ISL=8192 search-space to start at conc=4 and use DPA from conc=128 Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 19 +++++++++---------- .../fixed_seq_len/dsv4_fp4_mi355x_atom.sh | 11 +++++++++-- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 70a79a273..77e4f0040 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2257,15 +2257,8 @@ dsv4-fp4-mi355x-vllm-mtp: search-space: - { tp: 8, conc-start: 4, conc-end: 512, spec-decoding: mtp } -# Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650). -# PR1 of the ATOM DSv4 series still uses torch sparse-attention fallbacks -# that OOM once warmup/prefill batches multiple requests; keep CONC=1 until -# the AITER sparse-attention kernel / multi-request path lands upstream. -# --enforce-eager and ATOM_USE_TRITON_MOE=1 are required on gfx950. Image is -# the standard atom0.1.2.post MI355X base (matching qwen3.5-fp8-mi355x-atom); -# the DSv4 PR is overlaid at runtime by dsv4_fp4_mi355x_atom.sh at a pinned SHA. 
dsv4-fp4-mi355x-atom: - image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3 + image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.4_20260612 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: mi355x @@ -2277,13 +2270,19 @@ dsv4-fp4-mi355x-atom: - isl: 1024 osl: 1024 search-space: + # conc4-64, TP8 + # conc128-512, DPA + # conc1024, DPA TBO - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 } - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 1024 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 } - - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 512 } + # conc4-64, TP8 + # conc128, DPA + # conc256-1024, DPA TBO + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 1024 } dsv4-fp4-mi355x-atom-mtp: image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3 diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh index e485dc9a6..4f4545824 100644 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh @@ -25,8 +25,15 @@ PARALLEL_ARGS=(-tp "$TP") #TP if [ "$DP_ATTENTION" = "true" ]; then if [ "$EP_SIZE" -gt 1 ]; then #DP+EP PARALLEL_ARGS=(-tp "$TP" --enable-expert-parallel --enable-dp-attention ) - else #DP+TP - PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention ) + else #DPA+TP + #DPA+TP+TBO + if [ "$ISL" -eq 1024 ] && [ "$OSL" -eq 1024 ] && [ "$CONC" -ge 1024 ]; then + PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention --enable-tbo) + elif [ "$ISL" -eq 8192 ] && [ "$OSL" -eq 1024 ] && [ "$CONC" -ge 256 ]; then + PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention --enable-tbo) + else + PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention ) + fi fi fi From c566e28e05a85be0c06bf531c6d6d92548f3aebf Mon Sep 17 00:00:00 2001 From: seungrokj Date: 
Fri, 12 Jun 2026 21:40:22 +0900 Subject: [PATCH 2/7] [AMD] perf-changelog: dsv4-fp4-mi355x-atom DPA TBO + image atom0.1.4 Co-Authored-By: Claude Sonnet 4.6 --- perf-changelog.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index c0642188b..e68d5d3e0 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3600,3 +3600,11 @@ - "MI355x DSR1-FP4: Include TP4 configurations for 8k1k" - "Expand the TP sweep (included TP=4) for 8k/1k configuration for conc=4 to 64" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1692 + +- config-keys: + - dsv4-fp4-mi355x-atom + description: + - "Update image to rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.4_20260612" + - "Enable --enable-tbo (Token-Bucket Overlap) on top of DPA+TP8 at high concurrency: ISL=1024/OSL=1024 at CONC>=1024, ISL=8192/OSL=1024 at CONC>=256" + - "Update ISL=8192 search-space: TP8-only from conc=4-64, DPA from conc=128-1024 (previously conc=1-64 and DPA conc=64-512)" + pr-link: From 7e1aa060dbbc69f072ac51a5c43e475b4014da01 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 12 Jun 2026 21:41:35 +0900 Subject: [PATCH 3/7] [AMD] perf-changelog: add PR link #1717, correct TBO expansion to Two Batch Overlap Co-Authored-By: Claude Sonnet 4.6 --- perf-changelog.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index e68d5d3e0..f236a6d60 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3605,6 +3605,6 @@ - dsv4-fp4-mi355x-atom description: - "Update image to rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.4_20260612" - - "Enable --enable-tbo (Token-Bucket Overlap) on top of DPA+TP8 at high concurrency: ISL=1024/OSL=1024 at CONC>=1024, ISL=8192/OSL=1024 at CONC>=256" + - "Enable --enable-tbo (Two Batch Overlap) on top of DPA+TP8 at high concurrency: ISL=1024/OSL=1024 at CONC>=1024, ISL=8192/OSL=1024 at CONC>=256" - "Update ISL=8192 search-space: TP8-only from 
conc=4-64, DPA from conc=128-1024 (previously conc=1-64 and DPA conc=64-512)" - pr-link: + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717 From 65e0fa328cd93161c5e736cc9f2fe7f8f22aed16 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 12 Jun 2026 21:59:22 +0900 Subject: [PATCH 4/7] [AMD] dsv4_fp4_mi355x_atom.sh: disable prefix caching Co-Authored-By: Claude Sonnet 4.6 --- benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh index 4f4545824..369b72281 100644 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh @@ -53,6 +53,7 @@ python3 -m atom.entrypoints.openai_server \ --kv_cache_dtype fp8 \ --trust-remote-code \ --gpu-memory-utilization 0.85 \ + --no-enable_prefix_caching \ > $SERVER_LOG 2>&1 & SERVER_PID=$! From 3f3560b7ef5b39461065bad07da780e063438313 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 12 Jun 2026 22:17:00 +0900 Subject: [PATCH 5/7] [AMD] dsv4-fp4-mi355x-atom: add max-model-len, eval context, extend conc range - Pass --max-model-len to server using SERVE_MAX_MODEL_LEN - Add EVAL_ONLY path: compute eval context length via compute_eval_context_length - Extend conc-end to 8192 (isl=1024) and 4096 (isl=8192) in amd-master.yaml Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 8 ++++---- .../fixed_seq_len/dsv4_fp4_mi355x_atom.sh | 12 ++++++++++-- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 77e4f0040..977f0ef2a 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2272,17 +2272,17 @@ dsv4-fp4-mi355x-atom: search-space: # conc4-64, TP8 # conc128-512, DPA - # conc1024, DPA TBO + # conc1024-8192, DPA TBO - { tp: 8, ep: 1, conc-start: 
1, conc-end: 64 } - - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 1024 } + - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 8192 } - isl: 8192 osl: 1024 search-space: # conc4-64, TP8 # conc128, DPA - # conc256-1024, DPA TBO + # conc256-4096, DPA TBO - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 1024 } + - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 4096 } dsv4-fp4-mi355x-atom-mtp: image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3 diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh index 369b72281..cfd4354b8 100644 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh @@ -37,6 +37,15 @@ if [ "$DP_ATTENTION" = "true" ]; then fi fi +BENCHMARK_MAX_MODEL_LEN="$MAX_MODEL_LEN" + +if [ "${EVAL_ONLY}" = "true" ]; then + EVAL_MAX_MODEL_LEN=$(compute_eval_context_length "$MODEL" "$BENCHMARK_MAX_MODEL_LEN") + export EVAL_MAX_MODEL_LEN + SERVE_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +else + SERVE_MAX_MODEL_LEN="$BENCHMARK_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -44,8 +53,6 @@ set -x export ATOM_DISABLE_MMAP=true export AITER_BF16_FP8_MOE_BOUND=0 export ATOM_MOE_GU_ITLV=1 -# TODO: add --no-enable_chunked_prefill, when dsv4 prefix caching is supported -#https://github.com/ROCm/ATOM/commit/7df93a181da4d3c3250c2441c7d5e2745a03d0cd#diff-61b1ba0b8b74523530d2d5cdc739d4f3a23a43bedf69015a5235844d46e9373bL1127 python3 -m atom.entrypoints.openai_server \ --model $MODEL \ --server-port $PORT \ @@ -54,6 +61,7 @@ python3 -m atom.entrypoints.openai_server \ --trust-remote-code \ --gpu-memory-utilization 0.85 \ --no-enable_prefix_caching \ + --max-model-len "$SERVE_MAX_MODEL_LEN" \ > $SERVER_LOG 2>&1 & SERVER_PID=$! 
From c3b32890500af732e024b08051195bd8d47398e3 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Sat, 13 Jun 2026 22:07:18 +0900 Subject: [PATCH 6/7] [AMD] dsv4-fp4-mi355x-atom: narrow eval to single conc=1024 point, disable max-model-len Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 27 ++++++++++++------- .../fixed_seq_len/dsv4_fp4_mi355x_atom.sh | 2 +- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 977f0ef2a..6835d9abc 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2267,22 +2267,29 @@ dsv4-fp4-mi355x-atom: multinode: false scenarios: fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - # conc4-64, TP8 - # conc128-512, DPA - # conc1024-8192, DPA TBO - - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 } - - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 8192 } + #- isl: 1024 + # osl: 1024 + # search-space: + # # conc4-64, TP8 + # # conc128-512, DPA + # # conc1024-8192, DPA TBO + # - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 } + # - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 8192 } + #- isl: 8192 + # osl: 1024 + # search-space: + # # conc4-64, TP8 + # # conc128, DPA + # # conc256-4096, DPA TBO + # - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } + # - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 4096 } - isl: 8192 osl: 1024 search-space: # conc4-64, TP8 # conc128, DPA # conc256-4096, DPA TBO - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 4096 } + - { tp: 8, ep: 1, dp-attn: true, conc-start: 1024, conc-end: 1024 } dsv4-fp4-mi355x-atom-mtp: image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3 diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh index cfd4354b8..03b9ff0a0 100644 --- 
a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh @@ -61,8 +61,8 @@ python3 -m atom.entrypoints.openai_server \ --trust-remote-code \ --gpu-memory-utilization 0.85 \ --no-enable_prefix_caching \ - --max-model-len "$SERVE_MAX_MODEL_LEN" \ > $SERVER_LOG 2>&1 & + #--max-model-len "$SERVE_MAX_MODEL_LEN" \ SERVER_PID=$! From 7ffa976e1868cf5086bbd8f8b70f47d5512d32c3 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Sat, 13 Jun 2026 22:10:47 +0900 Subject: [PATCH 7/7] [AMD] dsv4_fp4_mi355x_atom.sh: add cudagraph-capture-sizes and max-num-seqs Co-Authored-By: Claude Sonnet 4.6 --- benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh index 03b9ff0a0..898dac45e 100644 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh @@ -22,6 +22,7 @@ echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTIO SERVER_LOG=/workspace/server.log PARALLEL_ARGS=(-tp "$TP") #TP +CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256]' if [ "$DP_ATTENTION" = "true" ]; then if [ "$EP_SIZE" -gt 1 ]; then #DP+EP PARALLEL_ARGS=(-tp "$TP" --enable-expert-parallel --enable-dp-attention ) @@ -29,6 +30,7 @@ if [ "$DP_ATTENTION" = "true" ]; then #DPA+TP+TBO if [ "$ISL" -eq 1024 ] && [ "$OSL" -eq 1024 ] && [ "$CONC" -ge 1024 ]; then PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention --enable-tbo) + 
CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256,512,1024]' elif [ "$ISL" -eq 8192 ] && [ "$OSL" -eq 1024 ] && [ "$CONC" -ge 256 ]; then PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention --enable-tbo) else @@ -61,6 +63,8 @@ python3 -m atom.entrypoints.openai_server \ --trust-remote-code \ --gpu-memory-utilization 0.85 \ --no-enable_prefix_caching \ + --cudagraph-capture-sizes "${CUDAGRAPH_SIZES}" \ + --max-num-seqs ${CONC} \ > $SERVER_LOG 2>&1 & #--max-model-len "$SERVE_MAX_MODEL_LEN" \